View on GitHub

data-science

Notebooks and Python about data science

If you like this project please add your Star

Get outing data from CampToCamp.org

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotly import express as px, graph_objects as go, subplots as sp
import requests as req
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import re
import io
from datetime import datetime
import itertools
In [23]:
# Avoid lengthy scraping of data: when True, read the cached Parquet output
# instead of re-downloading everything from the C2C API
read_data = True
In [25]:
url_c2c_api = 'https://api.camptocamp.org'
# The C2C API caps list responses at 100 documents per request
page_len = 100

def get_url_outings(area: int = None, route: int = None,
                    activity: str = 'skitouring', offset: int = 0):
    """Build the outings-list URL filtered by area OR route (area wins if both set).

    Raises ValueError when neither area nor route is provided.
    """
    if area is not None:
        query_filter = f'a={area}'
    elif route is not None:
        query_filter = f'r={route}'
    else:
        raise ValueError('Must set either area or route')
    return f'{url_c2c_api}/outings?{query_filter}&act={activity}&limit={page_len}&offset={offset}'

def get_url_outing(id: int):
    """Build the URL of a single outing document."""
    return f'{url_c2c_api}/outings/{id}'
In [16]:
# Column groups extracted from the outing documents:
#   cols_base1 - identifiers (id is the C2C document_id)
#   cols_base2 - attributes available from the list endpoint
#   cols_ext   - extra attributes only fetched per-outing (extended mode)
cols_base1 = ['id', 'title']
cols_base2 = [
    'date_start', 'date_end', 'labande_global_rating', 'ski_rating',
    'elevation_max', 'height_diff_up', 'condition_rating', 'quality',
]
cols_ext = [
    'elevation_min', 'elevation_up_snow', 'elevation_down_snow',
    'snow_quality', 'snow_quantity', 'frequentation',
]

Mappers for ordinal columns

In [4]:
# Ordinal encoders mapping C2C string ratings to increasing integers.
# Ski ratings form a 5x3 grid '1.1'..'5.3', extended by '5.4'..'5.6'.
_ski_labels = [f'{major}.{minor}' for major in range(1, 6) for minor in range(1, 4)]
_ski_labels += ['5.4', '5.5', '5.6']
ski_rating_mapper = {label: rank for rank, label in enumerate(_ski_labels)}

# Five-level scale shared by condition rating / snow quality / snow quantity
rate_mapper = {'awful': 0, 'poor': 1, 'average': 2, 'good': 3, 'excellent': 4}

Season

winter season spans over two consecutive years

Day of year "centered" on winter with index 0 for February 15th

In [5]:
# Day-of-year of the mid-winter reference date (Feb 15th of a leap year = day 46)
mid_season = datetime(2020, 2, 15).timetuple().tm_yday
mid_season
Out[5]:
46
In [6]:
# Re-center the day-of-year axis on Feb 15th (day 0 of the season scale):
# season_offset shifts late-calendar-year days into the next winter,
# season_limit_day (~Aug 15th) is the cut between the two halves of a season.
season_offset = 365 + mid_season
season_limit_day = 182 + mid_season

Attach temperature

Taking a single temperature reference in Megève, based on data from historique-meteo.net

In [34]:
# Daily temperature export for Megève from historique-meteo.net
url_temperature_megeve = 'https://www.historique-meteo.net/site/export.php?ville_id=6011'
resp = req.get(url_temperature_megeve)
# Fail fast on HTTP errors instead of feeding an error page to the CSV parser
resp.raise_for_status()
# The export has a 3-line preamble before the CSV header
df_temperature = pd.read_csv(io.StringIO(resp.content.decode('utf-8')), skiprows=3, sep=',')
# Parse the DATE column into a proper datetime for the later merge with outings
df_temperature['date'] = pd.to_datetime(df_temperature['DATE'])
In [35]:
# Smooth the raw morning temperature with trailing rolling means
# (7-day and 30-day windows) to capture short- and medium-term trends
for window in (7, 30):
    df_temperature[f'temp_morning_{window}d'] = (
        df_temperature.TEMPERATURE_MORNING_C.rolling(window).mean()
    )

Fetch data from Camptocamp.org

In [37]:
def read_outing(id):
    """Fetch one outing document from the C2C API and extract the wanted fields.

    Returns a dict with 'id', 'title' and the base + extended columns.
    Uses dict.get so an outing missing an optional field yields None instead
    of raising KeyError and aborting the whole download loop.
    """
    doc = req.get(get_url_outing(id)).json()
    # 'locales' may be empty; the first locale carries the title
    title = doc['locales'][0]['title'] if len(doc['locales']) > 0 else ''
    cols_base2_values = {f: doc.get(f) for f in cols_base2}
    cols_ext_values = {f: doc.get(f) for f in cols_ext}
    return dict(id=id, title=title, **cols_base2_values, **cols_ext_values)
In [44]:
def map_outing(doc):
    """Extract the base columns from an outing document of the list endpoint.

    Returns a dict with 'id', 'title' and the cols_base2 fields.
    Uses dict.get so a document missing an optional field yields None instead
    of raising KeyError.
    """
    # 'locales' may be empty; the first locale carries the title
    title = doc['locales'][0]['title'] if len(doc['locales']) > 0 else ''
    cols_base2_values = {f: doc.get(f) for f in cols_base2}
    return {'id': doc['document_id'], 'title': title, **cols_base2_values}

def process_outings(title: str, area_id: int = None, route_id: int = None, extended_data: bool = True, write_down: bool = False):
    """Download all outings for an area or a route and return a cleaned DataFrame.

    Parameters
    ----------
    title : str
        Label used for logging and for the output Parquet file name.
    area_id, route_id : int, optional
        Exactly one should be set; area_id takes precedence if both are.
    extended_data : bool
        If True, fetch every outing document individually to also get the
        extended columns (much slower); otherwise use list-endpoint data only.
    write_down : bool
        If True, save the result to data/C2C/outings_{title}.parquet.

    Notes
    -----
    Download errors are caught and logged so a partial download is still
    cleaned up and returned.
    """
    cols = cols_base1 + cols_base2
    if extended_data:
        cols += cols_ext

    def fetch_page(offset):
        # One page of the outings list (area filter takes precedence)
        if area_id is not None:
            return req.get(get_url_outings(area=area_id, offset=offset)).json()
        return req.get(get_url_outings(route=route_id, offset=offset)).json()

    rows = []
    try:
        # The first page also reports the total number of outings
        page_offset = 0
        outing_list = fetch_page(page_offset)
        nb_outings = outing_list['total']
        print(f'Number of outings {nb_outings} for {title}')

        cnt_iter = 0  # hard safety cap on the number of page requests
        print('page offset:', end=' ')
        # len(rows) counts fetched outings in BOTH modes (the original only
        # counted in extended mode, spinning until the iteration cap otherwise)
        while len(rows) < nb_outings and cnt_iter < 1000:
            print(page_offset, end=', ')
            if 'documents' not in outing_list:
                # was: 'End at offset: ' + page_offset -> TypeError (str + int)
                raise ValueError(f'End at offset: {page_offset}')

            if extended_data:
                # One extra request per outing to get the extended fields
                for outing_doc in outing_list['documents']:
                    rows.append(read_outing(outing_doc['document_id']))
            else:
                rows.extend(map(map_outing, outing_list['documents']))
            page_offset += len(outing_list['documents'])
            outing_list = fetch_page(page_offset)
            cnt_iter += 1

        print('download completed')

    except Exception as e:
        # Best effort: keep whatever was downloaded before the failure
        print('Exception raised!:', str(e))

    # Build the frame once from the accumulated records
    # (DataFrame.append was removed in pandas 2.0 and was quadratic anyway)
    df_outings = pd.DataFrame(rows, columns=cols)

    # Complete columns and cleanup
    df_outings['area_id'] = area_id

    df_outings['date_start'] = pd.to_datetime(df_outings['date_start'])
    df_outings['date_end'] = pd.to_datetime(df_outings['date_end'])

    # Map ordinal strings to integers. Series.map leaves missing values as NaN,
    # so the former `!= None` masks (which were all-True no-ops) are not needed.
    df_outings['ski_rating_num'] = df_outings['ski_rating'].map(ski_rating_mapper)
    df_outings['condition_rating_num'] = df_outings['condition_rating'].map(rate_mapper)

    if extended_data:
        # These columns only exist in extended mode; referencing them
        # unconditionally crashed when extended_data=False
        df_outings['snow_quantity_num'] = df_outings['snow_quantity'].map(rate_mapper)
        df_outings['snow_quality_num'] = df_outings['snow_quality'].map(rate_mapper)

        # Fill missing skis-on/skis-off elevations with the outing's min elevation
        df_outings['elevation_up_snow'] = df_outings['elevation_up_snow'].fillna(df_outings['elevation_min'])
        df_outings['elevation_down_snow'] = df_outings['elevation_down_snow'].fillna(df_outings['elevation_min'])

    # Day within season: day 0 = Feb 15th, late-calendar-year days are negative
    df_outings['day_of_season'] = df_outings.date_start.dt.dayofyear.apply(
        lambda d: d - season_offset if d > season_limit_day else d - mid_season)
    # Season label = year containing the season's February. Only dates late in
    # the calendar year (after season_limit_day) belong to the NEXT season.
    # (The original bumped every day_of_season < 0, which split one winter
    # across different season labels for Dec vs Jan dates.)
    df_outings['season'] = df_outings.date_start.dt.year
    late_year = df_outings.date_start.dt.dayofyear > season_limit_day
    df_outings.loc[late_year, 'season'] += 1

    # Attach Megève temperature features; inner merge drops outings whose
    # start date has no temperature record
    df_outings = pd.merge(
        df_outings,
        df_temperature[['TEMPERATURE_MORNING_C', 'temp_morning_7d', 'temp_morning_30d', 'date']],
        left_on='date_start', right_on='date')

    # Cache to Parquet so later runs can skip the scraping
    if write_down:
        df_outings.to_parquet(f'data/C2C/outings_{title}.parquet', compression='GZIP')

    return df_outings
In [32]:
# Either load the cached Parquet files or run the (slow) scrape and cache it
area_title, route_title = 'haute-savoie', 'trou-de-la-mouche'
if read_data:
    df_outings = pd.read_parquet(f'data/C2C/outings_{area_title}.parquet')
    df_outings_tdlm = pd.read_parquet(f'data/C2C/outings_{route_title}.parquet')
else:
    df_outings = process_outings(area_title, area_id=14366, write_down=True)
    df_outings_tdlm = process_outings(route_title, route_id=46396, write_down=True)
len(df_outings)
Out[32]:
9881
In [46]:
df_outings_tdlm.head(3)
Out[46]:
id title date_start date_end labande_global_rating ski_rating elevation_max height_diff_up condition_rating quality ... ski_rating_num condition_rating_num snow_quantity_num snow_quality_num day_of_season season TEMPERATURE_MORNING_C temp_morning_7d temp_morning_30d date
0 1094961 Trou de la Mouche : Boucle Paccaly >> Grand Crêt 2019-03-24 2019-03-24 PD+ 2.3 2453.0 1030.0 good fine ... 5 3.0 4.0 3.0 37 2019 -1 -6.000000 -4.4 2019-03-24
1 1082262 Trou de la Mouche : Boucle Paccaly >> Grand Crêt 2019-02-14 2019-02-14 PD+ 2.3 2453.0 1030.0 excellent fine ... 5 4.0 4.0 4.0 -1 2020 -6 -6.571429 -8.4 2019-02-14
2 1070876 Trou de la Mouche : Boucle Paccaly >> Grand Crêt 2019-01-12 2019-01-12 PD+ 2.3 2453.0 1240.0 excellent fine ... 5 4.0 2.0 3.0 -34 2020 -7 -6.285714 -4.6 2019-01-12

3 rows × 27 columns

Extract maps for ordinal attributes from string to int

In [13]:
# Inspect the distinct report-quality labels
df_outings.quality.unique()
Out[13]:
array(['fine', 'medium', 'great', 'draft', 'empty'], dtype=object)
In [14]:
# Distinct technical ratings (Labande global scale and numeric ski-rating scale)
df_outings.labande_global_rating.unique(), df_outings.ski_rating.unique(), 
Out[14]:
(array(['TD', None, 'AD', 'F', 'PD-', 'AD+', 'D', 'TD-', 'ED', 'PD', 'D+',
        'AD-', 'PD+', 'F+', 'ED+', 'D-', 'TD+', 'ED-'], dtype=object),
 array(['5.3', None, '3.2', '1.2', '2.3', '3.3', '4.2', '5.1', '5.2',
        '5.4', '4.3', '3.1', '1.3', '5.5', '2.1', '2.2', '1.1', '4.1',
        '5.6'], dtype=object))
In [15]:
# Distinct values of the three five-level scales (all covered by rate_mapper keys)
df_outings.condition_rating.unique(), df_outings.snow_quality.unique(), df_outings.snow_quantity.unique()
Out[15]:
(array(['average', 'excellent', 'good', None, 'poor', 'awful'],
       dtype=object),
 array(['poor', None, 'good', 'average', 'excellent', 'awful'],
       dtype=object),
 array(['average', None, 'good', 'excellent', 'poor', 'awful'],
       dtype=object))
In [10]:
# Scratch check kept for reference: missing-value counts of the elevation columns
# df_outings.elevation_up_snow.isna().sum(), df_outings.elevation_down_snow.isna().sum(), df_outings.elevation_min.isna().sum(), df_outings.elevation_max.isna().sum()

Description

In [20]:
# Feature groups used throughout the analysis:
# - temperature_features: Megève weather reference attached to each outing
# - intrinsic_features: properties of the route itself
# - dependent_features: conditions observed on the day of the outing
temperature_features = ['TEMPERATURE_MORNING_C', 'temp_morning_7d', 'temp_morning_30d']
intrinsic_features = ['ski_rating_num', 'elevation_max']
dependent_features = ['elevation_up_snow', 'elevation_down_snow', 'condition_rating_num', 'day_of_season']
used_cols = temperature_features + intrinsic_features + dependent_features
In [21]:
# Summary statistics of the selected features (note the implausible >5000 m maxima)
df_outings[used_cols].describe()
Out[21]:
TEMPERATURE_MORNING_C temp_morning_7d temp_morning_30d ski_rating_num elevation_max elevation_up_snow elevation_down_snow condition_rating_num day_of_season
count 9881.000000 9813.000000 9676.000000 8534.000000 8781.000000 9084.000000 9041.000000 9007.000000 9881.000000
mean -5.304423 -4.799159 -4.716322 6.508320 2511.026421 1446.922061 1401.968477 2.947263 7.166076
std 6.671790 5.374441 4.210974 3.121939 765.773873 615.210683 530.861852 0.799730 46.240515
min -28.000000 -19.285714 -13.533333 0.000000 3.000000 0.000000 0.000000 0.000000 -140.000000
25% -10.000000 -8.285714 -7.466667 4.000000 2072.000000 1120.000000 1120.000000 2.000000 -25.000000
50% -6.000000 -5.428571 -5.516667 6.000000 2369.000000 1270.000000 1263.000000 3.000000 6.000000
75% -1.000000 -2.000000 -2.733333 9.000000 2645.000000 1450.000000 1450.000000 3.000000 37.000000
max 17.000000 16.285714 15.366667 17.000000 29999.000000 12002.000000 3842.000000 4.000000 172.000000
In [22]:
# Feature correlation matrix over all Haute-Savoie outings
px.imshow(df_outings[used_cols].corr())
In [48]:
# Same correlation matrix restricted to a single route, removing route-induced variance
px.imshow(df_outings_tdlm[used_cols].corr(), 
          title='Feature correlation for outings on specific route (Trou de la Mouche)', height=500)

Distributions (histograms)

Report quality

In [61]:
# Distribution of the report-quality labels
fig = go.Figure()
fig.add_histogram(x=df_outings['quality'])

Qualitative attributes

In [58]:
# Missing-value counts: snow quality/quantity are rarely filled in (~80% missing)
df_outings['condition_rating'].isna().sum(), df_outings['snow_quality'].isna().sum(), df_outings['snow_quantity'].isna().sum()
Out[58]:
(876, 8126, 8216)
In [51]:
# Side-by-side histograms of the three rating scales
fig = sp.make_subplots(cols=3, rows=1, shared_yaxes=True, subplot_titles=['Condition', 'Snow quality', 'Snow quantity'])
# labels_ratings is reused by the correlation heatmap in a later cell
labels_ratings = {'condition_rating_num': 'Condition', 'snow_quality_num': 'Snow quality', 'snow_quantity_num': 'Snow quantity' }
# NOTE(review): 'xaxis' targets only the first subplot; the other two keep default category order — confirm intended
fig.update_layout(bargroupgap=0.1, xaxis=dict(categoryorder='array', categoryarray=list(rate_mapper.keys()), type='category'))
fig.add_histogram(x=df_outings['condition_rating'], col=1, row=1, showlegend=False)
fig.add_histogram(x=df_outings['snow_quality'], col=2, row=1, showlegend=False)
fig.add_histogram(x=df_outings['snow_quantity'], col=3, row=1, showlegend=False)
In [65]:
# Correlation between the three rating scales
# NOTE(review): px.imshow's `labels` expects keys like 'x'/'y'/'color';
# these column-name keys likely have no effect on the axis labels — verify
px.imshow(df_outings[['condition_rating_num', 'snow_quality_num', 'snow_quantity_num']].corr(), 
                  labels=labels_ratings)

Elevations

In [52]:
print('Elevation [m]')
# Column -> display label, in plotting order (reused by the scatter matrix below)
elevation_labels = {'elevation_min': 'Min', 'elevation_max': 'Max', 
                    'elevation_up_snow': 'Skis on, way up', 'elevation_down_snow': 'Skis off, way down', 
                    'height_diff_up': 'Height difference'}
fig = sp.make_subplots(rows=1, cols=5, shared_yaxes=True, subplot_titles=list(elevation_labels.values()))
bins = {'start': 0, 'end': 4500, 'size': 100}
# Iterate the label dict so each histogram lands under its own subplot title.
# (The original hard-coded order plotted e.g. elevation_up_snow under 'Max'.)
for idx, column in enumerate(elevation_labels, start=1):
    fig.add_histogram(x=df_outings[column], xbins=bins, row=1, col=idx, showlegend=False)
fig
Elevation [m]

The most populated fields are by decreasing order: 'height_diff_up' , 'elevation_up_snow', 'elevation_down_snow', 'elevation_max'

In [53]:
# Non-null counts: elevation_min is sparse, height_diff_up is the best populated
df_outings['elevation_min'].count(), df_outings['elevation_max'].count(), df_outings['height_diff_up'].count()
Out[53]:
(2644, 8791, 9574)
In [54]:
# Skis-on vs skis-off fill counts, plus how often the two values coincide
df_outings['elevation_up_snow'].count(), df_outings['elevation_down_snow'].count(), (df_outings['elevation_up_snow'] == df_outings['elevation_down_snow']).sum()
Out[54]:
(9163, 9118, 8214)
In [211]:
# Pairwise scatter of the elevation fields (low opacity to reveal density)
px.scatter_matrix(df_outings[['elevation_min', 'elevation_up_snow', 'elevation_down_snow', 'elevation_max']], opacity=0.1, 
                  title='True elevations correlations', labels=elevation_labels, height=800)

Comments:

  • There is a strong correlation between the min elevation and the skis on/off elevation as expected
  • The max elevation is only lightly correlated with the others
  • There are some anomalous outliers above 5000 m (Mont Blanc is at 4807 m) — are they measured in feet?

Ski difficulty

In [202]:
# Ski-rating histogram with categories ordered along the rating scale
fig = go.Figure(layout=dict(
    title='Ski rating', bargroupgap=0.1,
    xaxis=dict(categoryorder='array', categoryarray=list(ski_rating_mapper.keys()), type='category')))
fig.add_histogram(x=df_outings.ski_rating)

Date

In [85]:
# Outing start dates over the whole observed period (~500 bins)
fig = go.Figure(layout=dict(title='Start date', bargroupgap=0.01))
fig.add_trace(go.Histogram(x=df_outings.date_start, nbinsx=500))
In [114]:
# Number of outings per season
fig = go.Figure(layout=dict(title='Season', bargroupgap=0.1))
fig.add_histogram(x=df_outings.season)
In [122]:
# Distribution of outings within the season (day 0 = Feb 15th), weekly-ish bins
fig = go.Figure(layout=dict(title='Day of season (mid season = Feb 15th)', bargroupgap=0.1))
fig.add_histogram(x=df_outings.day_of_season, nbinsx=52)

Morning temperature

In [195]:
# Overlaid distributions of the raw, 7-day and 30-day averaged morning temperatures
fig = go.Figure(layout=dict(title='Morning temperature [°C]', bargroupgap=0.1))
bins = {'start': -35, 'end': 30, 'size': 1}
fig.add_histogram(x=df_outings.TEMPERATURE_MORNING_C, name='Morning', xbins=bins)
fig.add_histogram(x=df_outings.temp_morning_7d, name='Morning 7 days', xbins=bins)
fig.add_histogram(x=df_outings.temp_morning_30d, name='Morning 30 days', xbins=bins)
In [212]:
# Pairwise scatter of the three temperature features
px.scatter_matrix(df_outings[['TEMPERATURE_MORNING_C', 'temp_morning_7d', 'temp_morning_30d']], opacity=0.1, 
                  title='Temperature correlations', height=600)

Comparison beginning and end of period

Taking:

  • beginning as: 2010-2012,
  • end as: 2017-2019 (2020 is special with Covid lock-down)
In [54]:
# Split the data into the beginning (2010-2012) and end (2017-2019) of the period
seasons_begin, seasons_end = [2010, 2011, 2012], [2017, 2018, 2019]
df_sel_begin = df_outings[df_outings.season.isin(seasons_begin)]
df_sel_end = df_outings[df_outings.season.isin(seasons_end)]
len(df_sel_begin), len(df_sel_end)
Out[54]:
(2433, 2149)
In [55]:
# Normalized (percent) day-of-season distributions for the two periods
fig = go.Figure(layout=dict(title='Day of season (mid season = Feb 15th)', bargroupgap=0.01, 
                            xaxis=dict(title='Day relative to Feb 15th'), yaxis=dict(title='%')))
fig.add_histogram(x=df_sel_begin.day_of_season, name='2010-2012', nbinsx=52, histnorm='percent')
fig.add_histogram(x=df_sel_end.day_of_season, name='2017-2019', nbinsx=52, histnorm='percent')

Comment: there are far fewer outings early in the season in 2017-2019. There are slightly more outings at the end of the season, but the tail is also shorter.

In [56]:
# Condition-rating distributions (percent) for the two periods, ordered worst to best
fig = go.Figure(layout=dict(title='Condition rating', bargroupgap=0.01, yaxis=dict(title='%'), 
                            xaxis=dict(categoryorder='array', categoryarray=list(rate_mapper.keys()), type='category')))
fig.add_histogram(x=df_sel_begin.condition_rating, name='2010-2012', histnorm='percent')
fig.add_histogram(x=df_sel_end.condition_rating, name='2017-2019', histnorm='percent')
In [60]:
# Cumulative distribution of the skis-off (way down) elevation for both periods.
# The printed label previously said "Skis on on the way up" while the data
# (elevation_down_snow) and the figure title are the skis-off/way-down elevation.
print(f"Skis off on the way down median: 2010-2012 = {df_sel_begin['elevation_down_snow'].median():.1f}, 2017-2019 = {df_sel_end['elevation_down_snow'].median():.1f}")
fig = go.Figure(layout=dict(title='Skis off elevation, way down', bargroupgap=0.01, 
                            xaxis=dict(title='[m]'), yaxis=dict(title='%')))
fig.add_histogram(x=df_sel_begin.elevation_down_snow, name='2010-2012', histnorm='percent', cumulative_enabled=True)
fig.add_histogram(x=df_sel_end.elevation_down_snow, name='2017-2019', histnorm='percent', cumulative_enabled=True)
Skis on on the way up median: 2010-2012 = 1291.0, 2017-2019 = 1263.0
In [61]:
# Cumulative distributions of the raw morning temperature for the two periods
print(f'Mean morning temperature, for 2010-2012={df_sel_begin.TEMPERATURE_MORNING_C.mean():.3g}, for 2017-2019={df_sel_end.TEMPERATURE_MORNING_C.mean():.3g}')
fig = go.Figure(layout=dict(title='Morning temperature [°C]', bargroupgap=0.1, yaxis=dict(title='%')))
fig.add_histogram(x=df_sel_begin.TEMPERATURE_MORNING_C, name='2010-2012', histnorm='percent', cumulative_enabled=True)
fig.add_histogram(x=df_sel_end.TEMPERATURE_MORNING_C, name='2017-2019', histnorm='percent', cumulative_enabled=True)
Mean morning temperature, for 2010-2012=-6, for 2017-2019=-4.18
In [62]:
# Same comparison with the 7-day averaged morning temperature
print(f'Mean morning 7 day temperature, for 2010-2012={df_sel_begin.temp_morning_7d.mean():.3g}, for 2017-2019={df_sel_end.temp_morning_7d.mean():.3g}')
fig = go.Figure(layout=dict(title='Morning temperature average 7 days [°C]', bargroupgap=0.1, yaxis=dict(title='%')))
fig.add_histogram(x=df_sel_begin.temp_morning_7d, name='2010-2012', histnorm='percent', cumulative_enabled=True)
fig.add_histogram(x=df_sel_end.temp_morning_7d, name='2017-2019', histnorm='percent', cumulative_enabled=True)
Mean morning 7 day temperature, for 2010-2012=-5.38, for 2017-2019=-3.9
In [63]:
# Same comparison with the 30-day averaged morning temperature
print(f'Mean morning 30 day temperature, for 2010-2012={df_sel_begin.temp_morning_30d.mean():.3g}, for 2017-2019={df_sel_end.temp_morning_30d.mean():.3g}')
fig = go.Figure(layout=dict(title='Morning temperature average 30 days [°C]', bargroupgap=0.1, yaxis=dict(title='%')))
fig.add_histogram(x=df_sel_begin.temp_morning_30d, name='2010-2012', histnorm='percent', cumulative_enabled=True)
fig.add_histogram(x=df_sel_end.temp_morning_30d, name='2017-2019', histnorm='percent', cumulative_enabled=True)
Mean morning 30 day temperature, for 2010-2012=-5.48, for 2017-2019=-4.02

Comment:

  • No major change on the "skis off" elevation
  • No change on the condition rating. Rating is even a bit higher overall in 2017-2019
  • The season tends to be shifted: it starts much later
  • The temperature is markedly higher (by 1.82 °C on average)
In [ ]: