GAN on C2C ski touring outing data with application to global warming prediction¶
Based on ski-touring outing reports from www.camptocamp.org in Haute-Savoie (France) and local temperature data reports in Megève, Generative Adversarial Networks are used to estimate the joint probability distribution of the outing features. The GAN model is then modified to exhibit a Bayesian network structure (also known as a graphical model) and to pre-constrain the model on the type of outing (max elevation, difficulty) and on temperature.
Learning goals:
- Use GAN to model an unknown distribution
- Mix GAN and graphical models
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, losses, optimizers, metrics, activations
import tensorview as tv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from plotly import express as px, graph_objects as go, subplots as sp
import itertools
from datetime import datetime
# Notebook-wide configuration
area_title = 'haute-savoie'  # area key used to locate the outings parquet file
log_dir="logs/"  # root directory for TensorBoard summaries
# Plotly marker styles, one per data origin used in the comparison plots
style_true = dict(color='cadetblue')
style_generated = dict(color='fuchsia')
style_fooled = dict(color='blue')
style_warm = dict(color='darkorange')
# Demappers: from numerical to ordinal labels (strings)
rating_unmapper = {0: 'awful', 1: 'poor', 2: 'average', 3: 'good', 4: 'excellent'}
# Ski ratings 1.1 .. 5.3 map to codes 0..14 via the comprehension; 5.4-5.6 (codes 15-17) are added explicitly
ski_rating_unmapper = {**{3*i + j: f'{i+1}.{j+1}' for i,j in itertools.product(range(5), range (3))}, 15: '5.4', 16: '5.5', 17: '5.6'}
# Load the per-outing reports (one row per ski-touring outing) for the area
df_outings = pd.read_parquet(f'data/C2C/outings_{area_title}.parquet')
# Features
# Grouped by clique for the part-2 graphical-model GAN:
# clique 1 = intrinsic outing descriptors (roots of the Bayesian network)
features_c1 = {'ski_rating_num': 'Ski rating (numerical)', 'elevation_max': 'Elevation max'}
# clique 2 = weather / seasonal context (conditioned on clique 1 in part 2)
temperature_features = {'TEMPERATURE_MORNING_C': 'Morning temperature', 'temp_morning_7d': 'Morning temperature last 7 days', 'temp_morning_30d': 'Morning temperature last 30 days'}
features_c2 = {**temperature_features, 'day_of_season': 'Day of season'}
# clique 3 = snow-dependent observations (conditioned on cliques 1 and 2)
features_c3 = {'elevation_up_snow': 'Skis on, way up', 'elevation_down_snow': 'Skis off, way down', 'condition_rating_num': 'Condition rating (numerical)'}
used_cols_dict = {**features_c1, **features_c2, **features_c3}  # column name -> display label
used_cols = list(used_cols_dict.keys())
# Conditions to select outings:
# - Some serious impossible outliers on the elevation (could be in feet)
# - Filter on quality to retain fine, medium and great
condition = (df_outings.elevation_up_snow < 5000) & (df_outings.elevation_down_snow < 5000) & (df_outings.elevation_max < 5000) \
& (df_outings.elevation_up_snow > 200) & (df_outings.elevation_down_snow > 200) & (df_outings.elevation_max > 200) \
& ((df_outings.quality == 'fine') | (df_outings.quality == 'medium') | (df_outings.quality == 'great'))
df_sel = df_outings.loc[condition, used_cols]
# Remove rows containing at least 1 na
df_sel = df_sel[(~df_sel.isna().any(axis=1))]
# Permutation
# NOTE: np.random.permutation returns a row-shuffled ndarray — the DataFrame
# index and column labels are dropped here; used_cols keeps the column order.
df_sel_perm = np.random.permutation(df_sel)
len(df_sel)
len(used_cols)
# Standardize every feature (zero mean, unit variance); the fitted scaler is
# reused later to map generated samples back to physical units.
scaler = StandardScaler()
df_sel_scaled = scaler.fit_transform(df_sel_perm)
df_sel.describe()
# Reference correlation matrix of the train data, compared later with generated data
sel_corr = df_sel.rename(columns=used_cols_dict).corr()
px.imshow(sel_corr,
title='Feature correlations in reference (train) data', height=500)
Model¶
# Part-1 GAN hyper-parameters
batch_size = 512
latent_dim = 20   # size of the generator's input noise vector
num_features = len(used_cols)
# Generator: latent noise -> synthetic (standard-scaled) outing feature vector.
# The last layer is linear: features are standardized, so no squashing activation.
generator = models.Sequential([
layers.Dense(32, input_dim=latent_dim, name='g_1', activation=activations.relu),
layers.Dropout(0.3),
layers.Dense(48, name='g_2', activation=activations.relu),
layers.Dropout(0.2),
layers.Dense(48, name='g_3', activation=activations.relu),
layers.Dense(64, name='g_4', activation=activations.relu),
layers.Dropout(0.2),
layers.Dense(num_features, name='g_5')
], name='generator')
generator.compile()
generator.summary()
# Discriminator: outing feature vector -> single logit (real vs generated).
# Output is linear; the BCE loss below is configured with from_logits=True.
discriminator = models.Sequential([
layers.Dense(64, input_dim=num_features, name='d_1', activation=activations.relu),
layers.Dropout(0.3),
layers.Dense(48, name='d_2', activation=activations.relu),
layers.Dropout(0.2),
layers.Dense(48, name='d_3', activation=activations.relu),
layers.Dense(32, name='d_4', activation=activations.relu),
layers.Dropout(0.2),
layers.Dense(1, name='d_5') # activation='sigmoid',
], name='discriminator')
discriminator.compile()
# NOTE(review): this plots the generator, not the discriminator just defined — confirm intended
tf.keras.utils.plot_model(generator, show_shapes=True, dpi=64)
Train¶
# Training configuration for part 1
epochs = 400
batch_per_epoch = len(df_sel_scaled) // batch_size
# Binary cross-entropy on raw logits (the discriminators end with a linear layer)
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
def generator_loss(disc_generated_output):
    """Generator BCE loss: push the discriminator's logits on fakes toward 'real' (1)."""
    real_labels = tf.ones_like(disc_generated_output)
    return loss_object(real_labels, disc_generated_output)
def discriminator_loss(disc_real_output, disc_generated_output):
    """Discriminator BCE loss: real logits toward 1, generated logits toward 0."""
    loss_on_real = loss_object(tf.ones_like(disc_real_output), disc_real_output)
    loss_on_fake = loss_object(tf.zeros_like(disc_generated_output), disc_generated_output)
    total = loss_on_real + loss_on_fake
    return total
def get_summary_writer():
    """Create a TensorBoard summary writer under <log_dir>fit/<timestamp>."""
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return tf.summary.create_file_writer(f'{log_dir}fit/{timestamp}')
@tf.function
def train_step(generator, discriminator,
               generator_optimizer, discriminator_optimizer,
               generator_latent, batch,
               epoch, summary_writer):
    """One GAN step: update generator and discriminator on a single real batch.

    Args:
        generator, discriminator: the two Keras models being trained.
        generator_optimizer, discriminator_optimizer: their optimizers.
        generator_latent: zero-arg callable returning a batch of latent noise.
        batch: batch of real (scaled) outing feature rows.
        epoch: step index used for TensorBoard logging.
        summary_writer: tf.summary writer receiving both loss scalars.
    Returns:
        (gen_loss, disc_loss) tensors for monitoring.
    """
    # Two independent tapes: each network's gradients are taken w.r.t. its own loss.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        gen_latent = generator_latent()
        gen_output = generator(gen_latent, training=True)
        disc_real_output = discriminator(batch, training=True)
        disc_generated_output = discriminator(gen_output, training=True)
        gen_loss = generator_loss(disc_generated_output)
        disc_loss = discriminator_loss(disc_real_output, disc_generated_output)
    generator_gradients = gen_tape.gradient(gen_loss, generator.trainable_variables)
    discriminator_gradients = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(generator_gradients, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(discriminator_gradients, discriminator.trainable_variables))
    # Log both losses (one scalar per epoch value passed in)
    with summary_writer.as_default():
        tf.summary.scalar('gen_loss', gen_loss, step=epoch)
        tf.summary.scalar('disc_loss', disc_loss, step=epoch)
    return gen_loss, disc_loss
# Adam with reduced momentum (beta_1=0.4) for both players
generator_optimizer = tf.keras.optimizers.Adam(3e-4, beta_1=0.4)
discriminator_optimizer = tf.keras.optimizers.Adam(3e-4, beta_1=0.4)
# Live loss plot in the notebook (tensorview)
tv_plot = tv.train.PlotMetrics(wait_num=200, columns=2, iter_num=epochs * batch_per_epoch)
summary = get_summary_writer()
def generator_latent():
    """Sample one batch of standard-normal latent vectors for the part-1 generator."""
    shape = (batch_size, latent_dim)
    return tf.random.normal(shape, mean=0.0, stddev=1.0)
# Part-1 training loop: sequential (non-shuffled per epoch) batches over the
# pre-permuted, scaled data.
for epoch in range(epochs):
    for b in range(batch_per_epoch):
        train_batch = df_sel_scaled[b * batch_size:(b+1) * batch_size]
        g_loss, d_loss = train_step(generator, discriminator,
                                    generator_optimizer, discriminator_optimizer,
                                    generator_latent, train_batch,
                                    epoch, summary)
        # Plot
        tv_plot.update({ 'discriminator_loss': d_loss,# 'discriminator_acc': d_acc,
                        'generator_loss': g_loss, # 'generator_acc': g_acc
                        })
        tv_plot.draw()
    # saving (checkpoint) the model every 20 epochs
    #if (epoch + 1) % 20 == 0:
    #  checkpoint.save(file_prefix = checkpoint_prefix)
Helpers¶
def print_stats(sets, feature, label, format_float=False):
    """Print mean/std of one feature for each named dataset.

    Args:
        sets: mapping of dataset name (e.g. 'true', 'generated') -> DataFrame.
        feature: column name to summarize.
        label: human-readable feature label for the printed line.
        format_float: if True use fixed 1-decimal formatting,
            else 3 significant digits.
    """
    # Deduplicated: the two original branches differed only by the format spec.
    fmt = '.1f' if format_float else '.3g'
    stats = ', '.join(f'for {key}={data[feature].mean():{fmt}}/{data[feature].std():{fmt}}'
                      for key, data in sets.items())
    print(f'Mean/std {label}:', stats)
def print_median_stats(sets, feature, label, format_float=False):
    """Print the median of one feature for each named dataset.

    Args:
        sets: mapping of dataset name (e.g. 'true', 'generated') -> DataFrame.
        feature: column name to summarize.
        label: human-readable feature label for the printed line.
        format_float: if True use fixed 1-decimal formatting,
            else 3 significant digits.
    """
    # Deduplicated: the two original branches differed only by the format spec.
    fmt = '.1f' if format_float else '.3g'
    medians = ', '.join(f'for {key}={data[feature].median():{fmt}}'
                        for key, data in sets.items())
    print(f'Median {label}:', medians)
Test¶
# Draw 20000 synthetic outings from the trained part-1 generator
gen_latent = np.random.normal(0, 1, (20000, latent_dim))
gen_outings = generator.predict(gen_latent)
gen_outings_unscaled = scaler.inverse_transform(gen_outings)
# Discriminator opinion: sigmoid of the logit -> probability of being real
scores = tf.sigmoid(discriminator.predict(gen_outings))
fooled = scores >= 0.5   # samples the discriminator classifies as real
fooled.numpy().mean()
df_generated = pd.DataFrame(gen_outings_unscaled, columns=used_cols)
# Map rounded/clipped numerical ratings back to their ordinal string labels
df_generated['ski_rating'] = df_generated['ski_rating_num'].round().clip(0, 17).replace(ski_rating_unmapper)
df_generated['condition_rating'] = df_generated['condition_rating_num'].round().clip(0, 4).replace(rating_unmapper)
df_generated.describe()
generated_corr = df_generated.rename(columns=used_cols_dict).corr()
px.imshow(generated_corr, height=500)
# Element-wise gap between generated and reference correlation matrices
corr_diff1 = (generated_corr - sel_corr).abs()
print(f'Mean absolute error on correlations: {corr_diff1.values.mean()}')
px.imshow(corr_diff1, zmax=0.3, color_continuous_scale='viridis',
title='Absolute correlation differences between train and generated', height=500)
# Median elevations, true vs generated (fixed 1-decimal formatting)
print_median_stats({'true': df_sel, 'generated': df_generated}, 'elevation_up_snow', 'Skis on, way up', True)
print_median_stats({'true': df_sel, 'generated': df_generated}, 'elevation_down_snow', 'Skis off, way down', True)
print_median_stats({'true': df_sel, 'generated': df_generated}, 'elevation_max', 'Elevation max', True)
# Side-by-side elevation histograms, true vs generated
fig = sp.make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['Skis on, way up', 'Skis off, way down', 'Elevation max'], x_title='Elevations [m]')
bins = {'start': 0, 'end': 4800, 'size': 100}
fig.add_trace(go.Histogram(x=df_sel['elevation_up_snow'], xbins=bins, name='true', histnorm='percent', marker=style_true), row=1, col=1)
fig.add_trace(go.Histogram(x=df_generated['elevation_up_snow'], xbins=bins, name='generated', histnorm='percent', marker=style_generated), row=1, col=1)
fig.add_trace(go.Histogram(x=df_sel['elevation_down_snow'], xbins=bins, name='true', histnorm='percent', marker=style_true), row=1, col=2)
fig.add_trace(go.Histogram(x=df_generated['elevation_down_snow'], xbins=bins, name='generated', histnorm='percent', marker=style_generated), row=1, col=2)
fig.add_trace(go.Histogram(x=df_sel['elevation_max'], xbins=bins, name='true', histnorm='percent', marker=style_true), row=1, col=3)
fig.add_trace(go.Histogram(x=df_generated['elevation_max'], xbins=bins, name='generated', histnorm='percent', marker=style_generated), row=1, col=3)
# Pairwise scatter of generated elevations to eyeball their joint structure
px.scatter_matrix(df_generated[['elevation_up_snow', 'elevation_down_snow', 'elevation_max']], opacity=0.1, title='Generated elevations correlations', labels=used_cols_dict)
print_stats({'true': df_sel, 'generated': df_generated}, 'day_of_season', 'day of season')
fig = go.Figure(layout=dict(title='Day of season (mid season = Feb 15th)', bargroupgap=0.01,
xaxis=dict(title='Day relative to Feb 15th'), yaxis=dict(title='%')))
fig.add_histogram(x=df_sel.day_of_season, name='true', nbinsx=52, histnorm='percent', marker=style_true)
fig.add_histogram(x=df_generated.day_of_season, name='generated', nbinsx=52, histnorm='percent', marker=style_generated)
Ski ratings¶
print_stats({'true': df_sel, 'generated': df_generated}, 'ski_rating_num', 'ski rating (numerical)')
# Categorical histogram ordered by the full rating scale (1.1 .. 5.6)
fig = go.Figure(layout=dict(title='Ski rating', bargroupgap=0.01,
xaxis=dict(title='1.1 (easy) to 5.6 (extreme)', categoryorder='array', categoryarray=list(ski_rating_unmapper.values()), type="category"), yaxis=dict(title='%')))
fig.add_histogram(x=df_outings[condition].ski_rating, name='true', histnorm='percent', marker=style_true)
fig.add_histogram(x=df_generated.ski_rating, name='generated', histnorm='percent', marker=style_generated)
Condition ratings¶
print_stats({'true': df_sel, 'generated': df_generated}, 'condition_rating_num', 'condition rating (numerical)')
# Categorical histogram ordered awful .. excellent
fig = go.Figure(layout=dict(title='Condition rating', bargroupgap=0.01,
xaxis=dict(categoryorder='array', categoryarray=list(rating_unmapper.values()), type="category"), yaxis=dict(title='%')))
fig.add_histogram(x=df_outings[condition].condition_rating, name='true', histnorm='percent', marker=style_true)
fig.add_histogram(x=df_generated.condition_rating, name='generated', histnorm='percent', marker=style_generated)
Temperatures¶
# Consistency: reuse the print_stats helper instead of three hand-written f-strings
# (same mean/std/3-significant-digit output as before).
print_stats({'true': df_sel, 'generated': df_generated}, 'TEMPERATURE_MORNING_C', 'morning temperature')
print_stats({'true': df_sel, 'generated': df_generated}, 'temp_morning_7d', 'last 7 day morning temperature')
print_stats({'true': df_sel, 'generated': df_generated}, 'temp_morning_30d', 'last 30 day morning temperature')
# Morning temperature histograms (today / 7-day / 30-day), true vs generated
fig = sp.make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['today', 'last 7 day', 'last 30 day'],
x_title='Morning temperature [°C]')
# fig = go.Figure(layout=dict(title='Trou de la Mouche, morning temperature [°C]', bargroupgap=0.1, yaxis=dict(title='%')))
bins = {'start': -35, 'end': 30, 'size': 1}
fig.add_histogram(x=df_sel.TEMPERATURE_MORNING_C, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=1)
fig.add_histogram(x=df_generated.TEMPERATURE_MORNING_C, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=1)
fig.add_histogram(x=df_sel.temp_morning_7d, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=2)
fig.add_histogram(x=df_generated.temp_morning_7d, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=2)
fig.add_histogram(x=df_sel.temp_morning_30d, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=3)
fig.add_histogram(x=df_generated.temp_morning_30d, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=3)
fig.update_yaxes(title='%')
# Joint structure of temperatures and snow elevations in generated data
px.scatter_matrix(df_generated[['TEMPERATURE_MORNING_C', 'temp_morning_7d', 'temp_morning_30d', 'elevation_up_snow', 'elevation_down_snow']], opacity=0.1,
title='Temperature-elevation correlations', height=600)
Part-2 Adding some independency constraints on the generator¶
Clique 1 features of the outing are set to be independent: maximum elevation and ski rating.
Clique 2 features (temperatures, day of season) are made dependent on the clique 1 features.
The other elevations depend on the quantity and quality of snow and are thus made dependent on the clique 1 and clique 2 features, including the day within the season (sunlight depends on the day of the year).
# Feature counts per clique (clique 3 is the remainder)
num_features_c1 = len(features_c1)
num_features_c2 = len(features_c2)
num_features_c3 = num_features - num_features_c1 - num_features_c2
num_latent2 = 10   # base latent size of the part-2 generator
gen_learning_rate2 = 0.002  # NOTE(review): defined but the optimizers below use 4e-4 — confirm intended
def make_generator(n_feat: int, name: str, prefix: str, num_latent: int, num_element_base: int):
    """Build one clique sub-generator: three widening Dense+Dropout stages and a linear head.

    Layer names keep the historical numbering ({prefix}_1, _2, _4, _5 — no _3)
    so the architecture stays identical to the original.
    """
    net = models.Sequential(name=name)
    net.add(layers.Dense(num_element_base * 2, input_dim=num_latent, name=f'{prefix}_1', activation=activations.relu))
    net.add(layers.Dropout(0.3))
    net.add(layers.Dense(num_element_base * 4, name=f'{prefix}_2', activation=activations.relu))
    net.add(layers.Dropout(0.2))
    net.add(layers.Dense(num_element_base * 6, name=f'{prefix}_4', activation=activations.relu))
    net.add(layers.Dropout(0.2))
    net.add(layers.Dense(n_feat, name=f'{prefix}_5'))
    return net
# One sub-generator per clique of the Bayesian-network structure
gen_c1 = make_generator(num_features_c1, 'gen_c1', 'g1', num_latent2, 12)
gen_c2 = make_generator(num_features_c2, 'gen_c2', 'g2', num_latent2, 12)
gen_c3 = make_generator(num_features_c3, 'gen_c3', 'g3', num_latent2, 16)
# Clique #1: outing summit elevation and ski rating
num_latent_c1 = num_latent2
input_c1 = layers.Input(num_latent_c1, name='latent_c1')
c1 = gen_c1(input_c1)
# Clique #2: temperatures and day of season
# Latent size is reduced by the number of conditioning features, so each
# sub-generator's total input stays around num_latent2 dimensions.
num_latent_c2 = num_latent2 - num_features_c1
input_c2 = layers.Input(num_latent_c2, name='latent_c2')
input_c2b = layers.concatenate([c1, input_c2])  # condition clique 2 on clique 1's output
c2 = gen_c2(input_c2b)
# Clique #3: all other dependent features
num_latent_c3 = num_latent2 - num_features_c1 - num_features_c2
input_c3 = layers.Input(num_latent_c3, name='latent_c3')
input_c3b = layers.concatenate([c1, c2, input_c3])  # condition clique 3 on cliques 1 and 2
c3 = gen_c3(input_c3b)
# Output of generator
gen2_output = layers.concatenate([c1, c2, c3], name='generator_output')
generator2 = models.Model([input_c1, input_c2, input_c3], gen2_output, name='generator2')
generator2.compile()
# generator2.summary()
# Part-2 discriminator: same architecture as part 1, fresh weights
discriminator2 = models.Sequential([
layers.Dense(64, input_dim=num_features, name='d_1', activation=activations.relu),
layers.Dropout(0.3),
layers.Dense(48, name='d_2', activation=activations.relu),
layers.Dropout(0.2),
layers.Dense(48, name='d_3', activation=activations.relu),
layers.Dense(32, name='d_4', activation=activations.relu),
layers.Dropout(0.2),
layers.Dense(1, name='d_5') # , activation='sigmoid' linear activation to output logits
], name='discriminator2')
discriminator2.compile()
tf.keras.utils.plot_model(generator2, show_shapes=True, dpi=64)
Train 2¶
# NOTE: byte-for-byte duplicate of train_step; kept as a separate tf.function so
# each GAN traces its own graph with its own models.
@tf.function
def train_step2(generator, discriminator,
                generator_optimizer, discriminator_optimizer,
                generator_latent, batch,
                epoch, summary_writer):
    """One GAN step for the part-2 (graphical-model) GAN; see train_step."""
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        gen_latent = generator_latent()
        gen_output = generator(gen_latent, training=True)
        disc_real_output = discriminator(batch, training=True)
        disc_generated_output = discriminator(gen_output, training=True)
        gen_loss = generator_loss(disc_generated_output)
        disc_loss = discriminator_loss(disc_real_output, disc_generated_output)
    generator_gradients = gen_tape.gradient(gen_loss, generator.trainable_variables)
    discriminator_gradients = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(generator_gradients, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(discriminator_gradients, discriminator.trainable_variables))
    # Log both losses to TensorBoard
    with summary_writer.as_default():
        tf.summary.scalar('gen_loss', gen_loss, step=epoch)
        tf.summary.scalar('disc_loss', disc_loss, step=epoch)
    return gen_loss, disc_loss
# Part-2 optimizers: slightly higher LR and lower momentum than part 1
generator_optimizer2 = tf.keras.optimizers.Adam(4e-4, beta_1=0.3)
discriminator_optimizer2 = tf.keras.optimizers.Adam(4e-4, beta_1=0.3)
tv_plot = tv.train.PlotMetrics(wait_num=200, columns=2, iter_num=epochs * batch_per_epoch)
summary2 = get_summary_writer()
def generator_latent2():
    """Sample the three standard-normal latent inputs (one per clique) for generator2."""
    clique_sizes = (num_latent_c1, num_latent_c2, num_latent_c3)
    return [tf.random.normal((batch_size, size), 0, 1) for size in clique_sizes]
# Part-2 training loop: identical schedule and data order as part 1
for epoch in range(epochs):
    for b in range(batch_per_epoch):
        train_batch = df_sel_scaled[b * batch_size:(b+1) * batch_size]
        g_loss, d_loss = train_step2(generator2, discriminator2,
                                     generator_optimizer2, discriminator_optimizer2,
                                     generator_latent2, train_batch, epoch, summary2)
        # Plot
        tv_plot.update({ 'discriminator_loss': d_loss,# 'discriminator_acc': d_acc,
                        'generator_loss': g_loss, # 'generator_acc': g_acc
                        })
        tv_plot.draw()
    # saving (checkpoint) the model every 20 epochs
    #if (epoch + 1) % 20 == 0:
    #  checkpoint.save(file_prefix = checkpoint_prefix)
Test 2¶
num_test = 20000
# Independent latent draws for the three cliques
gen_latent2 = [np.random.normal(0, 1, (num_test, num_latent_c1)),
               np.random.normal(0, 1, (num_test, num_latent_c2)),
               np.random.normal(0, 1, (num_test, num_latent_c3))]
gen_outings2 = generator2.predict(gen_latent2)
gen_outings2_unscaled = scaler.inverse_transform(gen_outings2)
# Discriminator opinion: sigmoid of the logit -> probability of being real; shape (num_test, 1)
scores2 = tf.sigmoid(discriminator2.predict(gen_outings2))
fooled2 = scores2 >= 0.5
scores2.numpy().mean(), fooled2.numpy().mean()
df_generated2 = pd.DataFrame(gen_outings2_unscaled, columns=used_cols)
# Map rounded/clipped numerical ratings back to their ordinal string labels
df_generated2['ski_rating'] = df_generated2['ski_rating_num'].round().clip(0, 17).replace(ski_rating_unmapper)
df_generated2['condition_rating'] = df_generated2['condition_rating_num'].round().clip(0, 4).replace(rating_unmapper)
# Fix: fooled2 has shape (num_test, 1); .loc rejects a 2-D boolean key, so flatten it.
df_generated2_fooled = df_generated2.loc[fooled2.numpy().ravel()]
df_generated2.describe()
# Correlation structure of the part-2 generated data vs the reference data
generated_corr2 = df_generated2.rename(columns=used_cols_dict).corr()
px.imshow(generated_corr2, height=500)
corr_diff_2 = (generated_corr2 - sel_corr).abs()
print(f'Mean absolute error on correlations: {corr_diff_2.values.mean()}')
px.imshow(corr_diff_2, zmax=0.3, color_continuous_scale='viridis',
title='Absolute correlation differences between train and generated', height=500)
Elevations¶
# Mean/std of the three elevations, true vs part-2 generated (1-decimal formatting)
print_stats({'true': df_sel, 'generated': df_generated2}, 'elevation_up_snow', 'Skis on, way up', True)
print_stats({'true': df_sel, 'generated': df_generated2}, 'elevation_down_snow', 'Skis off, way down', True)
print_stats({'true': df_sel, 'generated': df_generated2}, 'elevation_max', 'Elevation max', True)
fig = sp.make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['Skis on, way up', 'Skis off, way down', 'Elevation max'], x_title='Elevation [m]')
bins = {'start': 900, 'end': 4000, 'size': 100}
fig.add_trace(go.Histogram(x=df_sel['elevation_up_snow'], xbins=bins, name='real', histnorm='percent', marker=style_true), row=1, col=1)
fig.add_trace(go.Histogram(x=df_generated2['elevation_up_snow'], xbins=bins, name='generated', histnorm='percent', marker=style_generated), row=1, col=1)
# fig.add_trace(go.Histogram(x=df_generated2_fooled['elevation_up_snow'], xbins=bins, name='fooled', histnorm='percent', marker=style_fooled), row=1, col=1)
fig.add_trace(go.Histogram(x=df_sel['elevation_down_snow'], xbins=bins, name='real', histnorm='percent', marker=style_true), row=1, col=2)
fig.add_trace(go.Histogram(x=df_generated2['elevation_down_snow'], xbins=bins, name='generated', histnorm='percent', marker=style_generated), row=1, col=2)
# fig.add_trace(go.Histogram(x=df_generated2_fooled['elevation_down_snow'], xbins=bins, name='fooled', histnorm='percent', marker=style_fooled), row=1, col=2)
fig.add_trace(go.Histogram(x=df_sel['elevation_max'], xbins=bins, name='real', histnorm='percent', marker=style_true), row=1, col=3)
fig.add_trace(go.Histogram(x=df_generated2['elevation_max'], xbins=bins, name='generated', histnorm='percent', marker=style_generated), row=1, col=3)
# fig.add_trace(go.Histogram(x=df_generated2_fooled['elevation_max'], xbins=bins, name='fooled', histnorm='percent', marker=style_fooled), row=1, col=3)
Day of season¶
print_stats({'true': df_sel, 'generated': df_generated2}, 'day_of_season', 'Day of season')
# Day-of-season histogram, true vs part-2 generated
fig = go.Figure(layout=dict(title='Day of season (mid season = Feb 15th)', bargroupgap=0.01,
xaxis=dict(title='Day relative to Feb 15th'), yaxis=dict(title='%')))
fig.add_histogram(x=df_sel.day_of_season, name='true', nbinsx=52, histnorm='percent', marker=style_true)
fig.add_histogram(x=df_generated2.day_of_season, name='generated', nbinsx=52, histnorm='percent', marker=style_generated)
# fig.add_histogram(x=df_generated2_fooled.day_of_season, name='fooled', nbinsx=52, histnorm='percent', marker=style_fooled)
Ski ratings¶
# Bug fix: the generated std previously read df_generated (part-1 GAN) instead of df_generated2.
print(f'Mean/std ski rating num, for true={df_sel.ski_rating_num.mean():.3g}/{df_sel.ski_rating_num.std():.3g}, for generated={df_generated2.ski_rating_num.mean():.3g}/{df_generated2.ski_rating_num.std():.3g}')
# Categorical histogram ordered by the full rating scale (1.1 .. 5.6)
fig = go.Figure(layout=dict(title='Ski rating', bargroupgap=0.01,
xaxis=dict(title='1.1 (easy) to 5.6 (extreme)', categoryorder='array', categoryarray=list(ski_rating_unmapper.values()), type="category"), yaxis=dict(title='%')))
fig.add_histogram(x=df_outings[condition].ski_rating, name='true', histnorm='percent', marker=style_true)
fig.add_histogram(x=df_generated2.ski_rating, name='generated', histnorm='percent', marker=style_generated)
# fig.add_histogram(x=df_generated2_fooled.ski_rating, name='fooled', histnorm='percent', marker=style_fooled)
Condition ratings¶
# Condition rating distribution, true vs part-2 generated
print(f'Mean/std condition rating num, for true={df_sel.condition_rating_num.mean():.3g}/{df_sel.condition_rating_num.std():.3g}, for generated={df_generated2.condition_rating_num.mean():.3g}/{df_generated2.condition_rating_num.std():.3g}')
fig = go.Figure(layout=dict(title='Condition rating', bargroupgap=0.01,
xaxis=dict(categoryorder='array', categoryarray=list(rating_unmapper.values()), type="category"), yaxis=dict(title='%')))
fig.add_histogram(x=df_outings[condition].condition_rating, name='true', histnorm='percent', marker=style_true)
fig.add_histogram(x=df_generated2.condition_rating, name='generated', histnorm='percent', marker=style_generated)
Temperatures¶
# Morning temperature distributions (today / 7-day / 30-day), true vs part-2 generated
print(f'Mean/std morning temperature, for true={df_sel.TEMPERATURE_MORNING_C.mean():.3g}/{df_sel.TEMPERATURE_MORNING_C.std():.3g}, for generated={df_generated2.TEMPERATURE_MORNING_C.mean():.3g}/{df_generated2.TEMPERATURE_MORNING_C.std():.3g}')
print(f'Mean/std last 7 day morning temperature, for true={df_sel.temp_morning_7d.mean():.3g}/{df_sel.temp_morning_7d.std():.3g}, for generated={df_generated2.temp_morning_7d.mean():.3g}/{df_generated2.temp_morning_7d.std():.3g}')
print(f'Mean/std last 30 day morning temperature, for true={df_sel.temp_morning_30d.mean():.3g}/{df_sel.temp_morning_30d.std():.3g}, for generated={df_generated2.temp_morning_30d.mean():.3g}/{df_generated2.temp_morning_30d.std():.3g}')
fig = sp.make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['today', 'last 7 day', 'last 30 day'],
x_title='Morning temperature [°C]')
# fig = go.Figure(layout=dict(title='Trou de la Mouche, morning temperature [°C]', bargroupgap=0.1, yaxis=dict(title='%')))
bins = {'start': -35, 'end': 30, 'size': 1}
fig.add_histogram(x=df_sel.TEMPERATURE_MORNING_C, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=1)
fig.add_histogram(x=df_generated2.TEMPERATURE_MORNING_C, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=1)
fig.add_histogram(x=df_sel.temp_morning_7d, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=2)
fig.add_histogram(x=df_generated2.temp_morning_7d, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=2)
fig.add_histogram(x=df_sel.temp_morning_30d, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=3)
fig.add_histogram(x=df_generated2.temp_morning_30d, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=3)
fig.update_yaxes(title='%')
# Joint structure of temperatures and snow elevations in part-2 generated data
px.scatter_matrix(df_generated2[['TEMPERATURE_MORNING_C', 'temp_morning_7d', 'temp_morning_30d', 'elevation_up_snow', 'elevation_down_snow']], opacity=0.1,
title='Temperature-elevation correlations for generated data', height=600)
Test on a given outing¶
Trou de la Mouche: https://www.camptocamp.org/waypoints/37312/fr/trou-de-la-mouche 508 ski outings
Trou de la Mouche, Paccaly => Grand Crêt: https://www.camptocamp.org/outings?r=46396&act=skitouring 100 ski outings
def scale_single(scaler, index, data):
    """Standardize `data` using the mean/scale the scaler learned for column `index`."""
    centered = data - scaler.mean_[index]
    return centered / scaler.scale_[index]
def unscale_single(scaler, index, data):
    """Map standardized `data` back to the original units of column `index`."""
    rescaled = data * scaler.scale_[index]
    return rescaled + scaler.mean_[index]
def shift_scaled(scaler, index, data, offset):
    """Shift all samples by a constant `offset` expressed in original units.

    Unscale, add the offset, then re-scale so the result stays in
    standardized space.
    """
    shifted = unscale_single(scaler, index, data) + offset
    return scale_single(scaler, index, shifted)
# Condition the part-2 generator on one specific route (Trou de la Mouche):
# clique 1 (elevation_max, ski_rating_num) is clamped to the route's known values.
route_title = 'trou-de-la-mouche'
df_route_true = pd.read_parquet(f'data/C2C/outings_{route_title}.parquet')
tdlm_label = 'Trou de la Mouche'
tdlm_elevation_max = 2453
tdlm_elevation_max_scaled = scale_single(scaler, used_cols.index('elevation_max'), tdlm_elevation_max)
tdlm_ski_rating = 6  # numerical code (ski_rating_unmapper maps 6 -> '3.1')
tdlm_ski_rating_scaled = scale_single(scaler, used_cols.index('ski_rating_num'), tdlm_ski_rating)
tdlm_num = 5000
# Repeat the fixed, scaled clique-1 pair for every sample: shape (tdlm_num, 2)
tdlm_c1 = np.ones([tdlm_num, 1]).dot([[tdlm_ski_rating_scaled, tdlm_elevation_max_scaled]])
# Sample clique 2 conditioned on the fixed clique 1, then clique 3 on both
tdlm_c2 = gen_c2.predict(np.c_[tdlm_c1,
                               np.random.normal(0, 1, [tdlm_num, num_latent_c2])])
tdlm_c3 = gen_c3.predict(np.c_[tdlm_c1,
                               tdlm_c2,
                               np.random.normal(0, 1, [tdlm_num, num_latent_c3])])
df_tdlm = pd.DataFrame(scaler.inverse_transform(np.c_[tdlm_c1,
                                                      tdlm_c2,
                                                      tdlm_c3]),
                       columns=used_cols)
#df_tdlm['ski_rating'] = df_tdlm['ski_rating_num'].round().clip(0, 17).replace(ski_rating_unmapper)
df_tdlm['condition_rating'] = df_tdlm['condition_rating_num'].round().clip(0, 4).replace(rating_unmapper)
# Discriminator opinion on the route-conditioned samples
tdlm_scores = tf.sigmoid(discriminator2.predict(np.c_[tdlm_c1,
                                                      tdlm_c2,
                                                      tdlm_c3]))
tdlm_fooled = tdlm_scores >= 0.5
tdlm_scores.numpy().mean(), tdlm_fooled.numpy().mean()
# Fix: tdlm_fooled has shape (tdlm_num, 1); a 2-D boolean mask cannot index the
# DataFrame, so flatten it to 1-D first.
df_tdlm_fooled = df_tdlm[tdlm_fooled.numpy().ravel()]
len(df_tdlm_fooled)
df_tdlm.head()
Elevations¶
# Route elevations, true reports vs conditioned generated samples
# NOTE(review): three subplot titles are given for only two columns — 'Max' is unused; confirm intended
fig = sp.make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=['Skis on, way up', 'Skis off, way down', 'Max'],
x_title="Trou de la Mouche, elevations' cumulative histogram [m]")
bins = {'start': 0, 'end': 2000, 'size': 25}
fig.add_trace(go.Histogram(x=df_route_true['elevation_up_snow'], xbins=bins, name='true', histnorm='percent', marker=style_true), row=1, col=1)
fig.add_trace(go.Histogram(x=df_tdlm['elevation_up_snow'], xbins=bins, name='generated', histnorm='percent', marker=style_generated), row=1, col=1)
# fig.add_trace(go.Histogram(x=df_tdlm_fooled['elevation_up_snow'], xbins=bins, name='fooled', histnorm='percent', cumulative_enabled=True, marker=style_fooled), row=1, col=1)
fig.add_trace(go.Histogram(x=df_route_true['elevation_down_snow'], xbins=bins, name='true', histnorm='percent', marker=style_true, showlegend=False), row=1, col=2)
fig.add_trace(go.Histogram(x=df_tdlm['elevation_down_snow'], xbins=bins, name='generated', histnorm='percent', marker=style_generated, showlegend=False), row=1, col=2)
# fig.add_trace(go.Histogram(x=df_tdlm_fooled['elevation_down_snow'], xbins=bins, name='fooled', histnorm='percent', cumulative_enabled=True, marker=style_fooled), row=1, col=2)
Day of season¶
# Bug fix: compare the route's true outings with the route-conditioned samples
# (df_tdlm, as plotted below), not with the unconditioned part-1 df_generated.
print(f'Mean/std day of season, for true={df_route_true.day_of_season.mean():.3g}/{df_route_true.day_of_season.std():.3g}, for generated={df_tdlm.day_of_season.mean():.3g}/{df_tdlm.day_of_season.std():.3g}')
fig = go.Figure(layout=dict(title='Trou de la Mouche, day of season (mid season = Feb 15th)', bargroupgap=0.01,
xaxis=dict(title='Day relative to Feb 15th'), yaxis=dict(title='%')))
fig.add_histogram(x=df_route_true.day_of_season, name='true', nbinsx=52, histnorm='percent', marker=style_true)
fig.add_histogram(x=df_tdlm.day_of_season, name='generated', nbinsx=52, histnorm='percent', marker=style_generated)
# fig.add_histogram(x=df_tdlm_fooled.day_of_season, name='fooled', nbinsx=52, histnorm='percent', marker=style_fooled)
Condition ratings¶
# Bug fix: compare the route's true outings with the route-conditioned samples
# (df_tdlm, as plotted below), not with the unconditioned part-1 df_generated.
print(f'Mean/std condition rating num, for true={df_route_true.condition_rating_num.mean():.3g}/{df_route_true.condition_rating_num.std():.3g}, for generated={df_tdlm.condition_rating_num.mean():.3g}/{df_tdlm.condition_rating_num.std():.3g}')
fig = go.Figure(layout=dict(title='Trou de la Mouche, condition rating', bargroupgap=0.01,
xaxis=dict(categoryorder='array', categoryarray=list(rating_unmapper.values()), type="category"), yaxis=dict(title='%')))
fig.add_histogram(x=df_route_true.condition_rating, name='true', histnorm='percent', marker=style_true)
fig.add_histogram(x=df_tdlm.condition_rating, name='generated', histnorm='percent', marker=style_generated)
# fig.add_histogram(x=df_tdlm_fooled.condition_rating, name='fooled', histnorm='percent', marker=style_fooled)
Temperatures Trou de la Mouche¶
# Route temperatures, true reports vs conditioned generated samples
print(f'Mean/std morning temperature, for true={df_route_true.TEMPERATURE_MORNING_C.mean():.3g}/{df_route_true.TEMPERATURE_MORNING_C.std():.3g}, for generated={df_tdlm.TEMPERATURE_MORNING_C.mean():.3g}/{df_tdlm.TEMPERATURE_MORNING_C.std():.3g}')
print(f'Mean/std last 7 day morning temperature, for true={df_route_true.temp_morning_7d.mean():.3g}/{df_route_true.temp_morning_7d.std():.3g}, for generated={df_tdlm.temp_morning_7d.mean():.3g}/{df_tdlm.temp_morning_7d.std():.3g}')
print(f'Mean/std last 30 day morning temperature, for true={df_route_true.temp_morning_30d.mean():.3g}/{df_route_true.temp_morning_30d.std():.3g}, for generated={df_tdlm.temp_morning_30d.mean():.3g}/{df_tdlm.temp_morning_30d.std():.3g}')
fig = sp.make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['today', 'last 7 day', 'last 30 day'],
x_title='Trou de la Mouche, morning temperature [°C]')
# fig = go.Figure(layout=dict(title='Trou de la Mouche, morning temperature [°C]', bargroupgap=0.1, yaxis=dict(title='%')))
bins = {'start': -35, 'end': 30, 'size': 1}
fig.add_histogram(x=df_route_true.TEMPERATURE_MORNING_C, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=1)
fig.add_histogram(x=df_tdlm.TEMPERATURE_MORNING_C, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=1)
# fig.add_histogram(x=df_tdlm_fooled.TEMPERATURE_MORNING_C, name='fooled', histnorm='percent', xbins=bins, marker=style_fooled, row=1, col=1)
fig.add_histogram(x=df_route_true.temp_morning_7d, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=2)
fig.add_histogram(x=df_tdlm.temp_morning_7d, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=2)
# fig.add_histogram(x=df_tdlm_fooled.temp_morning_7d, name='fooled', histnorm='percent', xbins=bins, marker=style_fooled, row=1, col=2)
fig.add_histogram(x=df_route_true.temp_morning_30d, name='true', histnorm='percent', xbins=bins, marker=style_true, row=1, col=3)
fig.add_histogram(x=df_tdlm.temp_morning_30d, name='generated', histnorm='percent', xbins=bins, marker=style_generated, row=1, col=3)
# fig.add_histogram(x=df_tdlm_fooled.temp_morning_30d, name='fooled', histnorm='percent', xbins=bins, marker=style_fooled, row=1, col=3)
fig.update_yaxes(title='%')
px.scatter_matrix(df_tdlm[['TEMPERATURE_MORNING_C', 'temp_morning_7d', 'temp_morning_30d', 'elevation_up_snow', 'elevation_down_snow']], opacity=0.1,
title='Trou de la Mouche, temperature-elevation correlations of generated data', height=600)
Global warming impact¶
All temperatures are raised by a constant offset, swept from 1 to 6 °C.
Global warming has been found to be roughly double the global average in the Alps:
# Temperature offsets (in °C) to sweep for the warming scenario
delta_temp_range = [1, 2, 3, 3.5, 4, 6]
tdlm_warm_series = {}
# Baseline (no warming) medians. Built as a one-row DataFrame directly:
# DataFrame.append was removed in pandas 2.0, so the frame is constructed
# with its full column set (including morning_temperature, which the original
# column list omitted) and extended with pd.concat later.
tdlm_warm_medians = pd.DataFrame([{'delta_temperature': 0,
                                   'elevation_up_median': df_tdlm['elevation_up_snow'].median(),
                                   'elevation_down_median': df_tdlm['elevation_down_snow'].median(),
                                   'morning_temperature': df_tdlm['TEMPERATURE_MORNING_C'].median()}])
# For each warming offset: shift the temperature features (in scaled space),
# keep day_of_season as-is, re-sample clique 3 conditioned on the warmed
# clique 2, and record the resulting median snow elevations.
for delta_temp in delta_temp_range:
    c2_warm = [*[shift_scaled(scaler, used_cols.index(feature), tdlm_c2[:,i], delta_temp) for i, feature in enumerate(temperature_features.keys())], tdlm_c2[:,-1]]
    tdlm_c2_warm = pd.DataFrame(np.array(c2_warm).T, columns=features_c2)
    tdlm_c3_warm = gen_c3.predict(np.c_[tdlm_c1,
                                        tdlm_c2_warm,
                                        np.random.normal(0, 1, [tdlm_num, num_latent_c3])])
    df_warm = pd.DataFrame(scaler.inverse_transform(np.c_[tdlm_c1,
                                                          tdlm_c2_warm,
                                                          tdlm_c3_warm]),
                           columns=used_cols)
    df_warm['condition_rating'] = df_warm['condition_rating_num'].round().clip(0, 4).replace(rating_unmapper)
    # DataFrame.append was removed in pandas 2.0; accumulate rows with pd.concat.
    tdlm_warm_medians = pd.concat([tdlm_warm_medians,
                                   pd.DataFrame([{'delta_temperature': delta_temp,
                                                  'elevation_up_median': df_warm['elevation_up_snow'].median(),
                                                  'elevation_down_median': df_warm['elevation_down_snow'].median(),
                                                  'morning_temperature': df_warm['TEMPERATURE_MORNING_C'].median()}])],
                                  ignore_index=True)
    # tdlm_scores_warm = tf.sigmoid(discriminator2.predict(np.c_[tdlm_c1,
    #                                                            tdlm_c2_warm,
    #                                                            tdlm_c3_warm]))
    #tdlm_fooled_warm = tdlm_scores_warm >= 0.5
    #tdlm_scores_warm.numpy().mean(), tdlm_fooled_warm.numpy().mean()
    tdlm_warm_series.update({delta_temp: df_warm})
# Index the medians by the applied temperature shift, then plot the two snow
# elevations against it.
tdlm_warm_medians.set_index('delta_temperature', inplace=True)
legend_names = {'elevation_up_median': 'Skis on', 'elevation_down_median': 'Skis off'}
px.line(tdlm_warm_medians[list(legend_names)].rename(columns=legend_names),
        labels={'delta_temperature': 'Temperature increase compared to 2010s [°C]', 'value': '[m]'},
        title='Median elevation for generated outing reports')
Elevations with global warming¶
# Pick one warming scenario for the detailed comparison plots.
delta_temp = 3.5
df_tdlm_warm = tdlm_warm_series[delta_temp]
# Upper-quantile snapshot of the warmed snow elevations.
df_tdlm_warm[['elevation_up_snow', 'elevation_down_snow']].quantile([0.50, 0.60, 0.70, 0.80])
print(f"Skis on on the way up median: now = {df_tdlm['elevation_up_snow'].median():.1f}, with global warming of {delta_temp}°C = {df_tdlm_warm['elevation_up_snow'].median():.1f}")
# Fixed copy-pasted label: this line reports the way-down ("skis off") elevation.
print(f"Skis off on the way down median: now = {df_tdlm['elevation_down_snow'].median():.1f}, with global warming of {delta_temp}°C = {df_tdlm_warm['elevation_down_snow'].median():.1f}")
# Two columns only — dropped the leftover third subplot title ('Max').
fig = sp.make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=['Skis on, way up', 'Skis off, way down'],
                      x_title='Trou de la Mouche, generated elevations [m] as cumulative histograms')
bins = {'start': 900, 'end': 2000, 'size': 25}
# Cumulative histograms: now vs. warmed scenario, way up (col 1) and way down (col 2).
fig.add_histogram(x=df_tdlm['elevation_up_snow'], xbins=bins, name='now', histnorm='percent', cumulative_enabled=True, marker=style_generated, row=1, col=1)
fig.add_histogram(x=df_tdlm_warm['elevation_up_snow'], xbins=bins, name='with warming', cumulative_enabled=True, histnorm='percent', marker=style_warm, row=1, col=1)
fig.add_histogram(x=df_tdlm['elevation_down_snow'], xbins=bins, name='now', histnorm='percent', cumulative_enabled=True, marker=style_generated, row=1, col=2)
fig.add_histogram(x=df_tdlm_warm['elevation_down_snow'], xbins=bins, name='with warming', histnorm='percent', cumulative_enabled=True, marker=style_warm, row=1, col=2)
Condition ratings with warming¶
# Compare the condition-rating distribution now vs. the warmed scenario.
# Removed the dangling ', ' that ended the original message.
print(f'Mean/std condition rating, now={df_tdlm.condition_rating_num.mean():.3g}/{df_tdlm.condition_rating_num.std():.3g}, with warming={df_tdlm_warm.condition_rating_num.mean():.3g}/{df_tdlm_warm.condition_rating_num.std():.3g}')
# Force the ordinal x-axis to follow the rating order (awful -> excellent),
# not alphabetical order.
fig = go.Figure(layout=dict(title='Trou de la Mouche, condition rating', bargroupgap=0.01,
                            xaxis=dict(categoryorder='array', categoryarray=list(rating_unmapper.values()), type="category"), yaxis=dict(title='%')))
fig.add_histogram(x=df_tdlm.condition_rating, name='now', histnorm='percent', marker=style_generated)
fig.add_histogram(x=df_tdlm_warm.condition_rating, name='with warming', histnorm='percent', marker=style_warm)