Results

functions for plotting results and comparing Kalman Filter with MDS and ERA

Setup

import altair as alt
import altair

load data to test functions

reset_seed()
hai = pd.read_parquet(hai_big_path)
hai_era = pd.read_parquet(hai_era_big_path)
dls = imp_dataloader(hai, hai_era, var_sel = 'TA', gap_len=5, block_len=100, control_lags = [1], n_rep=1, bs=1).cpu()
item = MeteoImpItem(1,2, 'TA', gap_len=10)
items = [item]
targ = orig_target(dls, items)[0]
input, _ = one_batch_with_items(dls, items)

Imputation Methods

Kalman Filter

take a MeteoImp Df and gap fill it using the given model

model = torch.load(here("analysis/results/trained_models/1_gap_varying_6-336_v1.pickle"))
preds, targs = predict_items(model, dls=dls, items = items)
preds[0].mean[targs[0].mask] = targs[0].data[targs[0].mask]

KalmanImputation

input[1].shape
torch.Size([1, 100, 9])
preds_raw = model(input)
len(preds_raw.mean[0])
10
_extract_var(preds_raw.mean, 1, 7)
[[tensor([-0.7555], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.7765], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8153], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8653], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8660], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8712], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8551], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8420], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8442], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8536], dtype=torch.float64, grad_fn=<IndexBackward0>)]]
_extract_var(preds_raw.cov, 1, 7)
[[tensor([[0.0227]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0307]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0359]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0392]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0409]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0409]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0393]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0361]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0310]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0230]], dtype=torch.float64, grad_fn=<IndexBackward0>)]]
items_all = [MeteoImpItem(1,2, list(hai.columns), gap_len=3)] * 3
input_all, _ = one_batch_with_items(dls, items_all)
preds_raw_all = model(input_all)
_extract_var(preds_raw_all.mean, 3, 9)
[[tensor([-0.5650], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.5887], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.6259], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([-0.5650], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.5887], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.6259], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([-0.5650], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.5887], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.6259], dtype=torch.float64, grad_fn=<SliceBackward0>)]]
buffer_pred(preds_raw_all.mean, input_all[1]).shape
torch.Size([3, 100, 9])
_extract_var(preds_raw_all.cov, 3, 9)
[[tensor([[0.1189]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1717]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1251]], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([[0.1189]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1717]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1251]], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([[0.1189]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1717]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1251]], dtype=torch.float64, grad_fn=<SliceBackward0>)]]

source

PredictLossVar

 PredictLossVar (only_gap:bool, var:int)

loss (negative log likelihood) for only for one variable for each batch

PredictLossVar(True, 3)(preds_raw_all, input_all)
[tensor(-0.2252, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(-0.2252, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(-0.2252, dtype=torch.float64, grad_fn=<MeanBackward0>)]
input_all[1].shape
torch.Size([3, 100, 9])

source

PredictLikelihoodVar

 PredictLikelihoodVar (only_gap:bool, var:int)

mean between timesteps of Likelihood for only for one variable for each batch

PredictLikelihoodVar(True, 3)(preds_raw_all, input_all)
[tensor(1.1569, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(1.1569, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(1.1569, dtype=torch.float64, grad_fn=<MeanBackward0>)]

source

MultiMetrics

 MultiMetrics (**metrics)

Initialize self. See help(type(self)) for accurate signature.


source

KalmanImputation

 KalmanImputation (model)

Initialize self. See help(type(self)) for accurate signature.

k_imp = KalmanImputation(model)
pred = k_imp(item, dls=dls)
display_as_row({'pred': pred[45:55],'data': targ.data[45:55], 'mask': targ.mask[45:55]}, hide_idx=False)

pred

  TA SW_IN LW_IN VPD WS PA P SWC TS
time                  
2000-01-04 02:30:00 2.3467 0.0000 304.7310 0.9520 5.3900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:00:00 2.1803 0.0000 304.7310 0.9710 5.5900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:30:00 1.8728 0.0000 304.7310 0.9580 5.6700 96.2900 0.0000 43.0800 1.3100
2000-01-04 04:00:00 1.4769 0.0000 304.7310 0.9380 6.6100 96.2600 0.0000 43.0800 1.3100
2000-01-04 04:30:00 1.4710 0.0000 315.0050 0.7850 6.6500 96.2600 0.0000 43.0800 1.3100
2000-01-04 05:00:00 1.4303 0.0000 315.0050 0.5290 6.1200 96.2700 0.0000 43.0800 1.3000
2000-01-04 05:30:00 1.5573 0.0000 315.0050 0.4500 5.5800 96.2600 0.2900 43.1300 1.3000
2000-01-04 06:00:00 1.6616 0.0000 315.0050 0.4990 4.7000 96.2500 0.0000 43.1300 1.2900
2000-01-04 06:30:00 1.6438 0.0000 315.0050 0.3820 5.0700 96.2300 0.0000 43.1300 1.2900
2000-01-04 07:00:00 1.5697 0.0000 315.0050 0.3130 5.2300 96.2400 0.0000 43.1300 1.2900

data

  TA SW_IN LW_IN VPD WS PA P SWC TS
time                  
2000-01-04 02:30:00 2.1900 0.0000 304.7310 0.9520 5.3900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:00:00 2.2700 0.0000 304.7310 0.9710 5.5900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:30:00 2.3200 0.0000 304.7310 0.9580 5.6700 96.2900 0.0000 43.0800 1.3100
2000-01-04 04:00:00 2.3400 0.0000 304.7310 0.9380 6.6100 96.2600 0.0000 43.0800 1.3100
2000-01-04 04:30:00 2.2400 0.0000 315.0050 0.7850 6.6500 96.2600 0.0000 43.0800 1.3100
2000-01-04 05:00:00 2.0000 0.0000 315.0050 0.5290 6.1200 96.2700 0.0000 43.0800 1.3000
2000-01-04 05:30:00 1.9400 0.0000 315.0050 0.4500 5.5800 96.2600 0.2900 43.1300 1.3000
2000-01-04 06:00:00 2.0700 0.0000 315.0050 0.4990 4.7000 96.2500 0.0000 43.1300 1.2900
2000-01-04 06:30:00 2.0400 0.0000 315.0050 0.3820 5.0700 96.2300 0.0000 43.1300 1.2900
2000-01-04 07:00:00 2.0300 0.0000 315.0050 0.3130 5.2300 96.2400 0.0000 43.1300 1.2900

mask

  TA SW_IN LW_IN VPD WS PA P SWC TS
time                  
2000-01-04 02:30:00 False True True True True True True True True
2000-01-04 03:00:00 False True True True True True True True True
2000-01-04 03:30:00 False True True True True True True True True
2000-01-04 04:00:00 False True True True True True True True True
2000-01-04 04:30:00 False True True True True True True True True
2000-01-04 05:00:00 False True True True True True True True True
2000-01-04 05:30:00 False True True True True True True True True
2000-01-04 06:00:00 False True True True True True True True True
2000-01-04 06:30:00 False True True True True True True True True
2000-01-04 07:00:00 False True True True True True True True True

Kalman Imputation - Specialized Variables


source

KalmanImputationVar

 KalmanImputationVar (models)

Initialize self. See help(type(self)) for accurate signature.

Details
models dataframe with 2 columns model and var
k_impVar = KalmanImputationVar(pd.DataFrame({'model': [model], 'var': 'TA'}))
pred = k_impVar(var= 'TA', item=item, dls=dls)
display_as_row({'pred': pred[45:55],'data': targ.data[45:55], 'mask': targ.mask[45:55]}, hide_idx=False)

pred

  TA SW_IN LW_IN VPD WS PA P SWC TS
time                  
2000-01-04 02:30:00 2.3467 0.0000 304.7310 0.9520 5.3900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:00:00 2.1803 0.0000 304.7310 0.9710 5.5900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:30:00 1.8728 0.0000 304.7310 0.9580 5.6700 96.2900 0.0000 43.0800 1.3100
2000-01-04 04:00:00 1.4769 0.0000 304.7310 0.9380 6.6100 96.2600 0.0000 43.0800 1.3100
2000-01-04 04:30:00 1.4710 0.0000 315.0050 0.7850 6.6500 96.2600 0.0000 43.0800 1.3100
2000-01-04 05:00:00 1.4303 0.0000 315.0050 0.5290 6.1200 96.2700 0.0000 43.0800 1.3000
2000-01-04 05:30:00 1.5573 0.0000 315.0050 0.4500 5.5800 96.2600 0.2900 43.1300 1.3000
2000-01-04 06:00:00 1.6616 0.0000 315.0050 0.4990 4.7000 96.2500 0.0000 43.1300 1.2900
2000-01-04 06:30:00 1.6438 0.0000 315.0050 0.3820 5.0700 96.2300 0.0000 43.1300 1.2900
2000-01-04 07:00:00 1.5697 0.0000 315.0050 0.3130 5.2300 96.2400 0.0000 43.1300 1.2900

data

  TA SW_IN LW_IN VPD WS PA P SWC TS
time                  
2000-01-04 02:30:00 2.1900 0.0000 304.7310 0.9520 5.3900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:00:00 2.2700 0.0000 304.7310 0.9710 5.5900 96.3000 0.0000 43.0800 1.3100
2000-01-04 03:30:00 2.3200 0.0000 304.7310 0.9580 5.6700 96.2900 0.0000 43.0800 1.3100
2000-01-04 04:00:00 2.3400 0.0000 304.7310 0.9380 6.6100 96.2600 0.0000 43.0800 1.3100
2000-01-04 04:30:00 2.2400 0.0000 315.0050 0.7850 6.6500 96.2600 0.0000 43.0800 1.3100
2000-01-04 05:00:00 2.0000 0.0000 315.0050 0.5290 6.1200 96.2700 0.0000 43.0800 1.3000
2000-01-04 05:30:00 1.9400 0.0000 315.0050 0.4500 5.5800 96.2600 0.2900 43.1300 1.3000
2000-01-04 06:00:00 2.0700 0.0000 315.0050 0.4990 4.7000 96.2500 0.0000 43.1300 1.2900
2000-01-04 06:30:00 2.0400 0.0000 315.0050 0.3820 5.0700 96.2300 0.0000 43.1300 1.2900
2000-01-04 07:00:00 2.0300 0.0000 315.0050 0.3130 5.2300 96.2400 0.0000 43.1300 1.2900

mask

  TA SW_IN LW_IN VPD WS PA P SWC TS
time                  
2000-01-04 02:30:00 False True True True True True True True True
2000-01-04 03:00:00 False True True True True True True True True
2000-01-04 03:30:00 False True True True True True True True True
2000-01-04 04:00:00 False True True True True True True True True
2000-01-04 04:30:00 False True True True True True True True True
2000-01-04 05:00:00 False True True True True True True True True
2000-01-04 05:30:00 False True True True True True True True True
2000-01-04 06:00:00 False True True True True True True True True
2000-01-04 06:30:00 False True True True True True True True True
2000-01-04 07:00:00 False True True True True True True True True

MDS

Need to call R from python

r.print('here::here("R/REddyProc_tools.R")')
[1] "here::here(\"R/REddyProc_tools.R\")"
StrVector with 1 elements.
'here::here("R/REddyProc_tools.R")'

source

importr_install

 importr_install (pkg)

source

setupR

 setupR ()

source

R2pd

 R2pd (x)

source

pd2R

 pd2R (x)

Experiment

The time series needs to be at least 90 days long, we we add 45 days before and after the gap

There is a problem with conversions of timestamps between R and Python, so convert to string in Python and then back to datetime in R


source

add_buffer

 add_buffer (index, inner_index, n)

Adds a buffer of after and after index so length is at least

add_buffer(hai.index, hai.index[:100], 50)
add_buffer(hai.index, hai.index[-50:], 50);

source

item2REddy

 item2REddy (item, var, df)

Add context around item for supporting REddyProc

REddy_df = item2REddy(targ, 'TA', hai)

REddy_df_r = r.toR_timestamp(pd2R(REddy_df))

filled = R2pd(r.fill_gaps_EProc(REddy_df_r, "TA"))
R[write to console]: New sEddyProc class for site 'ID'

R[write to console]: Initialized variable 'TA' with 10 real gaps for gap filling.

R[write to console]: Limited MDS algorithm for gap filling of 'TA.gap_0' with LUT(SW_IN only) and MDC.

R[write to console]: Look up table with window size of 7 days with SW_IN

R[write to console]: 10

R[write to console]: Finished gap filling of 'TA' in 0 seconds. Artificial gaps filled: 4419, real gaps filled: 10, unfilled (long) gaps: 0.
filled
TA_orig TA_f TA_fqc TA_fall TA_fall_qc TA_fnum TA_fsd TA_fmeth TA_fwin
1 -0.60 -0.60 0.0 -0.60 NaN NaN NaN NaN NaN
2 -0.65 -0.65 0.0 -0.65 NaN NaN NaN NaN NaN
3 -0.58 -0.58 0.0 -0.58 NaN NaN NaN NaN NaN
4 -0.51 -0.51 0.0 -0.51 NaN NaN NaN NaN NaN
5 -0.49 -0.49 0.0 -0.49 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ...
4415 3.30 3.30 0.0 3.30 NaN NaN NaN NaN NaN
4416 3.10 3.10 0.0 3.10 NaN NaN NaN NaN NaN
4417 3.05 3.05 0.0 3.05 NaN NaN NaN NaN NaN
4418 3.05 3.05 0.0 3.05 NaN NaN NaN NaN NaN
4419 3.05 3.05 0.0 3.05 NaN NaN NaN NaN NaN

4419 rows × 9 columns

REddy_df.set_index("TIMESTAMP_END")
TA SW_IN LW_IN VPD WS PA P SWC TS gap
TIMESTAMP_END
2000-01-01 00:30:00 -0.60 0.0 302.475 0.222 2.05 96.63 0.0 43.00 1.44 0.0
2000-01-01 01:00:00 -0.65 0.0 302.475 0.122 2.53 96.58 0.0 43.00 1.43 0.0
2000-01-01 01:30:00 -0.58 0.0 301.677 0.090 3.15 96.56 0.0 43.00 1.43 0.0
2000-01-01 02:00:00 -0.51 0.0 301.677 0.110 3.12 96.56 0.0 43.00 1.45 0.0
2000-01-01 02:30:00 -0.49 0.0 301.677 0.102 3.04 96.57 0.0 43.00 1.44 0.0
... ... ... ... ... ... ... ... ... ... ...
2000-04-01 23:30:00 3.30 0.0 283.856 1.092 2.50 95.06 0.0 44.96 4.32 0.0
2000-04-02 00:00:00 3.10 0.0 283.856 1.013 2.09 95.04 0.0 44.96 4.22 0.0
2000-04-02 00:30:00 3.05 0.0 283.856 0.964 2.42 95.04 0.0 44.96 4.13 0.0
2000-04-02 01:00:00 3.05 0.0 283.856 0.933 3.04 95.04 0.0 44.96 4.04 0.0
2000-04-02 01:30:00 3.05 0.0 251.387 0.930 2.96 95.03 0.0 44.95 3.94 0.0

4419 rows × 10 columns

filled = filled.set_index(pd.to_datetime(REddy_df.TIMESTAMP_END))

filled_item = filled.loc[targ.data.index]

pred = targ.data.copy()
pred.loc[~targ.mask["TA"], "TA"] = filled_item["TA_f"][~targ.mask["TA"]]
pred[45:55], targ.data[45:55]
(                           TA  SW_IN    LW_IN    VPD    WS     PA     P  \
 time                                                                      
 2000-01-04 02:30:00  1.853612    0.0  304.731  0.952  5.39  96.30  0.00   
 2000-01-04 03:00:00  1.837083    0.0  304.731  0.971  5.59  96.30  0.00   
 2000-01-04 03:30:00  1.823985    0.0  304.731  0.958  5.67  96.29  0.00   
 2000-01-04 04:00:00  1.809805    0.0  304.731  0.938  6.61  96.26  0.00   
 2000-01-04 04:30:00  1.794282    0.0  315.005  0.785  6.65  96.26  0.00   
 2000-01-04 05:00:00  1.783447    0.0  315.005  0.529  6.12  96.27  0.00   
 2000-01-04 05:30:00  1.771525    0.0  315.005  0.450  5.58  96.26  0.29   
 2000-01-04 06:00:00  1.760362    0.0  315.005  0.499  4.70  96.25  0.00   
 2000-01-04 06:30:00  1.748410    0.0  315.005  0.382  5.07  96.23  0.00   
 2000-01-04 07:00:00  1.737933    0.0  315.005  0.313  5.23  96.24  0.00   
 
                        SWC    TS  
 time                              
 2000-01-04 02:30:00  43.08  1.31  
 2000-01-04 03:00:00  43.08  1.31  
 2000-01-04 03:30:00  43.08  1.31  
 2000-01-04 04:00:00  43.08  1.31  
 2000-01-04 04:30:00  43.08  1.31  
 2000-01-04 05:00:00  43.08  1.30  
 2000-01-04 05:30:00  43.13  1.30  
 2000-01-04 06:00:00  43.13  1.29  
 2000-01-04 06:30:00  43.13  1.29  
 2000-01-04 07:00:00  43.13  1.29  ,
                        TA  SW_IN    LW_IN    VPD    WS     PA     P    SWC  \
 time                                                                         
 2000-01-04 02:30:00  2.19    0.0  304.731  0.952  5.39  96.30  0.00  43.08   
 2000-01-04 03:00:00  2.27    0.0  304.731  0.971  5.59  96.30  0.00  43.08   
 2000-01-04 03:30:00  2.32    0.0  304.731  0.958  5.67  96.29  0.00  43.08   
 2000-01-04 04:00:00  2.34    0.0  304.731  0.938  6.61  96.26  0.00  43.08   
 2000-01-04 04:30:00  2.24    0.0  315.005  0.785  6.65  96.26  0.00  43.08   
 2000-01-04 05:00:00  2.00    0.0  315.005  0.529  6.12  96.27  0.00  43.08   
 2000-01-04 05:30:00  1.94    0.0  315.005  0.450  5.58  96.26  0.29  43.13   
 2000-01-04 06:00:00  2.07    0.0  315.005  0.499  4.70  96.25  0.00  43.13   
 2000-01-04 06:30:00  2.04    0.0  315.005  0.382  5.07  96.23  0.00  43.13   
 2000-01-04 07:00:00  2.03    0.0  315.005  0.313  5.23  96.24  0.00  43.13   
 
                        TS  
 time                       
 2000-01-04 02:30:00  1.31  
 2000-01-04 03:00:00  1.31  
 2000-01-04 03:30:00  1.31  
 2000-01-04 04:00:00  1.31  
 2000-01-04 04:30:00  1.31  
 2000-01-04 05:00:00  1.30  
 2000-01-04 05:30:00  1.30  
 2000-01-04 06:00:00  1.29  
 2000-01-04 06:30:00  1.29  
 2000-01-04 07:00:00  1.29  )

source

gap_fill_item

 gap_fill_item (item, REddy_df, var, filled)
gap_fill_item(targ, REddy_df, "TA", filled)
TA SW_IN LW_IN VPD WS PA P SWC TS
time
2000-01-03 04:00:00 0.81 0.0 304.148 0.000 3.53 97.00 0.0 43.09 1.37
2000-01-03 04:30:00 0.95 0.0 304.382 0.000 3.99 96.98 0.0 43.09 1.37
2000-01-03 05:00:00 1.09 0.0 304.382 0.000 3.61 96.96 0.0 43.09 1.37
2000-01-03 05:30:00 1.18 0.0 304.382 0.009 3.90 96.93 0.0 43.09 1.37
2000-01-03 06:00:00 1.35 0.0 304.382 0.061 4.17 96.91 0.0 43.09 1.38
... ... ... ... ... ... ... ... ... ...
2000-01-05 03:30:00 4.62 0.0 330.202 1.162 6.53 95.91 0.0 44.12 1.82
2000-01-05 04:00:00 4.51 0.0 330.202 1.636 5.76 95.94 0.0 44.12 1.85
2000-01-05 04:30:00 4.11 0.0 299.320 1.746 5.79 95.98 0.0 44.12 1.85
2000-01-05 05:00:00 3.77 0.0 299.320 2.065 6.00 96.03 0.0 44.12 1.83
2000-01-05 05:30:00 3.36 0.0 299.320 1.911 4.76 96.10 0.0 44.11 1.81

100 rows × 9 columns

MDSImputation


source

MDSImputation

 MDSImputation (var, df)

Initialize self. See help(type(self)) for accurate signature.

mds_imp = MDSImputation('TA', hai)
mds_imp(targ)
TA SW_IN LW_IN VPD WS PA P SWC TS
time
2000-01-03 04:00:00 0.81 0.0 304.148 0.000 3.53 97.00 0.0 43.09 1.37
2000-01-03 04:30:00 0.95 0.0 304.382 0.000 3.99 96.98 0.0 43.09 1.37
2000-01-03 05:00:00 1.09 0.0 304.382 0.000 3.61 96.96 0.0 43.09 1.37
2000-01-03 05:30:00 1.18 0.0 304.382 0.009 3.90 96.93 0.0 43.09 1.37
2000-01-03 06:00:00 1.35 0.0 304.382 0.061 4.17 96.91 0.0 43.09 1.38
... ... ... ... ... ... ... ... ... ...
2000-01-05 03:30:00 4.62 0.0 330.202 1.162 6.53 95.91 0.0 44.12 1.82
2000-01-05 04:00:00 4.51 0.0 330.202 1.636 5.76 95.94 0.0 44.12 1.85
2000-01-05 04:30:00 4.11 0.0 299.320 1.746 5.79 95.98 0.0 44.12 1.85
2000-01-05 05:00:00 3.77 0.0 299.320 2.065 6.00 96.03 0.0 44.12 1.83
2000-01-05 05:30:00 3.36 0.0 299.320 1.911 4.76 96.10 0.0 44.11 1.81

100 rows × 9 columns

ERA Imputation


source

ERAImputation

 ERAImputation ()

Initialize self. See help(type(self)) for accurate signature.

targ.data.columns
Index(['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS', 'PA', 'P', 'SWC', 'TS'], dtype='object')
era_imp = ERAImputation()
era_imp(targ)
TA SW_IN VPD PA P WS LW_IN SWC TS
time
2000-01-03 04:00:00 1.702 0.0 0.693 97.016 0.000 2.948 304.148 NaN NaN
2000-01-03 04:30:00 1.762 0.0 0.691 97.002 0.026 2.989 304.382 NaN NaN
2000-01-03 05:00:00 1.736 0.0 0.697 96.988 0.000 3.015 304.382 NaN NaN
2000-01-03 05:30:00 1.711 0.0 0.703 96.974 0.000 3.041 304.382 NaN NaN
2000-01-03 06:00:00 1.686 0.0 0.709 96.960 0.000 3.066 304.382 NaN NaN
... ... ... ... ... ... ... ... ... ...
2000-01-05 03:30:00 4.945 0.0 0.774 95.910 0.252 6.107 330.202 NaN NaN
2000-01-05 04:00:00 4.789 0.0 0.758 95.926 0.252 6.189 330.202 NaN NaN
2000-01-05 04:30:00 4.632 0.0 0.741 95.942 0.044 6.270 299.320 NaN NaN
2000-01-05 05:00:00 4.251 0.0 0.766 95.988 0.000 5.923 299.320 NaN NaN
2000-01-05 05:30:00 3.869 0.0 0.790 96.035 0.000 5.576 299.320 NaN NaN

100 rows × 9 columns

Metrics


source

MaskedMetric

 MaskedMetric (metric)

Initialize self. See help(type(self)) for accurate signature.


source

rmse

 rmse (y_true, y_pred)

source

NormalizedMetric

 NormalizedMetric (metric:__main__.MaskedMetric, mean, std)

Initialize self. See help(type(self)) for accurate signature.


source

normalize

 normalize (x, mean, std)
rmse_mask(targ, pred)
array([0.3698457])

Validation Imputations

checks that the values of the imputation makes sense

this is the average error for ERA

np.sqrt((hai['TA'] - hai_era.loc[hai.index]['TA_ERA']).pow(2).mean())
1.8708585984196093
rmse_mask(targ, ERAImputation()(targ))
array([0.29919709])
err_era = []
for _ in range(100):
    items = random.choices(dls.items, k = 1)
    targ = orig_target(dls, items)[0]
    err_era.append(rmse_mask(targ, ERAImputation()(targ)))
np.array(err_era).mean()
1.3839633283333717

Comparison

Take a variable to be filled, makes an artificial gap of given len, tries to fill with 3 methods and return metrics for each of them, repeat n_rep times

prep visualization


source

format_gap_len

 format_gap_len (gap_len:int)

Nice formatting for gap lengths

Type Details
gap_len int gap length in num observations (30 mins)
test_eq(format_gap_len(10), "5 h")
test_eq(format_gap_len(48), "1 day (24 h)")
test_eq(format_gap_len(90), "2 days (45 h)")
test_eq(format_gap_len(336), "1 week (168 h)")

source

prep_df

 prep_df (df)

Aggregation

hai.std(axis=0)['TA']
7.924610764367695
get_stats(hai)
(tensor([8.3339e+00, 1.2096e+02, 3.1150e+02, 3.3807e+00, 3.1800e+00, 9.5962e+01,
         4.3427e-02, 3.4843e+01, 7.9349e+00], dtype=torch.float64),
 tensor([  7.9246, 204.0026,  41.9557,   4.3684,   1.6254,   0.8552,   0.2803,
           8.9131,   5.6586], dtype=torch.float64))

\[ \text{RMSE}_\text{stand} = \frac{1}{\sigma}\text{RMSE}\]


source

RMSEAgg

 RMSEAgg (df)

Aggregate to rmse and normalized rmse


source

timeseriesAgg

 timeseriesAgg (targ, pred, *args)

Imp Methods


source

ImpComparison

 ImpComparison (models:pandas.core.frame.DataFrame, df, control,
                block_len, rmse=True, time_series=False)

Initialize self. See help(type(self)) for accurate signature.


source

l_model

 l_model (x, base_path=Path('/home/runner/work/meteo_imp/meteo_imp/analysi
          s/results/trained_models'))
models_var = pd.DataFrame.from_records([
    {'var': 'TA',    'model': l_model("TA_specialized_gap_6-336_v3_0.pickle",base_path)},
    {'var': 'SW_IN', 'model': l_model("SW_IN_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'LW_IN', 'model': l_model("LW_IN_specialized_gap_6-336_v1.pickle",base_path)},
    {'var': 'VPD',   'model': l_model("VPD_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'WS',    'model': l_model("WS_specialized_gap_6-336_v1.pickle",base_path)},
    {'var': 'PA',    'model': l_model("PA_specialized_gap_6-336_v3_0.pickle",base_path)},
    {'var': 'P',     'model': l_model("1_gap_varying_6-336_v3.pickle",base_path)},
    {'var': 'TS',    'model': l_model("TS_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'SWC',   'model': l_model("SWC_specialized_gap_6-336_v2_1.pickle",base_path)},
])
comp = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446)
data_results = comp.compare(gap_len = [12, 24, 48, 336], var=["TA", "SW_IN"], n_rep=3)
data_results.head()
method var gap_len idx_rep rmse rmse_stand gap_len_f
0 Kalman Filter TA 6 0 0.310825 0.039223 6 h
3 Kalman Filter TA 6 1 0.529280 0.066789 6 h
6 Kalman Filter TA 6 2 0.330725 0.041734 6 h
1 ERA-I TA 6 0 1.329099 0.167718 6 h
4 ERA-I TA 6 1 1.468124 0.185261 6 h
data_results.method.unique()
['Kalman Filter', 'ERA-I', 'MDS']
Categories (3, object): ['Kalman Filter' < 'ERA-I' < 'MDS']
data_results.columns
Index(['method', 'var', 'gap_len', 'idx_rep', 'rmse', 'rmse_stand',
       'gap_len_f'],
      dtype='object')

Kalman Filters


source

KalmanImpComparison

 KalmanImpComparison (models:pandas.core.frame.DataFrame,
                      df:pandas.core.frame.DataFrame,
                      control:pandas.core.frame.DataFrame, block_len:int,
                      rmse:bool=True, time_series:bool=False)

Compare different Kalman filters

Type Default Details
models DataFrame model, gap_single_var,
df DataFrame
control DataFrame
block_len int
rmse bool True
time_series bool False
models_nc = pd.DataFrame({'model': [ l_model("1_gap_varying_336_no_control_v1.pickle"), l_model("1_gap_varying_6-336_v3.pickle")],
                          'type':   [ 'No Control',                                       'Use Control'                         ]})
models_nc
model type
0 Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 No Control
1 Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Use Control
kcomp = KalmanImpComparison(models_nc, hai, hai_era, 100)
k_results = kcomp.compare(n_rep =3, gap_len = [6, 12, 24], var = list(hai.columns))
k_results
var loss likelihood gap_len idx_rep type rmse rmse_stand gap_len_f
0 TA -3.063759 1.762383 3 0 No Control 0.966638 0.121979 3 h
1 TA -3.956292 2.204462 3 1 No Control 0.354328 0.044712 3 h
2 TA -3.164076 1.947664 3 2 No Control 0.891858 0.112543 3 h
3 TA -5.133151 2.662633 3 0 Use Control 0.444229 0.056057 3 h
4 TA -5.387288 2.626224 3 1 Use Control 0.209779 0.026472 3 h
... ... ... ... ... ... ... ... ... ...
1 TS -1.180353 1.672593 12 1 No Control 0.677688 0.119762 12 h
2 TS 0.954142 1.668745 12 2 No Control 1.172264 0.207163 12 h
3 TS 1.036145 1.528941 12 0 Use Control 0.994793 0.175801 12 h
4 TS 1.770240 1.534406 12 1 Use Control 1.173229 0.207334 12 h
5 TS 3.380553 1.458859 12 2 Use Control 1.446071 0.255551 12 h

162 rows × 9 columns

Plotting

Utils

Faceting


source

facet_wrap

 facet_wrap (data:pandas.core.frame.DataFrame, plot_fn, col:str,
             y_labels:list[str]|None=None, n_cols=3,
             y_resolve='independent')
Type Default Details
data DataFrame full dataset
plot_fn function that makes the plot, takes 2 arguments: data and y_label
col str column to facet
y_labels list[str] | None None custom labels y axis
n_cols int 3
y_resolve str independent

source

facet_grid

 facet_grid (data:pandas.core.frame.DataFrame, plot_fn, col:str, row:str,
             y_labels:list[str]|None=None)
Type Default Details
data DataFrame full dataset
plot_fn function that makes the plot, takes 2 arguments: data and y_label
col str column to facet,
row str
y_labels list[str] | None None custom labels y axis
from itertools import product
test_data = pd.DataFrame(list(product(['0','1'], ['a', 'b'])), columns = ['row', 'col'])
test_data['text'] = test_data.row + test_data.col
def test_plot(data, *args): return alt.Chart(data).mark_text().encode(text='text')
facet_wrap(test_data, test_plot, col = 'row')
facet_grid(test_data, test_plot, col = 'col', row='row')

Format


source

PlotFormatter

 PlotFormatter (font_size:int=18, legend_font_size:int=18,
                title_font_size:int=20)

Format altair plot by setting font sizes/legend position


pipe’]

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

The Plot

def remove_outliers(data, var, groupby):
    """Simple but maybe computationally inefficent to remove outliers from boxplot """
def custom_box_plot(data, x, y, color, xoffset):
    """Boxplot for altair without outliers"""
    bar = alt.Chart(data).mark_bar().encode(
            x=x,
            y='q1:Q',
            y2='q2:Q',
            color=color,
            xoffset=xoffset
        ).transform_aggregate(
            q1=f'q1({y})',
            q3=f'q1({y})',
            groupby=[color, xoffset]
    )
    
    return bar
# custom_box_plot(data_results, 'gap_len', 'rmse', 'method', 'method')

source

the_plot

 the_plot (data)
the_plot(data_results)

source

the_plot_stand

 the_plot_stand (data)
the_plot_stand(data_results)

source

the_plot_stand2

 the_plot_stand2 (data)
dict(zip(scale_meteo.domain, scale_meteo.range))
{'TA': '#1B9E77',
 'SW_IN': '#D95F02',
 'LW_IN': '#7570B3',
 'VPD': '#E7298A',
 'WS': '#66A61E',
 'PA': '#E6AB02',
 'SWC': '#A6761D',
 'TS': '#666666'}
the_plot_stand2(data_results)
alt.Color('color')
Color({
  shorthand: 'color'
})
x = alt.X('gap_len'); color = alt.Color('method'); xOffset = alt.XOffset('method'); y = alt.Y('rmse_stand')
data_results.groupby([x.shorthand, color.shorthand, xOffset.shorthand]).agg({y.shorthand: ['median', lambda x: x.quantile(.25), lambda x: x.quantile(.75)]}).columns
MultiIndex([('rmse_stand',     'median'),
            ('rmse_stand', '<lambda_0>'),
            ('rmse_stand', '<lambda_1>')],
           )
data_results
method var gap_len idx_rep rmse rmse_stand gap_len_f
0 Kalman Filter TA 6 0 0.310825 0.039223 6 h
3 Kalman Filter TA 6 1 0.529280 0.066789 6 h
6 Kalman Filter TA 6 2 0.330725 0.041734 6 h
1 ERA-I TA 6 0 1.329099 0.167718 6 h
4 ERA-I TA 6 1 1.468124 0.185261 6 h
... ... ... ... ... ... ... ...
4 ERA-I SW_IN 168 1 24.792212 0.121529 1 week (168 h)
7 ERA-I SW_IN 168 2 108.398313 0.531358 1 week (168 h)
2 MDS SW_IN 168 0 134.669370 0.660136 1 week (168 h)
5 MDS SW_IN 168 1 31.500833 0.154414 1 week (168 h)
8 MDS SW_IN 168 2 149.125227 0.730997 1 week (168 h)

72 rows × 7 columns

data_results.groupby(['gap_len', 'method', 'var']).agg(
        median = pd.NamedAgg(y.shorthand, 'median'),
        q1 = pd.NamedAgg(y.shorthand,lambda x: x.quantile(.25)),
        q3 = pd.NamedAgg(y.shorthand,lambda x: x.quantile(.75))).reset_index()
gap_len method var median q1 q3
0 6 Kalman Filter TA 0.041734 0.040478 0.054262
1 6 Kalman Filter SW_IN 0.123723 0.105794 0.215405
2 6 Kalman Filter LW_IN NaN NaN NaN
3 6 Kalman Filter VPD NaN NaN NaN
4 6 Kalman Filter WS NaN NaN NaN
... ... ... ... ... ... ...
103 168 MDS WS NaN NaN NaN
104 168 MDS PA NaN NaN NaN
105 168 MDS P NaN NaN NaN
106 168 MDS SWC NaN NaN NaN
107 168 MDS TS NaN NaN NaN

108 rows × 6 columns


source

custom_boxplot_nooutlier

 custom_boxplot_nooutlier (data:pandas.core.frame.DataFrame,
                           x:altair.vegalite.v5.schema.channels.X,
                           y:altair.vegalite.v5.schema.channels.Y,
                           color:altair.vegalite.v5.schema.channels.Color,
                           xOffset:altair.vegalite.v5.schema.channels.XOff
                           set)

source

the_plot_stand3

 the_plot_stand3 (data)
the_plot_stand3(data_results)

violin plots don’t work

def _the_plot_violin(data, y_label):
        return alt.Chart(data).mark_area(orient='horizontal').encode(
        x = alt.X('density:Q', title='gap_len [h]', axis=alt.Axis(labelAngle=0)),
        y = alt.Y('rmse', title=y_label, scale=alt.Scale(domainMin=-.2 *data['rmse'].mean())),
        color=alt.Color('method:N', scale=alt.Scale(domain=["Kalman Filter", "ERA-I", "MDS"], scheme='dark2')),
        xOffset=alt.XOffset('method', scale=alt.Scale(domain=["Kalman Filter", "ERA-I", "MDS"])),
        column=alt.Column('gap_len')
    ).transform_density(
    'rmse',
    as_=['rmse', 'density'],
    groupby=['method', 'gap_len']
).properties(width=250, height=200)
facet_wrap(data_results, _the_plot_violin, "var", y_labels = [f"RMSE {var} [{units_big[var]}]" for var in data_results['var'].unique()])

Gap Length

gap_len = KalmanImpComparison(models_var, hai, hai_era, block_len=100).compare(gap_len = [2,10,30], var=['TA', 'SW_IN', 'VPD'], n_rep=3)
gap_len.gap_len_f
0     1 h
1     1 h
2     1 h
0     5 h
1     5 h
2     5 h
0    15 h
1    15 h
2    15 h
0     1 h
1     1 h
2     1 h
0     5 h
1     5 h
2     5 h
0    15 h
1    15 h
2    15 h
0     1 h
1     1 h
2     1 h
0     5 h
1     5 h
2     5 h
0    15 h
1    15 h
2    15 h
Name: gap_len_f, dtype: category
Categories (3, object): ['1 h' < '5 h' < '15 h']

source

agg_gap_len

 agg_gap_len (data)
gap_len_agg = agg_gap_len(gap_len)
gap_len_agg
var gap_len median Q1 Q3
0 TA 1 0.139298 0.117130 0.212692
1 TA 5 0.415087 0.307310 0.419030
2 TA 15 0.339634 0.332259 0.668379
3 SW_IN 1 10.995754 7.385527 15.342193
4 SW_IN 5 41.343274 29.328629 46.677302
5 SW_IN 15 22.880862 20.480137 29.897607
6 LW_IN 1 NaN NaN NaN
7 LW_IN 5 NaN NaN NaN
8 LW_IN 15 NaN NaN NaN
9 VPD 1 0.151389 0.142296 0.161319
10 VPD 5 0.147326 0.080502 0.581390
11 VPD 15 0.544413 0.367209 1.145505
12 WS 1 NaN NaN NaN
13 WS 5 NaN NaN NaN
14 WS 15 NaN NaN NaN
15 PA 1 NaN NaN NaN
16 PA 5 NaN NaN NaN
17 PA 15 NaN NaN NaN
18 P 1 NaN NaN NaN
19 P 5 NaN NaN NaN
20 P 15 NaN NaN NaN
21 SWC 1 NaN NaN NaN
22 SWC 5 NaN NaN NaN
23 SWC 15 NaN NaN NaN
24 TS 1 NaN NaN NaN
25 TS 5 NaN NaN NaN
26 TS 15 NaN NaN NaN
rmse_df = _get_era_rmse(hai, hai_era)
rmse_df
var era_rmse
0 TA 1.870859
1 SW_IN 76.320335
2 VPD 2.021377
3 PA 0.104781
4 P 0.293058
5 WS 1.207949
6 LW_IN 20.204070
gap_len_agg
var gap_len median Q1 Q3
0 TA 1 0.139298 0.117130 0.212692
1 TA 5 0.415087 0.307310 0.419030
2 TA 15 0.339634 0.332259 0.668379
3 SW_IN 1 10.995754 7.385527 15.342193
4 SW_IN 5 41.343274 29.328629 46.677302
5 SW_IN 15 22.880862 20.480137 29.897607
6 LW_IN 1 NaN NaN NaN
7 LW_IN 5 NaN NaN NaN
8 LW_IN 15 NaN NaN NaN
9 VPD 1 0.151389 0.142296 0.161319
10 VPD 5 0.147326 0.080502 0.581390
11 VPD 15 0.544413 0.367209 1.145505
12 WS 1 NaN NaN NaN
13 WS 5 NaN NaN NaN
14 WS 15 NaN NaN NaN
15 PA 1 NaN NaN NaN
16 PA 5 NaN NaN NaN
17 PA 15 NaN NaN NaN
18 P 1 NaN NaN NaN
19 P 5 NaN NaN NaN
20 P 15 NaN NaN NaN
21 SWC 1 NaN NaN NaN
22 SWC 5 NaN NaN NaN
23 SWC 15 NaN NaN NaN
24 TS 1 NaN NaN NaN
25 TS 5 NaN NaN NaN
26 TS 15 NaN NaN NaN
gap_len_agg = pd.merge(gap_len_agg, rmse_df, on='var')
_plot_gap_len(gap_len_agg[gap_len_agg['var'] == 'TA'], "label")
facet_wrap(gap_len_agg, _plot_gap_len, 'var')

source

plot_gap_len

 plot_gap_len (data, df, control)
plot_gap_len(gap_len, hai, hai_era)

Compare

plotting for comparing performance of 2 or more different conditions


source

plot_compare

 plot_compare (data:pandas.core.frame.DataFrame, compare:str,
               y:str='rmse_stand', scale_domain:Optional[Sequence]=None,
               y_labels:Optional[Sequence]=None)
plot_compare(k_results, compare="type")

can also show different variables on the y, like the loss of the likelihood

plot_compare(k_results, compare="type", y="loss")

Timeseries

comp_ts = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, rmse=False, time_series = True)
res_ts = comp_ts.compare(gap_len = [12,24,336], var=["TA", 'SW_IN'], n_rep=1)
res_ts.columns
Index(['method', 'var', 'gap_len', 'idx_rep', 'pred', 'targ', 'gap_len_f'], dtype='object')
test_ts = res_ts.query('method == "Kalman Filter" and var == "TA"')
row_res = test_ts.iloc[0]
pred_all = row_res.pred[0]
targ =  row_res.targ[0]
var = row_res['var']
type(pred_all)
meteo_imp.kalman.training.NormalsDf
targ.mask[var]
time
2000-09-21 20:30:00    True
2000-09-21 21:00:00    True
2000-09-21 21:30:00    True
2000-09-21 22:00:00    True
2000-09-21 22:30:00    True
                       ... 
2000-10-01 01:00:00    True
2000-10-01 01:30:00    True
2000-10-01 02:00:00    True
2000-10-01 02:30:00    True
2000-10-01 03:00:00    True
Name: TA, Length: 446, dtype: bool
list(row_res.index)
['method', 'var', 'gap_len', 'idx_rep', 'pred', 'targ', 'gap_len_f']
np.argmin(row_res.targ[0].mask['TA'])
217
row_res.targ[0].mask['TA'].index[47]
Timestamp('2000-09-22 20:00:00')

source

unnest_predictions

 unnest_predictions (row_res:pandas.core.series.Series, ctx_len:int=50)

Unnest predictions/target for each gap to plot timeseries

unnest_predictions(row_res)
time mean std measurement is_present method var gap_len idx_rep gap_len_f
0 2000-09-25 23:30:00 NaN NaN 10.32 True Kalman Filter TA 6 0 6 h
1 2000-09-26 00:00:00 NaN NaN 10.57 True Kalman Filter TA 6 0 6 h
2 2000-09-26 00:30:00 NaN NaN 10.96 True Kalman Filter TA 6 0 6 h
3 2000-09-26 01:00:00 NaN NaN 10.85 True Kalman Filter TA 6 0 6 h
4 2000-09-26 01:30:00 NaN NaN 10.72 True Kalman Filter TA 6 0 6 h
5 2000-09-26 02:00:00 NaN NaN 10.62 True Kalman Filter TA 6 0 6 h
6 2000-09-26 02:30:00 NaN NaN 10.44 True Kalman Filter TA 6 0 6 h
7 2000-09-26 03:00:00 NaN NaN 11.07 True Kalman Filter TA 6 0 6 h
8 2000-09-26 03:30:00 NaN NaN 11.92 True Kalman Filter TA 6 0 6 h
9 2000-09-26 04:00:00 NaN NaN 11.56 True Kalman Filter TA 6 0 6 h
10 2000-09-26 04:30:00 NaN NaN 11.05 True Kalman Filter TA 6 0 6 h
11 2000-09-26 05:00:00 NaN NaN 10.55 True Kalman Filter TA 6 0 6 h
12 2000-09-26 05:30:00 NaN NaN 10.90 True Kalman Filter TA 6 0 6 h
13 2000-09-26 06:00:00 NaN NaN 10.54 True Kalman Filter TA 6 0 6 h
14 2000-09-26 06:30:00 NaN NaN 9.09 True Kalman Filter TA 6 0 6 h
15 2000-09-26 07:00:00 NaN NaN 8.95 True Kalman Filter TA 6 0 6 h
16 2000-09-26 07:30:00 NaN NaN 9.12 True Kalman Filter TA 6 0 6 h
17 2000-09-26 08:00:00 NaN NaN 9.39 True Kalman Filter TA 6 0 6 h
18 2000-09-26 08:30:00 NaN NaN 9.94 True Kalman Filter TA 6 0 6 h
19 2000-09-26 09:00:00 10.423968 0.756711 10.73 False Kalman Filter TA 6 0 6 h
20 2000-09-26 09:30:00 10.546502 0.819027 10.56 False Kalman Filter TA 6 0 6 h
21 2000-09-26 10:00:00 10.915743 0.869077 10.80 False Kalman Filter TA 6 0 6 h
22 2000-09-26 10:30:00 11.531336 0.909095 11.35 False Kalman Filter TA 6 0 6 h
23 2000-09-26 11:00:00 12.373439 0.937222 12.23 False Kalman Filter TA 6 0 6 h
24 2000-09-26 11:30:00 13.597164 0.951751 13.83 False Kalman Filter TA 6 0 6 h
25 2000-09-26 12:00:00 14.629994 0.951881 14.81 False Kalman Filter TA 6 0 6 h
26 2000-09-26 12:30:00 15.043486 0.937641 14.92 False Kalman Filter TA 6 0 6 h
27 2000-09-26 13:00:00 15.080051 0.909788 15.01 False Kalman Filter TA 6 0 6 h
28 2000-09-26 13:30:00 15.326227 0.869610 15.40 False Kalman Filter TA 6 0 6 h
29 2000-09-26 14:00:00 15.859726 0.817723 15.92 False Kalman Filter TA 6 0 6 h
30 2000-09-26 14:30:00 16.001325 0.750160 15.90 False Kalman Filter TA 6 0 6 h
31 2000-09-26 15:00:00 NaN NaN 16.40 True Kalman Filter TA 6 0 6 h
32 2000-09-26 15:30:00 NaN NaN 16.75 True Kalman Filter TA 6 0 6 h
33 2000-09-26 16:00:00 NaN NaN 16.21 True Kalman Filter TA 6 0 6 h
34 2000-09-26 16:30:00 NaN NaN 14.87 True Kalman Filter TA 6 0 6 h
35 2000-09-26 17:00:00 NaN NaN 14.37 True Kalman Filter TA 6 0 6 h
36 2000-09-26 17:30:00 NaN NaN 13.75 True Kalman Filter TA 6 0 6 h
37 2000-09-26 18:00:00 NaN NaN 13.44 True Kalman Filter TA 6 0 6 h
38 2000-09-26 18:30:00 NaN NaN 13.13 True Kalman Filter TA 6 0 6 h
39 2000-09-26 19:00:00 NaN NaN 13.09 True Kalman Filter TA 6 0 6 h
40 2000-09-26 19:30:00 NaN NaN 13.06 True Kalman Filter TA 6 0 6 h
41 2000-09-26 20:00:00 NaN NaN 13.15 True Kalman Filter TA 6 0 6 h
42 2000-09-26 20:30:00 NaN NaN 13.39 True Kalman Filter TA 6 0 6 h
43 2000-09-26 21:00:00 NaN NaN 13.47 True Kalman Filter TA 6 0 6 h
44 2000-09-26 21:30:00 NaN NaN 13.48 True Kalman Filter TA 6 0 6 h
45 2000-09-26 22:00:00 NaN NaN 13.28 True Kalman Filter TA 6 0 6 h
46 2000-09-26 22:30:00 NaN NaN 13.48 True Kalman Filter TA 6 0 6 h
47 2000-09-26 23:00:00 NaN NaN 13.49 True Kalman Filter TA 6 0 6 h
48 2000-09-26 23:30:00 NaN NaN 13.60 True Kalman Filter TA 6 0 6 h
49 2000-09-27 00:00:00 NaN NaN 13.77 True Kalman Filter TA 6 0 6 h
50 2000-09-27 00:30:00 NaN NaN 14.04 True Kalman Filter TA 6 0 6 h
res_ts_plot = pd.concat([unnest_predictions(row) for _,row in res_ts.iterrows()])
plot_points(res_ts_plot.query('var == "TA" and method == "Kalman Filter" and idx_rep == 0 and gap_len==6.'), y= "measurement")
plot_missing_area(res_ts_plot.query('var == "TA" and method == "Kalman Filter" and idx_rep == 0 and gap_len==6.'))
plot_line(res_ts_plot.query('var == "TA" and idx_rep == 0 and gap_len==6.'), y= "mean", color='method', scale=alt.Scale())
plot_error(res_ts_plot.query('var == "TA" and idx_rep == 0 and gap_len==6.').copy(), y= "mean", color='method', scale=alt.Scale())
_plot_timeseries(res_ts_plot.query('var == "TA" and idx_rep == 0 and gap_len==6.').copy(), "TA")
_plot_timeseries(res_ts_plot.query('var == "SW_IN" and idx_rep == 0 and gap_len==12.').copy(), "SW_IN")
data_ts = pd.concat([unnest_predictions(row) for _,row in res_ts.iterrows()])

source

plot_timeseries

 plot_timeseries (data, idx_rep:int|None=None, gap_len:int|None=None,
                  max_idx:int=3, ctx_len={6.0: 50, 12.0: 50, 168.0: 384},
                  scale_color=Scale({   domain: ['Kalman Filter', 'ERA-I',
                  'MDS'],   scheme: 'dark2' }), compare='method')
plot_timeseries(res_ts, idx_rep='random')

Scatter plot

_plot_scatter(data_ts.query('var == "TA" and gap_len==12. and idx_rep ==0').copy(), x="measurement", y= "mean", color='method', scale=alt.Scale())

Table

The Table

A table where in the rows there is

t = data_results.groupby(['method', 'var', 'gap_len']).agg({'rmse': ['mean', 'std']}).unstack(level=0)

t_idx = t.columns.droplevel()
t_idx.names = ['RMSE', None]

t2 = t.copy()
t2.columns = t_idx

t2 = t2.sort_index(axis=1, level=1).swaplevel(axis=1)
t2
Kalman Filter ERA-I MDS
RMSE mean std mean std mean std
var gap_len
TA 6 0.390277 0.120791 1.432389 0.090855 2.153796 0.638164
12 0.971174 0.714823 1.421383 0.645727 3.069420 1.685032
24 0.543269 0.045233 1.618961 0.708207 2.585667 1.306159
168 0.818625 0.205868 1.691291 0.321707 3.811197 0.459708
SW_IN 6 35.270434 23.989000 35.687127 61.811917 40.526401 67.768649
12 65.749867 58.475011 60.069077 98.779015 64.785358 86.461266
24 44.446448 19.486994 55.218478 41.980449 72.847162 75.033947
168 61.289860 37.403341 74.600240 44.042412 105.098477 64.145949
LW_IN 6 NaN NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN
168 NaN NaN NaN NaN NaN NaN
VPD 6 NaN NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN
168 NaN NaN NaN NaN NaN NaN
WS 6 NaN NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN
168 NaN NaN NaN NaN NaN NaN
PA 6 NaN NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN
168 NaN NaN NaN NaN NaN NaN
P 6 NaN NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN
168 NaN NaN NaN NaN NaN NaN
SWC 6 NaN NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN
168 NaN NaN NaN NaN NaN NaN
TS 6 NaN NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN
168 NaN NaN NaN NaN NaN NaN
row = t2.iloc[0]
row
               RMSE
Kalman Filter  mean    0.390277
               std     0.120791
ERA-I          mean    1.432389
               std     0.090855
MDS            mean    2.153796
               std     0.638164
Name: (TA, 6), dtype: float64
np.argmin(row.iloc[[0,2,4]]) * 2
0

source

style_the_table

 style_the_table (style, cols=[0, 2, 4])

source

highlight_min_method

 highlight_min_method (row, props, cols)
# #| export
# renames_table_latex = {name: f"\parbox{{2.1cm}}{{{val}}}" for name, val in 
#                  {'SW_IN': "Shortwave radiation incoming \\textbf{SW IN} [\si{W/m^2}]",
#                'LW_IN': 'Longwave radiation incoming \\textbf{LW IN} [$W/m^2$]',
#                'TA': "Air Temperature \\textbf{TA} [$°C$]",
#                'VPD': "Vapuour Pressure Deficit \\textbf{VPD} [$hPa$]",
#                'PA': "Air Pressure \\textbf{PA} [$hPa$]",
#                'P': "Precipitation \\textbf{P} [$mm$]",
#                'WS': "Wind Speed \\textbf{WS} [$m/s$]",
#           }.items()}
t2.style.pipe(style_the_table)
    Kalman Filter ERA-I MDS
  RMSE mean std mean std mean std
var gap_len            
TA 6 0.390 0.121 1.432 0.091 2.154 0.638
12 0.971 0.715 1.421 0.646 3.069 1.685
24 0.543 0.045 1.619 0.708 2.586 1.306
168 0.819 0.206 1.691 0.322 3.811 0.460
SW_IN 6 35.270 23.989 35.687 61.812 40.526 67.769
12 65.750 58.475 60.069 98.779 64.785 86.461
24 44.446 19.487 55.218 41.980 72.847 75.034
168 61.290 37.403 74.600 44.042 105.098 64.146
LW_IN 6 - - - - - -
12 - - - - - -
24 - - - - - -
168 - - - - - -
VPD 6 - - - - - -
12 - - - - - -
24 - - - - - -
168 - - - - - -
WS 6 - - - - - -
12 - - - - - -
24 - - - - - -
168 - - - - - -
PA 6 - - - - - -
12 - - - - - -
24 - - - - - -
168 - - - - - -
P 6 - - - - - -
12 - - - - - -
24 - - - - - -
168 - - - - - -
SWC 6 - - - - - -
12 - - - - - -
24 - - - - - -
168 - - - - - -
TS 6 - - - - - -
12 - - - - - -
24 - - - - - -
168 - - - - - -
t3 = t2.rename(index = renames_table_latex).style.apply(highlight_min_method, props="font-weight: bold", axis=1, cols=[0,2,4]).format_index(precision=0).format(precision=3)
t3
    Kalman Filter ERA-I MDS
  RMSE mean std mean std mean std
var gap_len            
\parbox{2.1cm}{\textbf{TA} [\si{°C}]} 6 0.390 0.121 1.432 0.091 2.154 0.638
12 0.971 0.715 1.421 0.646 3.069 1.685
24 0.543 0.045 1.619 0.708 2.586 1.306
168 0.819 0.206 1.691 0.322 3.811 0.460
\parbox{2.1cm}{\textbf{SW\_IN} [\si{W/m^2}]} 6 35.270 23.989 35.687 61.812 40.526 67.769
12 65.750 58.475 60.069 98.779 64.785 86.461
24 44.446 19.487 55.218 41.980 72.847 75.034
168 61.290 37.403 74.600 44.042 105.098 64.146
\parbox{2.1cm}{\textbf{LW\_IN} [\si{W/m^2}]} 6 nan nan nan nan nan nan
12 nan nan nan nan nan nan
24 nan nan nan nan nan nan
168 nan nan nan nan nan nan
\parbox{2.1cm}{\textbf{VPD} [\si{hPa}]} 6 nan nan nan nan nan nan
12 nan nan nan nan nan nan
24 nan nan nan nan nan nan
168 nan nan nan nan nan nan
\parbox{2.1cm}{\textbf{WS} [\si{m/s}]} 6 nan nan nan nan nan nan
12 nan nan nan nan nan nan
24 nan nan nan nan nan nan
168 nan nan nan nan nan nan
\parbox{2.1cm}{\textbf{PA} [\si{hPa}]} 6 nan nan nan nan nan nan
12 nan nan nan nan nan nan
24 nan nan nan nan nan nan
168 nan nan nan nan nan nan
\parbox{2.1cm}{\textbf{P} [\si{mm}]} 6 nan nan nan nan nan nan
12 nan nan nan nan nan nan
24 nan nan nan nan nan nan
168 nan nan nan nan nan nan
\parbox{2.1cm}{\textbf{SWC} [\si{\%}]} 6 nan nan nan nan nan nan
12 nan nan nan nan nan nan
24 nan nan nan nan nan nan
168 nan nan nan nan nan nan
\parbox{2.1cm}{\textbf{TS} [\si{°C}]} 6 nan nan nan nan nan nan
12 nan nan nan nan nan nan
24 nan nan nan nan nan nan
168 nan nan nan nan nan nan
print(t3.to_latex(convert_css=True, hrules=True, clines="skip-last;data", column_format="p{2.1cm}c|rr|rr|rr", caption="caption", label="table"))
\begin{table}
\caption{caption}
\label{table}
\begin{tabular}{p{2.1cm}c|rr|rr|rr}
\toprule
 &  & \multicolumn{2}{r}{Kalman Filter} & \multicolumn{2}{r}{ERA-I} & \multicolumn{2}{r}{MDS} \\
 & RMSE & mean & std & mean & std & mean & std \\
var & gap_len &  &  &  &  &  &  \\
\midrule
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{TA} [\si{°C}]}} & 6 & \bfseries 0.390 & 0.121 & 1.432 & 0.091 & 2.154 & 0.638 \\
 & 12 & \bfseries 0.971 & 0.715 & 1.421 & 0.646 & 3.069 & 1.685 \\
 & 24 & \bfseries 0.543 & 0.045 & 1.619 & 0.708 & 2.586 & 1.306 \\
 & 168 & \bfseries 0.819 & 0.206 & 1.691 & 0.322 & 3.811 & 0.460 \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{SW\_IN} [\si{W/m^2}]}} & 6 & \bfseries 35.270 & 23.989 & 35.687 & 61.812 & 40.526 & 67.769 \\
 & 12 & 65.750 & 58.475 & \bfseries 60.069 & 98.779 & 64.785 & 86.461 \\
 & 24 & \bfseries 44.446 & 19.487 & 55.218 & 41.980 & 72.847 & 75.034 \\
 & 168 & \bfseries 61.290 & 37.403 & 74.600 & 44.042 & 105.098 & 64.146 \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{LW\_IN} [\si{W/m^2}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{VPD} [\si{hPa}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{WS} [\si{m/s}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{PA} [\si{hPa}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{P} [\si{mm}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{SWC} [\si{\%}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{TS} [\si{°C}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\bottomrule
\end{tabular}
\end{table}
gap_len

source

the_table

 the_table (data, y='rmse', y_name='RMSE')
the_table(data_results)

source

the_table_latex

 the_table_latex (table, file, caption='', label='', stand=False)
the_table_latex(the_table(data_results), "test_table.tex")

Table compare


source

table_compare

 table_compare (data, compare:str, y='rmse_stand', compare_ascending=True)
table_compare(k_results, 'type')

source

table_compare_latex

 table_compare_latex (table, file, caption='', label='')

Table Compare 3

same of above but for comparing 3 options instead of 2


source

table_compare3

 table_compare3 (data, compare:str, y='rmse_stand',
                 compare_ascending=True)

source

table_compare3_latex

 table_compare3_latex (table, file, caption='', label='')

Gap len table

g = gap_len.groupby(['var', 'gap_len']).agg({'rmse': ['mean']})
g
(g.droplevel(level=0, axis=1)
 .reset_index()
 .melt(id_vars=['var', 'gap_len'], var_name='rmse')
 .pivot(index = ['var', 'rmse'], columns=['gap_len'])
 .droplevel(level=0, axis=1)
)

source

table_gap_len

 table_gap_len (data, y='rmse_stand')
table_gap_len(gap_len)

source

table_gap_len_latex

 table_gap_len_latex (table, file, caption='', label='')
[f"{col:.0f}" for col in list(table_gap_len(gap_len).columns)]
from tempfile import tempdir
from pathlib import Path
table_gap_len_latex(table_gap_len(gap_len), Path(tempdir) / "test_table.tex")