Results

functions for plotting results and comparing Kalman Filter with MDS and ERA

Setup

import altair as alt

import altair

load data to test functions

reset_seed()

hai = pd.read_parquet(hai_big_path)
hai_era = pd.read_parquet(hai_era_big_path)

dls = imp_dataloader(hai, hai_era, var_sel = 'TA', gap_len=5, block_len=100, control_lags = [1], n_rep=1, bs=1).cpu()

item = MeteoImpItem(1,2, 'TA', gap_len=10)
items = [item]

targ = orig_target(dls, items)[0]

input, _ = one_batch_with_items(dls, items)

Imputation Methods

Kalman Filter

take a MeteoImp Df and gap fill it using the given model

model = torch.load(here("analysis/results/trained_models/1_gap_varying_6-336_v1.pickle"))

preds, targs = predict_items(model, dls=dls, items = items)
preds[0].mean[targs[0].mask] = targs[0].data[targs[0].mask]

KalmanImputation

input[1].shape

torch.Size([1, 100, 9])

preds_raw = model(input)

len(preds_raw.mean[0])

_extract_var(preds_raw.mean, 1, 7)

[[tensor([-0.7555], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.7765], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8153], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8653], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8660], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8712], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8551], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8420], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8442], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([-0.8536], dtype=torch.float64, grad_fn=<IndexBackward0>)]]

_extract_var(preds_raw.cov, 1, 7)

[[tensor([[0.0227]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0307]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0359]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0392]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0409]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0409]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0393]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0361]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0310]], dtype=torch.float64, grad_fn=<IndexBackward0>),
  tensor([[0.0230]], dtype=torch.float64, grad_fn=<IndexBackward0>)]]

items_all = [MeteoImpItem(1,2, list(hai.columns), gap_len=3)] * 3

input_all, _ = one_batch_with_items(dls, items_all)

preds_raw_all = model(input_all)

_extract_var(preds_raw_all.mean, 3, 9)

[[tensor([-0.5650], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.5887], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.6259], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([-0.5650], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.5887], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.6259], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([-0.5650], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.5887], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([-0.6259], dtype=torch.float64, grad_fn=<SliceBackward0>)]]

buffer_pred(preds_raw_all.mean, input_all[1]).shape

torch.Size([3, 100, 9])

_extract_var(preds_raw_all.cov, 3, 9)

[[tensor([[0.1189]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1717]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1251]], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([[0.1189]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1717]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1251]], dtype=torch.float64, grad_fn=<SliceBackward0>)],
 [tensor([[0.1189]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1717]], dtype=torch.float64, grad_fn=<SliceBackward0>),
  tensor([[0.1251]], dtype=torch.float64, grad_fn=<SliceBackward0>)]]

source

PredictLossVar

 PredictLossVar (only_gap:bool, var:int)

loss (negative log likelihood) for only for one variable for each batch

PredictLossVar(True, 3)(preds_raw_all, input_all)

[tensor(-0.2252, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(-0.2252, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(-0.2252, dtype=torch.float64, grad_fn=<MeanBackward0>)]

input_all[1].shape

torch.Size([3, 100, 9])

source

PredictLikelihoodVar

 PredictLikelihoodVar (only_gap:bool, var:int)

mean between timesteps of Likelihood for only for one variable for each batch

PredictLikelihoodVar(True, 3)(preds_raw_all, input_all)

[tensor(1.1569, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(1.1569, dtype=torch.float64, grad_fn=<MeanBackward0>),
 tensor(1.1569, dtype=torch.float64, grad_fn=<MeanBackward0>)]

source

MultiMetrics

 MultiMetrics (**metrics)

Initialize self. See help(type(self)) for accurate signature.

source

KalmanImputation

 KalmanImputation (model)

Initialize self. See help(type(self)) for accurate signature.

k_imp = KalmanImputation(model)

pred = k_imp(item, dls=dls)

display_as_row({'pred': pred[45:55],'data': targ.data[45:55], 'mask': targ.mask[45:55]}, hide_idx=False)

pred

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-04 02:30:00	2.3467	0.0000	304.7310	0.9520	5.3900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:00:00	2.1803	0.0000	304.7310	0.9710	5.5900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:30:00	1.8728	0.0000	304.7310	0.9580	5.6700	96.2900	0.0000	43.0800	1.3100
2000-01-04 04:00:00	1.4769	0.0000	304.7310	0.9380	6.6100	96.2600	0.0000	43.0800	1.3100
2000-01-04 04:30:00	1.4710	0.0000	315.0050	0.7850	6.6500	96.2600	0.0000	43.0800	1.3100
2000-01-04 05:00:00	1.4303	0.0000	315.0050	0.5290	6.1200	96.2700	0.0000	43.0800	1.3000
2000-01-04 05:30:00	1.5573	0.0000	315.0050	0.4500	5.5800	96.2600	0.2900	43.1300	1.3000
2000-01-04 06:00:00	1.6616	0.0000	315.0050	0.4990	4.7000	96.2500	0.0000	43.1300	1.2900
2000-01-04 06:30:00	1.6438	0.0000	315.0050	0.3820	5.0700	96.2300	0.0000	43.1300	1.2900
2000-01-04 07:00:00	1.5697	0.0000	315.0050	0.3130	5.2300	96.2400	0.0000	43.1300	1.2900

data

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-04 02:30:00	2.1900	0.0000	304.7310	0.9520	5.3900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:00:00	2.2700	0.0000	304.7310	0.9710	5.5900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:30:00	2.3200	0.0000	304.7310	0.9580	5.6700	96.2900	0.0000	43.0800	1.3100
2000-01-04 04:00:00	2.3400	0.0000	304.7310	0.9380	6.6100	96.2600	0.0000	43.0800	1.3100
2000-01-04 04:30:00	2.2400	0.0000	315.0050	0.7850	6.6500	96.2600	0.0000	43.0800	1.3100
2000-01-04 05:00:00	2.0000	0.0000	315.0050	0.5290	6.1200	96.2700	0.0000	43.0800	1.3000
2000-01-04 05:30:00	1.9400	0.0000	315.0050	0.4500	5.5800	96.2600	0.2900	43.1300	1.3000
2000-01-04 06:00:00	2.0700	0.0000	315.0050	0.4990	4.7000	96.2500	0.0000	43.1300	1.2900
2000-01-04 06:30:00	2.0400	0.0000	315.0050	0.3820	5.0700	96.2300	0.0000	43.1300	1.2900
2000-01-04 07:00:00	2.0300	0.0000	315.0050	0.3130	5.2300	96.2400	0.0000	43.1300	1.2900

mask

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-04 02:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 03:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 03:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 04:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 04:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 05:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 05:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 06:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 06:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 07:00:00	False	True	True	True	True	True	True	True	True

Kalman Imputation - Specialized Variables

source

KalmanImputationVar

 KalmanImputationVar (models)

Initialize self. See help(type(self)) for accurate signature.

	Details
models	dataframe with 2 columns `model` and `var`

k_impVar = KalmanImputationVar(pd.DataFrame({'model': [model], 'var': 'TA'}))

pred = k_impVar(var= 'TA', item=item, dls=dls)

display_as_row({'pred': pred[45:55],'data': targ.data[45:55], 'mask': targ.mask[45:55]}, hide_idx=False)

pred

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-04 02:30:00	2.3467	0.0000	304.7310	0.9520	5.3900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:00:00	2.1803	0.0000	304.7310	0.9710	5.5900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:30:00	1.8728	0.0000	304.7310	0.9580	5.6700	96.2900	0.0000	43.0800	1.3100
2000-01-04 04:00:00	1.4769	0.0000	304.7310	0.9380	6.6100	96.2600	0.0000	43.0800	1.3100
2000-01-04 04:30:00	1.4710	0.0000	315.0050	0.7850	6.6500	96.2600	0.0000	43.0800	1.3100
2000-01-04 05:00:00	1.4303	0.0000	315.0050	0.5290	6.1200	96.2700	0.0000	43.0800	1.3000
2000-01-04 05:30:00	1.5573	0.0000	315.0050	0.4500	5.5800	96.2600	0.2900	43.1300	1.3000
2000-01-04 06:00:00	1.6616	0.0000	315.0050	0.4990	4.7000	96.2500	0.0000	43.1300	1.2900
2000-01-04 06:30:00	1.6438	0.0000	315.0050	0.3820	5.0700	96.2300	0.0000	43.1300	1.2900
2000-01-04 07:00:00	1.5697	0.0000	315.0050	0.3130	5.2300	96.2400	0.0000	43.1300	1.2900

data

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-04 02:30:00	2.1900	0.0000	304.7310	0.9520	5.3900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:00:00	2.2700	0.0000	304.7310	0.9710	5.5900	96.3000	0.0000	43.0800	1.3100
2000-01-04 03:30:00	2.3200	0.0000	304.7310	0.9580	5.6700	96.2900	0.0000	43.0800	1.3100
2000-01-04 04:00:00	2.3400	0.0000	304.7310	0.9380	6.6100	96.2600	0.0000	43.0800	1.3100
2000-01-04 04:30:00	2.2400	0.0000	315.0050	0.7850	6.6500	96.2600	0.0000	43.0800	1.3100
2000-01-04 05:00:00	2.0000	0.0000	315.0050	0.5290	6.1200	96.2700	0.0000	43.0800	1.3000
2000-01-04 05:30:00	1.9400	0.0000	315.0050	0.4500	5.5800	96.2600	0.2900	43.1300	1.3000
2000-01-04 06:00:00	2.0700	0.0000	315.0050	0.4990	4.7000	96.2500	0.0000	43.1300	1.2900
2000-01-04 06:30:00	2.0400	0.0000	315.0050	0.3820	5.0700	96.2300	0.0000	43.1300	1.2900
2000-01-04 07:00:00	2.0300	0.0000	315.0050	0.3130	5.2300	96.2400	0.0000	43.1300	1.2900

mask

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-04 02:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 03:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 03:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 04:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 04:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 05:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 05:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 06:00:00	False	True	True	True	True	True	True	True	True
2000-01-04 06:30:00	False	True	True	True	True	True	True	True	True
2000-01-04 07:00:00	False	True	True	True	True	True	True	True	True

MDS

Need to call R from python

r.print('here::here("R/REddyProc_tools.R")')

[1] "here::here(\"R/REddyProc_tools.R\")"

StrVector with 1 elements.

'here::here("R/REddyProc_tools.R")'

source

importr_install

 importr_install (pkg)

source

setupR

 setupR ()

source

R2pd

 R2pd (x)

source

pd2R

 pd2R (x)

Experiment

The time series needs to be at least 90 days long, we we add 45 days before and after the gap

There is a problem with conversions of timestamps between R and Python, so convert to string in Python and then back to datetime in R

source

add_buffer

 add_buffer (index, inner_index, n)

Adds a buffer of after and after index so length is at least

add_buffer(hai.index, hai.index[:100], 50)
add_buffer(hai.index, hai.index[-50:], 50);

source

item2REddy

 item2REddy (item, var, df)

Add context around item for supporting REddyProc

REddy_df = item2REddy(targ, 'TA', hai)

REddy_df_r = r.toR_timestamp(pd2R(REddy_df))

filled = R2pd(r.fill_gaps_EProc(REddy_df_r, "TA"))

R[write to console]: New sEddyProc class for site 'ID'

R[write to console]: Initialized variable 'TA' with 10 real gaps for gap filling.

R[write to console]: Limited MDS algorithm for gap filling of 'TA.gap_0' with LUT(SW_IN only) and MDC.

R[write to console]: Look up table with window size of 7 days with SW_IN

R[write to console]: 10

R[write to console]: Finished gap filling of 'TA' in 0 seconds. Artificial gaps filled: 4419, real gaps filled: 10, unfilled (long) gaps: 0.

filled

	TA_orig	TA_f	TA_fqc	TA_fall	TA_fall_qc	TA_fnum	TA_fsd	TA_fmeth	TA_fwin
1	-0.60	-0.60	0.0	-0.60	NaN	NaN	NaN	NaN	NaN
2	-0.65	-0.65	0.0	-0.65	NaN	NaN	NaN	NaN	NaN
3	-0.58	-0.58	0.0	-0.58	NaN	NaN	NaN	NaN	NaN
4	-0.51	-0.51	0.0	-0.51	NaN	NaN	NaN	NaN	NaN
5	-0.49	-0.49	0.0	-0.49	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...
4415	3.30	3.30	0.0	3.30	NaN	NaN	NaN	NaN	NaN
4416	3.10	3.10	0.0	3.10	NaN	NaN	NaN	NaN	NaN
4417	3.05	3.05	0.0	3.05	NaN	NaN	NaN	NaN	NaN
4418	3.05	3.05	0.0	3.05	NaN	NaN	NaN	NaN	NaN
4419	3.05	3.05	0.0	3.05	NaN	NaN	NaN	NaN	NaN

4419 rows × 9 columns

REddy_df.set_index("TIMESTAMP_END")

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS	gap
TIMESTAMP_END
2000-01-01 00:30:00	-0.60	0.0	302.475	0.222	2.05	96.63	0.0	43.00	1.44	0.0
2000-01-01 01:00:00	-0.65	0.0	302.475	0.122	2.53	96.58	0.0	43.00	1.43	0.0
2000-01-01 01:30:00	-0.58	0.0	301.677	0.090	3.15	96.56	0.0	43.00	1.43	0.0
2000-01-01 02:00:00	-0.51	0.0	301.677	0.110	3.12	96.56	0.0	43.00	1.45	0.0
2000-01-01 02:30:00	-0.49	0.0	301.677	0.102	3.04	96.57	0.0	43.00	1.44	0.0
...	...	...	...	...	...	...	...	...	...	...
2000-04-01 23:30:00	3.30	0.0	283.856	1.092	2.50	95.06	0.0	44.96	4.32	0.0
2000-04-02 00:00:00	3.10	0.0	283.856	1.013	2.09	95.04	0.0	44.96	4.22	0.0
2000-04-02 00:30:00	3.05	0.0	283.856	0.964	2.42	95.04	0.0	44.96	4.13	0.0
2000-04-02 01:00:00	3.05	0.0	283.856	0.933	3.04	95.04	0.0	44.96	4.04	0.0
2000-04-02 01:30:00	3.05	0.0	251.387	0.930	2.96	95.03	0.0	44.95	3.94	0.0

4419 rows × 10 columns

filled = filled.set_index(pd.to_datetime(REddy_df.TIMESTAMP_END))

filled_item = filled.loc[targ.data.index]

pred = targ.data.copy()
pred.loc[~targ.mask["TA"], "TA"] = filled_item["TA_f"][~targ.mask["TA"]]

pred[45:55], targ.data[45:55]

(                           TA  SW_IN    LW_IN    VPD    WS     PA     P  \
 time                                                                      
 2000-01-04 02:30:00  1.853612    0.0  304.731  0.952  5.39  96.30  0.00   
 2000-01-04 03:00:00  1.837083    0.0  304.731  0.971  5.59  96.30  0.00   
 2000-01-04 03:30:00  1.823985    0.0  304.731  0.958  5.67  96.29  0.00   
 2000-01-04 04:00:00  1.809805    0.0  304.731  0.938  6.61  96.26  0.00   
 2000-01-04 04:30:00  1.794282    0.0  315.005  0.785  6.65  96.26  0.00   
 2000-01-04 05:00:00  1.783447    0.0  315.005  0.529  6.12  96.27  0.00   
 2000-01-04 05:30:00  1.771525    0.0  315.005  0.450  5.58  96.26  0.29   
 2000-01-04 06:00:00  1.760362    0.0  315.005  0.499  4.70  96.25  0.00   
 2000-01-04 06:30:00  1.748410    0.0  315.005  0.382  5.07  96.23  0.00   
 2000-01-04 07:00:00  1.737933    0.0  315.005  0.313  5.23  96.24  0.00   
 
                        SWC    TS  
 time                              
 2000-01-04 02:30:00  43.08  1.31  
 2000-01-04 03:00:00  43.08  1.31  
 2000-01-04 03:30:00  43.08  1.31  
 2000-01-04 04:00:00  43.08  1.31  
 2000-01-04 04:30:00  43.08  1.31  
 2000-01-04 05:00:00  43.08  1.30  
 2000-01-04 05:30:00  43.13  1.30  
 2000-01-04 06:00:00  43.13  1.29  
 2000-01-04 06:30:00  43.13  1.29  
 2000-01-04 07:00:00  43.13  1.29  ,
                        TA  SW_IN    LW_IN    VPD    WS     PA     P    SWC  \
 time                                                                         
 2000-01-04 02:30:00  2.19    0.0  304.731  0.952  5.39  96.30  0.00  43.08   
 2000-01-04 03:00:00  2.27    0.0  304.731  0.971  5.59  96.30  0.00  43.08   
 2000-01-04 03:30:00  2.32    0.0  304.731  0.958  5.67  96.29  0.00  43.08   
 2000-01-04 04:00:00  2.34    0.0  304.731  0.938  6.61  96.26  0.00  43.08   
 2000-01-04 04:30:00  2.24    0.0  315.005  0.785  6.65  96.26  0.00  43.08   
 2000-01-04 05:00:00  2.00    0.0  315.005  0.529  6.12  96.27  0.00  43.08   
 2000-01-04 05:30:00  1.94    0.0  315.005  0.450  5.58  96.26  0.29  43.13   
 2000-01-04 06:00:00  2.07    0.0  315.005  0.499  4.70  96.25  0.00  43.13   
 2000-01-04 06:30:00  2.04    0.0  315.005  0.382  5.07  96.23  0.00  43.13   
 2000-01-04 07:00:00  2.03    0.0  315.005  0.313  5.23  96.24  0.00  43.13   
 
                        TS  
 time                       
 2000-01-04 02:30:00  1.31  
 2000-01-04 03:00:00  1.31  
 2000-01-04 03:30:00  1.31  
 2000-01-04 04:00:00  1.31  
 2000-01-04 04:30:00  1.31  
 2000-01-04 05:00:00  1.30  
 2000-01-04 05:30:00  1.30  
 2000-01-04 06:00:00  1.29  
 2000-01-04 06:30:00  1.29  
 2000-01-04 07:00:00  1.29  )

source

gap_fill_item

 gap_fill_item (item, REddy_df, var, filled)

gap_fill_item(targ, REddy_df, "TA", filled)

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-03 04:00:00	0.81	0.0	304.148	0.000	3.53	97.00	0.0	43.09	1.37
2000-01-03 04:30:00	0.95	0.0	304.382	0.000	3.99	96.98	0.0	43.09	1.37
2000-01-03 05:00:00	1.09	0.0	304.382	0.000	3.61	96.96	0.0	43.09	1.37
2000-01-03 05:30:00	1.18	0.0	304.382	0.009	3.90	96.93	0.0	43.09	1.37
2000-01-03 06:00:00	1.35	0.0	304.382	0.061	4.17	96.91	0.0	43.09	1.38
...	...	...	...	...	...	...	...	...	...
2000-01-05 03:30:00	4.62	0.0	330.202	1.162	6.53	95.91	0.0	44.12	1.82
2000-01-05 04:00:00	4.51	0.0	330.202	1.636	5.76	95.94	0.0	44.12	1.85
2000-01-05 04:30:00	4.11	0.0	299.320	1.746	5.79	95.98	0.0	44.12	1.85
2000-01-05 05:00:00	3.77	0.0	299.320	2.065	6.00	96.03	0.0	44.12	1.83
2000-01-05 05:30:00	3.36	0.0	299.320	1.911	4.76	96.10	0.0	44.11	1.81

100 rows × 9 columns

MDSImputation

source

MDSImputation

 MDSImputation (var, df)

Initialize self. See help(type(self)) for accurate signature.

mds_imp = MDSImputation('TA', hai)

mds_imp(targ)

	TA	SW_IN	LW_IN	VPD	WS	PA	P	SWC	TS
time
2000-01-03 04:00:00	0.81	0.0	304.148	0.000	3.53	97.00	0.0	43.09	1.37
2000-01-03 04:30:00	0.95	0.0	304.382	0.000	3.99	96.98	0.0	43.09	1.37
2000-01-03 05:00:00	1.09	0.0	304.382	0.000	3.61	96.96	0.0	43.09	1.37
2000-01-03 05:30:00	1.18	0.0	304.382	0.009	3.90	96.93	0.0	43.09	1.37
2000-01-03 06:00:00	1.35	0.0	304.382	0.061	4.17	96.91	0.0	43.09	1.38
...	...	...	...	...	...	...	...	...	...
2000-01-05 03:30:00	4.62	0.0	330.202	1.162	6.53	95.91	0.0	44.12	1.82
2000-01-05 04:00:00	4.51	0.0	330.202	1.636	5.76	95.94	0.0	44.12	1.85
2000-01-05 04:30:00	4.11	0.0	299.320	1.746	5.79	95.98	0.0	44.12	1.85
2000-01-05 05:00:00	3.77	0.0	299.320	2.065	6.00	96.03	0.0	44.12	1.83
2000-01-05 05:30:00	3.36	0.0	299.320	1.911	4.76	96.10	0.0	44.11	1.81

100 rows × 9 columns

ERA Imputation

source

ERAImputation

 ERAImputation ()

Initialize self. See help(type(self)) for accurate signature.

targ.data.columns

Index(['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS', 'PA', 'P', 'SWC', 'TS'], dtype='object')

era_imp = ERAImputation()

era_imp(targ)

	TA	SW_IN	VPD	PA	P	WS	LW_IN	SWC	TS
time
2000-01-03 04:00:00	1.702	0.0	0.693	97.016	0.000	2.948	304.148	NaN	NaN
2000-01-03 04:30:00	1.762	0.0	0.691	97.002	0.026	2.989	304.382	NaN	NaN
2000-01-03 05:00:00	1.736	0.0	0.697	96.988	0.000	3.015	304.382	NaN	NaN
2000-01-03 05:30:00	1.711	0.0	0.703	96.974	0.000	3.041	304.382	NaN	NaN
2000-01-03 06:00:00	1.686	0.0	0.709	96.960	0.000	3.066	304.382	NaN	NaN
...	...	...	...	...	...	...	...	...	...
2000-01-05 03:30:00	4.945	0.0	0.774	95.910	0.252	6.107	330.202	NaN	NaN
2000-01-05 04:00:00	4.789	0.0	0.758	95.926	0.252	6.189	330.202	NaN	NaN
2000-01-05 04:30:00	4.632	0.0	0.741	95.942	0.044	6.270	299.320	NaN	NaN
2000-01-05 05:00:00	4.251	0.0	0.766	95.988	0.000	5.923	299.320	NaN	NaN
2000-01-05 05:30:00	3.869	0.0	0.790	96.035	0.000	5.576	299.320	NaN	NaN

100 rows × 9 columns

Metrics

source

MaskedMetric

 MaskedMetric (metric)

Initialize self. See help(type(self)) for accurate signature.

source

rmse

 rmse (y_true, y_pred)

source

NormalizedMetric

 NormalizedMetric (metric:__main__.MaskedMetric, mean, std)

Initialize self. See help(type(self)) for accurate signature.

source

normalize

 normalize (x, mean, std)

rmse_mask(targ, pred)

array([0.3698457])

Validation Imputations

checks that the values of the imputation makes sense

this is the average error for ERA

np.sqrt((hai['TA'] - hai_era.loc[hai.index]['TA_ERA']).pow(2).mean())

1.8708585984196093

rmse_mask(targ, ERAImputation()(targ))

array([0.29919709])

err_era = []
for _ in range(100):
    items = random.choices(dls.items, k = 1)
    targ = orig_target(dls, items)[0]
    err_era.append(rmse_mask(targ, ERAImputation()(targ)))

np.array(err_era).mean()

1.3839633283333717

Comparison

Take a variable to be filled, makes an artificial gap of given len, tries to fill with 3 methods and return metrics for each of them, repeat n_rep times

prep visualization

source

format_gap_len

 format_gap_len (gap_len:int)

Nice formatting for gap lengths

	Type	Details
gap_len	int	gap length in num observations (30 mins)

test_eq(format_gap_len(10), "5 h")
test_eq(format_gap_len(48), "1 day (24 h)")
test_eq(format_gap_len(90), "2 days (45 h)")
test_eq(format_gap_len(336), "1 week (168 h)")

source

prep_df

 prep_df (df)

Aggregation

hai.std(axis=0)['TA']

7.924610764367695

get_stats(hai)

(tensor([8.3339e+00, 1.2096e+02, 3.1150e+02, 3.3807e+00, 3.1800e+00, 9.5962e+01,
         4.3427e-02, 3.4843e+01, 7.9349e+00], dtype=torch.float64),
 tensor([  7.9246, 204.0026,  41.9557,   4.3684,   1.6254,   0.8552,   0.2803,
           8.9131,   5.6586], dtype=torch.float64))

\[ \text{RMSE}_\text{stand} = \frac{1}{\sigma}\text{RMSE}\]

source

RMSEAgg

 RMSEAgg (df)

Aggregate to rmse and normalized rmse

source

timeseriesAgg

 timeseriesAgg (targ, pred, *args)

Imp Methods

source

ImpComparison

 ImpComparison (models:pandas.core.frame.DataFrame, df, control,
                block_len, rmse=True, time_series=False)

Initialize self. See help(type(self)) for accurate signature.

source

l_model

 l_model (x, base_path=Path('/home/runner/work/meteo_imp/meteo_imp/analysi
          s/results/trained_models'))

models_var = pd.DataFrame.from_records([
    {'var': 'TA',    'model': l_model("TA_specialized_gap_6-336_v3_0.pickle",base_path)},
    {'var': 'SW_IN', 'model': l_model("SW_IN_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'LW_IN', 'model': l_model("LW_IN_specialized_gap_6-336_v1.pickle",base_path)},
    {'var': 'VPD',   'model': l_model("VPD_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'WS',    'model': l_model("WS_specialized_gap_6-336_v1.pickle",base_path)},
    {'var': 'PA',    'model': l_model("PA_specialized_gap_6-336_v3_0.pickle",base_path)},
    {'var': 'P',     'model': l_model("1_gap_varying_6-336_v3.pickle",base_path)},
    {'var': 'TS',    'model': l_model("TS_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'SWC',   'model': l_model("SWC_specialized_gap_6-336_v2_1.pickle",base_path)},
])

comp = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446)

data_results = comp.compare(gap_len = [12, 24, 48, 336], var=["TA", "SW_IN"], n_rep=3)

data_results.head()

	method	var	gap_len	idx_rep	rmse	rmse_stand	gap_len_f
0	Kalman Filter	TA	6	0	0.310825	0.039223	6 h
3	Kalman Filter	TA	6	1	0.529280	0.066789	6 h
6	Kalman Filter	TA	6	2	0.330725	0.041734	6 h
1	ERA-I	TA	6	0	1.329099	0.167718	6 h
4	ERA-I	TA	6	1	1.468124	0.185261	6 h

data_results.method.unique()

['Kalman Filter', 'ERA-I', 'MDS']
Categories (3, object): ['Kalman Filter' < 'ERA-I' < 'MDS']

data_results.columns

Index(['method', 'var', 'gap_len', 'idx_rep', 'rmse', 'rmse_stand',
       'gap_len_f'],
      dtype='object')

Kalman Filters

source

KalmanImpComparison

 KalmanImpComparison (models:pandas.core.frame.DataFrame,
                      df:pandas.core.frame.DataFrame,
                      control:pandas.core.frame.DataFrame, block_len:int,
                      rmse:bool=True, time_series:bool=False)

Compare different Kalman filters

	Type	Default	Details
models	DataFrame		model, gap_single_var,
df	DataFrame
control	DataFrame
block_len	int
rmse	bool	True
time_series	bool	False

models_nc = pd.DataFrame({'model': [ l_model("1_gap_varying_336_no_control_v1.pickle"), l_model("1_gap_varying_6-336_v3.pickle")],
                          'type':   [ 'No Control',                                       'Use Control'                         ]})

models_nc

	model	type
0	Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14	No Control
1	Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14	Use Control

kcomp = KalmanImpComparison(models_nc, hai, hai_era, 100)

k_results = kcomp.compare(n_rep =3, gap_len = [6, 12, 24], var = list(hai.columns))

k_results

	var	loss	likelihood	gap_len	idx_rep	type	rmse	rmse_stand	gap_len_f
0	TA	-3.063759	1.762383	3	0	No Control	0.966638	0.121979	3 h
1	TA	-3.956292	2.204462	3	1	No Control	0.354328	0.044712	3 h
2	TA	-3.164076	1.947664	3	2	No Control	0.891858	0.112543	3 h
3	TA	-5.133151	2.662633	3	0	Use Control	0.444229	0.056057	3 h
4	TA	-5.387288	2.626224	3	1	Use Control	0.209779	0.026472	3 h
...	...	...	...	...	...	...	...	...	...
1	TS	-1.180353	1.672593	12	1	No Control	0.677688	0.119762	12 h
2	TS	0.954142	1.668745	12	2	No Control	1.172264	0.207163	12 h
3	TS	1.036145	1.528941	12	0	Use Control	0.994793	0.175801	12 h
4	TS	1.770240	1.534406	12	1	Use Control	1.173229	0.207334	12 h
5	TS	3.380553	1.458859	12	2	Use Control	1.446071	0.255551	12 h

162 rows × 9 columns

Plotting

Utils

Faceting

source

facet_wrap

 facet_wrap (data:pandas.core.frame.DataFrame, plot_fn, col:str,
             y_labels:list[str]|None=None, n_cols=3,
             y_resolve='independent')

	Type	Default	Details
data	DataFrame		full dataset
plot_fn			function that makes the plot, takes 2 arguments: data and y_label
col	str		column to facet
y_labels	list[str] \| None	None	custom labels y axis
n_cols	int	3
y_resolve	str	independent

source

facet_grid

 facet_grid (data:pandas.core.frame.DataFrame, plot_fn, col:str, row:str,
             y_labels:list[str]|None=None)

	Type	Default	Details
data	DataFrame		full dataset
plot_fn			function that makes the plot, takes 2 arguments: data and y_label
col	str		column to facet,
row	str
y_labels	list[str] \| None	None	custom labels y axis

from itertools import product

test_data = pd.DataFrame(list(product(['0','1'], ['a', 'b'])), columns = ['row', 'col'])
test_data['text'] = test_data.row + test_data.col

def test_plot(data, *args): return alt.Chart(data).mark_text().encode(text='text')

facet_wrap(test_data, test_plot, col = 'row')

facet_grid(test_data, test_plot, col = 'col', row='row')

Format

source

PlotFormatter

 PlotFormatter (font_size:int=18, legend_font_size:int=18,
                title_font_size:int=20)

Format altair plot by setting font sizes/legend position

pipe’]

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

The Plot

def remove_outliers(data, var, groupby):
    """Simple but maybe computationally inefficent to remove outliers from boxplot """

def custom_box_plot(data, x, y, color, xoffset):
    """Boxplot for altair without outliers"""
    bar = alt.Chart(data).mark_bar().encode(
            x=x,
            y='q1:Q',
            y2='q2:Q',
            color=color,
            xoffset=xoffset
        ).transform_aggregate(
            q1=f'q1({y})',
            q3=f'q1({y})',
            groupby=[color, xoffset]
    )
    
    return bar

# custom_box_plot(data_results, 'gap_len', 'rmse', 'method', 'method')

source

the_plot

 the_plot (data)

the_plot(data_results)

source

the_plot_stand

 the_plot_stand (data)

the_plot_stand(data_results)

source

the_plot_stand2

 the_plot_stand2 (data)

dict(zip(scale_meteo.domain, scale_meteo.range))

{'TA': '#1B9E77',
 'SW_IN': '#D95F02',
 'LW_IN': '#7570B3',
 'VPD': '#E7298A',
 'WS': '#66A61E',
 'PA': '#E6AB02',
 'SWC': '#A6761D',
 'TS': '#666666'}

the_plot_stand2(data_results)

alt.Color('color')

Color({
  shorthand: 'color'
})

x = alt.X('gap_len'); color = alt.Color('method'); xOffset = alt.XOffset('method'); y = alt.Y('rmse_stand')

data_results.groupby([x.shorthand, color.shorthand, xOffset.shorthand]).agg({y.shorthand: ['median', lambda x: x.quantile(.25), lambda x: x.quantile(.75)]}).columns

MultiIndex([('rmse_stand',     'median'),
            ('rmse_stand', '<lambda_0>'),
            ('rmse_stand', '<lambda_1>')],
           )

data_results

	method	var	gap_len	idx_rep	rmse	rmse_stand	gap_len_f
0	Kalman Filter	TA	6	0	0.310825	0.039223	6 h
3	Kalman Filter	TA	6	1	0.529280	0.066789	6 h
6	Kalman Filter	TA	6	2	0.330725	0.041734	6 h
1	ERA-I	TA	6	0	1.329099	0.167718	6 h
4	ERA-I	TA	6	1	1.468124	0.185261	6 h
...	...	...	...	...	...	...	...
4	ERA-I	SW_IN	168	1	24.792212	0.121529	1 week (168 h)
7	ERA-I	SW_IN	168	2	108.398313	0.531358	1 week (168 h)
2	MDS	SW_IN	168	0	134.669370	0.660136	1 week (168 h)
5	MDS	SW_IN	168	1	31.500833	0.154414	1 week (168 h)
8	MDS	SW_IN	168	2	149.125227	0.730997	1 week (168 h)

72 rows × 7 columns

data_results.groupby(['gap_len', 'method', 'var']).agg(
        median = pd.NamedAgg(y.shorthand, 'median'),
        q1 = pd.NamedAgg(y.shorthand,lambda x: x.quantile(.25)),
        q3 = pd.NamedAgg(y.shorthand,lambda x: x.quantile(.75))).reset_index()

	gap_len	method	var	median	q1	q3
0	6	Kalman Filter	TA	0.041734	0.040478	0.054262
1	6	Kalman Filter	SW_IN	0.123723	0.105794	0.215405
2	6	Kalman Filter	LW_IN	NaN	NaN	NaN
3	6	Kalman Filter	VPD	NaN	NaN	NaN
4	6	Kalman Filter	WS	NaN	NaN	NaN
...	...	...	...	...	...	...
103	168	MDS	WS	NaN	NaN	NaN
104	168	MDS	PA	NaN	NaN	NaN
105	168	MDS	P	NaN	NaN	NaN
106	168	MDS	SWC	NaN	NaN	NaN
107	168	MDS	TS	NaN	NaN	NaN

108 rows × 6 columns

source

custom_boxplot_nooutlier

 custom_boxplot_nooutlier (data:pandas.core.frame.DataFrame,
                           x:altair.vegalite.v5.schema.channels.X,
                           y:altair.vegalite.v5.schema.channels.Y,
                           color:altair.vegalite.v5.schema.channels.Color,
                           xOffset:altair.vegalite.v5.schema.channels.XOff
                           set)

source

the_plot_stand3

 the_plot_stand3 (data)

the_plot_stand3(data_results)

violin plots don’t work

def _the_plot_violin(data, y_label):
        return alt.Chart(data).mark_area(orient='horizontal').encode(
        x = alt.X('density:Q', title='gap_len [h]', axis=alt.Axis(labelAngle=0)),
        y = alt.Y('rmse', title=y_label, scale=alt.Scale(domainMin=-.2 *data['rmse'].mean())),
        color=alt.Color('method:N', scale=alt.Scale(domain=["Kalman Filter", "ERA-I", "MDS"], scheme='dark2')),
        xOffset=alt.XOffset('method', scale=alt.Scale(domain=["Kalman Filter", "ERA-I", "MDS"])),
        column=alt.Column('gap_len')
    ).transform_density(
    'rmse',
    as_=['rmse', 'density'],
    groupby=['method', 'gap_len']
).properties(width=250, height=200)

facet_wrap(data_results, _the_plot_violin, "var", y_labels = [f"RMSE {var} [{units_big[var]}]" for var in data_results['var'].unique()])

Gap Length

gap_len = KalmanImpComparison(models_var, hai, hai_era, block_len=100).compare(gap_len = [2,10,30], var=['TA', 'SW_IN', 'VPD'], n_rep=3)

gap_len.gap_len_f

0     1 h
1     1 h
2     1 h
0     5 h
1     5 h
2     5 h
0    15 h
1    15 h
2    15 h
0     1 h
1     1 h
2     1 h
0     5 h
1     5 h
2     5 h
0    15 h
1    15 h
2    15 h
0     1 h
1     1 h
2     1 h
0     5 h
1     5 h
2     5 h
0    15 h
1    15 h
2    15 h
Name: gap_len_f, dtype: category
Categories (3, object): ['1 h' < '5 h' < '15 h']

source

agg_gap_len

 agg_gap_len (data)

gap_len_agg = agg_gap_len(gap_len)

gap_len_agg

	var	gap_len	median	Q1	Q3
0	TA	1	0.139298	0.117130	0.212692
1	TA	5	0.415087	0.307310	0.419030
2	TA	15	0.339634	0.332259	0.668379
3	SW_IN	1	10.995754	7.385527	15.342193
4	SW_IN	5	41.343274	29.328629	46.677302
5	SW_IN	15	22.880862	20.480137	29.897607
6	LW_IN	1	NaN	NaN	NaN
7	LW_IN	5	NaN	NaN	NaN
8	LW_IN	15	NaN	NaN	NaN
9	VPD	1	0.151389	0.142296	0.161319
10	VPD	5	0.147326	0.080502	0.581390
11	VPD	15	0.544413	0.367209	1.145505
12	WS	1	NaN	NaN	NaN
13	WS	5	NaN	NaN	NaN
14	WS	15	NaN	NaN	NaN
15	PA	1	NaN	NaN	NaN
16	PA	5	NaN	NaN	NaN
17	PA	15	NaN	NaN	NaN
18	P	1	NaN	NaN	NaN
19	P	5	NaN	NaN	NaN
20	P	15	NaN	NaN	NaN
21	SWC	1	NaN	NaN	NaN
22	SWC	5	NaN	NaN	NaN
23	SWC	15	NaN	NaN	NaN
24	TS	1	NaN	NaN	NaN
25	TS	5	NaN	NaN	NaN
26	TS	15	NaN	NaN	NaN

rmse_df = _get_era_rmse(hai, hai_era)

rmse_df

	var	era_rmse
0	TA	1.870859
1	SW_IN	76.320335
2	VPD	2.021377
3	PA	0.104781
4	P	0.293058
5	WS	1.207949
6	LW_IN	20.204070

gap_len_agg

	var	gap_len	median	Q1	Q3
0	TA	1	0.139298	0.117130	0.212692
1	TA	5	0.415087	0.307310	0.419030
2	TA	15	0.339634	0.332259	0.668379
3	SW_IN	1	10.995754	7.385527	15.342193
4	SW_IN	5	41.343274	29.328629	46.677302
5	SW_IN	15	22.880862	20.480137	29.897607
6	LW_IN	1	NaN	NaN	NaN
7	LW_IN	5	NaN	NaN	NaN
8	LW_IN	15	NaN	NaN	NaN
9	VPD	1	0.151389	0.142296	0.161319
10	VPD	5	0.147326	0.080502	0.581390
11	VPD	15	0.544413	0.367209	1.145505
12	WS	1	NaN	NaN	NaN
13	WS	5	NaN	NaN	NaN
14	WS	15	NaN	NaN	NaN
15	PA	1	NaN	NaN	NaN
16	PA	5	NaN	NaN	NaN
17	PA	15	NaN	NaN	NaN
18	P	1	NaN	NaN	NaN
19	P	5	NaN	NaN	NaN
20	P	15	NaN	NaN	NaN
21	SWC	1	NaN	NaN	NaN
22	SWC	5	NaN	NaN	NaN
23	SWC	15	NaN	NaN	NaN
24	TS	1	NaN	NaN	NaN
25	TS	5	NaN	NaN	NaN
26	TS	15	NaN	NaN	NaN

gap_len_agg = pd.merge(gap_len_agg, rmse_df, on='var')

_plot_gap_len(gap_len_agg[gap_len_agg['var'] == 'TA'], "label")

facet_wrap(gap_len_agg, _plot_gap_len, 'var')

source

plot_gap_len

 plot_gap_len (data, df, control)

plot_gap_len(gap_len, hai, hai_era)

Compare

plotting for comparing performance of 2 or more different conditions

source

plot_compare

 plot_compare (data:pandas.core.frame.DataFrame, compare:str,
               y:str='rmse_stand', scale_domain:Optional[Sequence]=None,
               y_labels:Optional[Sequence]=None)

plot_compare(k_results, compare="type")

can also show different variables on the y, like the loss of the likelihood

plot_compare(k_results, compare="type", y="loss")

Timeseries

comp_ts = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, rmse=False, time_series = True)

res_ts = comp_ts.compare(gap_len = [12,24,336], var=["TA", 'SW_IN'], n_rep=1)

res_ts.columns

Index(['method', 'var', 'gap_len', 'idx_rep', 'pred', 'targ', 'gap_len_f'], dtype='object')

test_ts = res_ts.query('method == "Kalman Filter" and var == "TA"')

row_res = test_ts.iloc[0]

pred_all = row_res.pred[0]
targ =  row_res.targ[0]
var = row_res['var']

type(pred_all)

meteo_imp.kalman.training.NormalsDf

targ.mask[var]

time
2000-09-21 20:30:00    True
2000-09-21 21:00:00    True
2000-09-21 21:30:00    True
2000-09-21 22:00:00    True
2000-09-21 22:30:00    True
                       ... 
2000-10-01 01:00:00    True
2000-10-01 01:30:00    True
2000-10-01 02:00:00    True
2000-10-01 02:30:00    True
2000-10-01 03:00:00    True
Name: TA, Length: 446, dtype: bool

list(row_res.index)

['method', 'var', 'gap_len', 'idx_rep', 'pred', 'targ', 'gap_len_f']

np.argmin(row_res.targ[0].mask['TA'])

row_res.targ[0].mask['TA'].index[47]

Timestamp('2000-09-22 20:00:00')

source

unnest_predictions

 unnest_predictions (row_res:pandas.core.series.Series, ctx_len:int=50)

Unnest predictions/target for each gap to plot timeseries

unnest_predictions(row_res)

	time	mean	std	measurement	is_present	method	var	gap_len	gap_len_f
0	2000-09-25 23:30:00	NaN	NaN	10.32	True	Kalman Filter	TA	6	6 h
1	2000-09-26 00:00:00	NaN	NaN	10.57	True	Kalman Filter	TA	6	6 h
2	2000-09-26 00:30:00	NaN	NaN	10.96	True	Kalman Filter	TA	6	6 h
3	2000-09-26 01:00:00	NaN	NaN	10.85	True	Kalman Filter	TA	6	6 h
4	2000-09-26 01:30:00	NaN	NaN	10.72	True	Kalman Filter	TA	6	6 h
5	2000-09-26 02:00:00	NaN	NaN	10.62	True	Kalman Filter	TA	6	6 h
6	2000-09-26 02:30:00	NaN	NaN	10.44	True	Kalman Filter	TA	6	6 h
7	2000-09-26 03:00:00	NaN	NaN	11.07	True	Kalman Filter	TA	6	6 h
8	2000-09-26 03:30:00	NaN	NaN	11.92	True	Kalman Filter	TA	6	6 h
9	2000-09-26 04:00:00	NaN	NaN	11.56	True	Kalman Filter	TA	6	6 h
10	2000-09-26 04:30:00	NaN	NaN	11.05	True	Kalman Filter	TA	6	6 h
11	2000-09-26 05:00:00	NaN	NaN	10.55	True	Kalman Filter	TA	6	6 h
12	2000-09-26 05:30:00	NaN	NaN	10.90	True	Kalman Filter	TA	6	6 h
13	2000-09-26 06:00:00	NaN	NaN	10.54	True	Kalman Filter	TA	6	6 h
14	2000-09-26 06:30:00	NaN	NaN	9.09	True	Kalman Filter	TA	6	6 h
15	2000-09-26 07:00:00	NaN	NaN	8.95	True	Kalman Filter	TA	6	6 h
16	2000-09-26 07:30:00	NaN	NaN	9.12	True	Kalman Filter	TA	6	6 h
17	2000-09-26 08:00:00	NaN	NaN	9.39	True	Kalman Filter	TA	6	6 h
18	2000-09-26 08:30:00	NaN	NaN	9.94	True	Kalman Filter	TA	6	6 h
19	2000-09-26 09:00:00	10.423968	0.756711	10.73	False	Kalman Filter	TA	6	6 h
20	2000-09-26 09:30:00	10.546502	0.819027	10.56	False	Kalman Filter	TA	6	6 h
21	2000-09-26 10:00:00	10.915743	0.869077	10.80	False	Kalman Filter	TA	6	6 h
22	2000-09-26 10:30:00	11.531336	0.909095	11.35	False	Kalman Filter	TA	6	6 h
23	2000-09-26 11:00:00	12.373439	0.937222	12.23	False	Kalman Filter	TA	6	6 h
24	2000-09-26 11:30:00	13.597164	0.951751	13.83	False	Kalman Filter	TA	6	6 h
25	2000-09-26 12:00:00	14.629994	0.951881	14.81	False	Kalman Filter	TA	6	6 h
26	2000-09-26 12:30:00	15.043486	0.937641	14.92	False	Kalman Filter	TA	6	6 h
27	2000-09-26 13:00:00	15.080051	0.909788	15.01	False	Kalman Filter	TA	6	6 h
28	2000-09-26 13:30:00	15.326227	0.869610	15.40	False	Kalman Filter	TA	6	6 h
29	2000-09-26 14:00:00	15.859726	0.817723	15.92	False	Kalman Filter	TA	6	6 h
30	2000-09-26 14:30:00	16.001325	0.750160	15.90	False	Kalman Filter	TA	6	6 h
31	2000-09-26 15:00:00	NaN	NaN	16.40	True	Kalman Filter	TA	6	6 h
32	2000-09-26 15:30:00	NaN	NaN	16.75	True	Kalman Filter	TA	6	6 h
33	2000-09-26 16:00:00	NaN	NaN	16.21	True	Kalman Filter	TA	6	6 h
34	2000-09-26 16:30:00	NaN	NaN	14.87	True	Kalman Filter	TA	6	6 h
35	2000-09-26 17:00:00	NaN	NaN	14.37	True	Kalman Filter	TA	6	6 h
36	2000-09-26 17:30:00	NaN	NaN	13.75	True	Kalman Filter	TA	6	6 h
37	2000-09-26 18:00:00	NaN	NaN	13.44	True	Kalman Filter	TA	6	6 h
38	2000-09-26 18:30:00	NaN	NaN	13.13	True	Kalman Filter	TA	6	6 h
39	2000-09-26 19:00:00	NaN	NaN	13.09	True	Kalman Filter	TA	6	6 h
40	2000-09-26 19:30:00	NaN	NaN	13.06	True	Kalman Filter	TA	6	6 h
41	2000-09-26 20:00:00	NaN	NaN	13.15	True	Kalman Filter	TA	6	6 h
42	2000-09-26 20:30:00	NaN	NaN	13.39	True	Kalman Filter	TA	6	6 h
43	2000-09-26 21:00:00	NaN	NaN	13.47	True	Kalman Filter	TA	6	6 h
44	2000-09-26 21:30:00	NaN	NaN	13.48	True	Kalman Filter	TA	6	6 h
45	2000-09-26 22:00:00	NaN	NaN	13.28	True	Kalman Filter	TA	6	6 h
46	2000-09-26 22:30:00	NaN	NaN	13.48	True	Kalman Filter	TA	6	6 h
47	2000-09-26 23:00:00	NaN	NaN	13.49	True	Kalman Filter	TA	6	6 h
48	2000-09-26 23:30:00	NaN	NaN	13.60	True	Kalman Filter	TA	6	6 h
49	2000-09-27 00:00:00	NaN	NaN	13.77	True	Kalman Filter	TA	6	6 h
50	2000-09-27 00:30:00	NaN	NaN	14.04	True	Kalman Filter	TA	6	6 h

res_ts_plot = pd.concat([unnest_predictions(row) for _,row in res_ts.iterrows()])

plot_points(res_ts_plot.query('var == "TA" and method == "Kalman Filter" and idx_rep == 0 and gap_len==6.'), y= "measurement")

plot_missing_area(res_ts_plot.query('var == "TA" and method == "Kalman Filter" and idx_rep == 0 and gap_len==6.'))

plot_line(res_ts_plot.query('var == "TA" and idx_rep == 0 and gap_len==6.'), y= "mean", color='method', scale=alt.Scale())

plot_error(res_ts_plot.query('var == "TA" and idx_rep == 0 and gap_len==6.').copy(), y= "mean", color='method', scale=alt.Scale())

_plot_timeseries(res_ts_plot.query('var == "TA" and idx_rep == 0 and gap_len==6.').copy(), "TA")

_plot_timeseries(res_ts_plot.query('var == "SW_IN" and idx_rep == 0 and gap_len==12.').copy(), "SW_IN")

data_ts = pd.concat([unnest_predictions(row) for _,row in res_ts.iterrows()])

source

plot_timeseries

 plot_timeseries (data, idx_rep:int|None=None, gap_len:int|None=None,
                  max_idx:int=3, ctx_len={6.0: 50, 12.0: 50, 168.0: 384},
                  scale_color=Scale({   domain: ['Kalman Filter', 'ERA-I',
                  'MDS'],   scheme: 'dark2' }), compare='method')

plot_timeseries(res_ts, idx_rep='random')

Scatter plot

_plot_scatter(data_ts.query('var == "TA" and gap_len==12. and idx_rep ==0').copy(), x="measurement", y= "mean", color='method', scale=alt.Scale())

Table

The Table

A table where in the rows there is

t = data_results.groupby(['method', 'var', 'gap_len']).agg({'rmse': ['mean', 'std']}).unstack(level=0)

t_idx = t.columns.droplevel()
t_idx.names = ['RMSE', None]

t2 = t.copy()
t2.columns = t_idx

t2 = t2.sort_index(axis=1, level=1).swaplevel(axis=1)
t2

		Kalman Filter		ERA-I		MDS
	RMSE	mean	std	mean	std	mean	std
var	gap_len
TA	6	0.390277	0.120791	1.432389	0.090855	2.153796	0.638164
	12	0.971174	0.714823	1.421383	0.645727	3.069420	1.685032
	24	0.543269	0.045233	1.618961	0.708207	2.585667	1.306159
	168	0.818625	0.205868	1.691291	0.321707	3.811197	0.459708
SW_IN	6	35.270434	23.989000	35.687127	61.811917	40.526401	67.768649
	12	65.749867	58.475011	60.069077	98.779015	64.785358	86.461266
	24	44.446448	19.486994	55.218478	41.980449	72.847162	75.033947
	168	61.289860	37.403341	74.600240	44.042412	105.098477	64.145949
LW_IN	6	NaN	NaN	NaN	NaN	NaN	NaN
	12	NaN	NaN	NaN	NaN	NaN	NaN
	24	NaN	NaN	NaN	NaN	NaN	NaN
	168	NaN	NaN	NaN	NaN	NaN	NaN
VPD	6	NaN	NaN	NaN	NaN	NaN	NaN
	12	NaN	NaN	NaN	NaN	NaN	NaN
	24	NaN	NaN	NaN	NaN	NaN	NaN
	168	NaN	NaN	NaN	NaN	NaN	NaN
WS	6	NaN	NaN	NaN	NaN	NaN	NaN
	12	NaN	NaN	NaN	NaN	NaN	NaN
	24	NaN	NaN	NaN	NaN	NaN	NaN
	168	NaN	NaN	NaN	NaN	NaN	NaN
PA	6	NaN	NaN	NaN	NaN	NaN	NaN
	12	NaN	NaN	NaN	NaN	NaN	NaN
	24	NaN	NaN	NaN	NaN	NaN	NaN
	168	NaN	NaN	NaN	NaN	NaN	NaN
P	6	NaN	NaN	NaN	NaN	NaN	NaN
	12	NaN	NaN	NaN	NaN	NaN	NaN
	24	NaN	NaN	NaN	NaN	NaN	NaN
	168	NaN	NaN	NaN	NaN	NaN	NaN
SWC	6	NaN	NaN	NaN	NaN	NaN	NaN
	12	NaN	NaN	NaN	NaN	NaN	NaN
	24	NaN	NaN	NaN	NaN	NaN	NaN
	168	NaN	NaN	NaN	NaN	NaN	NaN
TS	6	NaN	NaN	NaN	NaN	NaN	NaN
	12	NaN	NaN	NaN	NaN	NaN	NaN
	24	NaN	NaN	NaN	NaN	NaN	NaN
	168	NaN	NaN	NaN	NaN	NaN	NaN

row = t2.iloc[0]

row

               RMSE
Kalman Filter  mean    0.390277
               std     0.120791
ERA-I          mean    1.432389
               std     0.090855
MDS            mean    2.153796
               std     0.638164
Name: (TA, 6), dtype: float64

np.argmin(row.iloc[[0,2,4]]) * 2

source

style_the_table

 style_the_table (style, cols=[0, 2, 4])

source

highlight_min_method

 highlight_min_method (row, props, cols)

# #| export
# renames_table_latex = {name: f"\parbox{{2.1cm}}{{{val}}}" for name, val in 
#                  {'SW_IN': "Shortwave radiation incoming \\textbf{SW IN} [\si{W/m^2}]",
#                'LW_IN': 'Longwave radiation incoming \\textbf{LW IN} [$W/m^2$]',
#                'TA': "Air Temperature \\textbf{TA} [$°C$]",
#                'VPD': "Vapuour Pressure Deficit \\textbf{VPD} [$hPa$]",
#                'PA': "Air Pressure \\textbf{PA} [$hPa$]",
#                'P': "Precipitation \\textbf{P} [$mm$]",
#                'WS': "Wind Speed \\textbf{WS} [$m/s$]",
#           }.items()}

t2.style.pipe(style_the_table)

		Kalman Filter		ERA-I		MDS
	RMSE	mean	std	mean	std	mean	std
var	gap_len
TA	6	0.390	0.121	1.432	0.091	2.154	0.638
	12	0.971	0.715	1.421	0.646	3.069	1.685
	24	0.543	0.045	1.619	0.708	2.586	1.306
	168	0.819	0.206	1.691	0.322	3.811	0.460
SW_IN	6	35.270	23.989	35.687	61.812	40.526	67.769
	12	65.750	58.475	60.069	98.779	64.785	86.461
	24	44.446	19.487	55.218	41.980	72.847	75.034
	168	61.290	37.403	74.600	44.042	105.098	64.146
LW_IN	6	-	-	-	-	-	-
	12	-	-	-	-	-	-
	24	-	-	-	-	-	-
	168	-	-	-	-	-	-
VPD	6	-	-	-	-	-	-
	12	-	-	-	-	-	-
	24	-	-	-	-	-	-
	168	-	-	-	-	-	-
WS	6	-	-	-	-	-	-
	12	-	-	-	-	-	-
	24	-	-	-	-	-	-
	168	-	-	-	-	-	-
PA	6	-	-	-	-	-	-
	12	-	-	-	-	-	-
	24	-	-	-	-	-	-
	168	-	-	-	-	-	-
P	6	-	-	-	-	-	-
	12	-	-	-	-	-	-
	24	-	-	-	-	-	-
	168	-	-	-	-	-	-
SWC	6	-	-	-	-	-	-
	12	-	-	-	-	-	-
	24	-	-	-	-	-	-
	168	-	-	-	-	-	-
TS	6	-	-	-	-	-	-
	12	-	-	-	-	-	-
	24	-	-	-	-	-	-
	168	-	-	-	-	-	-

t3 = t2.rename(index = renames_table_latex).style.apply(highlight_min_method, props="font-weight: bold", axis=1, cols=[0,2,4]).format_index(precision=0).format(precision=3)

t3

		Kalman Filter		ERA-I		MDS
	RMSE	mean	std	mean	std	mean	std
var	gap_len
\parbox{2.1cm}{\textbf{TA} [\si{°C}]}	6	0.390	0.121	1.432	0.091	2.154	0.638
	12	0.971	0.715	1.421	0.646	3.069	1.685
	24	0.543	0.045	1.619	0.708	2.586	1.306
	168	0.819	0.206	1.691	0.322	3.811	0.460
\parbox{2.1cm}{\textbf{SW\_IN} [\si{W/m^2}]}	6	35.270	23.989	35.687	61.812	40.526	67.769
	12	65.750	58.475	60.069	98.779	64.785	86.461
	24	44.446	19.487	55.218	41.980	72.847	75.034
	168	61.290	37.403	74.600	44.042	105.098	64.146
\parbox{2.1cm}{\textbf{LW\_IN} [\si{W/m^2}]}	6	nan	nan	nan	nan	nan	nan
	12	nan	nan	nan	nan	nan	nan
	24	nan	nan	nan	nan	nan	nan
	168	nan	nan	nan	nan	nan	nan
\parbox{2.1cm}{\textbf{VPD} [\si{hPa}]}	6	nan	nan	nan	nan	nan	nan
	12	nan	nan	nan	nan	nan	nan
	24	nan	nan	nan	nan	nan	nan
	168	nan	nan	nan	nan	nan	nan
\parbox{2.1cm}{\textbf{WS} [\si{m/s}]}	6	nan	nan	nan	nan	nan	nan
	12	nan	nan	nan	nan	nan	nan
	24	nan	nan	nan	nan	nan	nan
	168	nan	nan	nan	nan	nan	nan
\parbox{2.1cm}{\textbf{PA} [\si{hPa}]}	6	nan	nan	nan	nan	nan	nan
	12	nan	nan	nan	nan	nan	nan
	24	nan	nan	nan	nan	nan	nan
	168	nan	nan	nan	nan	nan	nan
\parbox{2.1cm}{\textbf{P} [\si{mm}]}	6	nan	nan	nan	nan	nan	nan
	12	nan	nan	nan	nan	nan	nan
	24	nan	nan	nan	nan	nan	nan
	168	nan	nan	nan	nan	nan	nan
\parbox{2.1cm}{\textbf{SWC} [\si{\%}]}	6	nan	nan	nan	nan	nan	nan
	12	nan	nan	nan	nan	nan	nan
	24	nan	nan	nan	nan	nan	nan
	168	nan	nan	nan	nan	nan	nan
\parbox{2.1cm}{\textbf{TS} [\si{°C}]}	6	nan	nan	nan	nan	nan	nan
	12	nan	nan	nan	nan	nan	nan
	24	nan	nan	nan	nan	nan	nan
	168	nan	nan	nan	nan	nan	nan

print(t3.to_latex(convert_css=True, hrules=True, clines="skip-last;data", column_format="p{2.1cm}c|rr|rr|rr", caption="caption", label="table"))

\begin{table}
\caption{caption}
\label{table}
\begin{tabular}{p{2.1cm}c|rr|rr|rr}
\toprule
 &  & \multicolumn{2}{r}{Kalman Filter} & \multicolumn{2}{r}{ERA-I} & \multicolumn{2}{r}{MDS} \\
 & RMSE & mean & std & mean & std & mean & std \\
var & gap_len &  &  &  &  &  &  \\
\midrule
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{TA} [\si{°C}]}} & 6 & \bfseries 0.390 & 0.121 & 1.432 & 0.091 & 2.154 & 0.638 \\
 & 12 & \bfseries 0.971 & 0.715 & 1.421 & 0.646 & 3.069 & 1.685 \\
 & 24 & \bfseries 0.543 & 0.045 & 1.619 & 0.708 & 2.586 & 1.306 \\
 & 168 & \bfseries 0.819 & 0.206 & 1.691 & 0.322 & 3.811 & 0.460 \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{SW\_IN} [\si{W/m^2}]}} & 6 & \bfseries 35.270 & 23.989 & 35.687 & 61.812 & 40.526 & 67.769 \\
 & 12 & 65.750 & 58.475 & \bfseries 60.069 & 98.779 & 64.785 & 86.461 \\
 & 24 & \bfseries 44.446 & 19.487 & 55.218 & 41.980 & 72.847 & 75.034 \\
 & 168 & \bfseries 61.290 & 37.403 & 74.600 & 44.042 & 105.098 & 64.146 \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{LW\_IN} [\si{W/m^2}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{VPD} [\si{hPa}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{WS} [\si{m/s}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{PA} [\si{hPa}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{P} [\si{mm}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{SWC} [\si{\%}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\multirow[c]{4}{*}{\parbox{2.1cm}{\textbf{TS} [\si{°C}]}} & 6 & nan & nan & nan & nan & nan & nan \\
 & 12 & nan & nan & nan & nan & nan & nan \\
 & 24 & nan & nan & nan & nan & nan & nan \\
 & 168 & nan & nan & nan & nan & nan & nan \\
\cline{1-8}
\bottomrule
\end{tabular}
\end{table}

gap_len

source

the_table

 the_table (data, y='rmse', y_name='RMSE')

the_table(data_results)

source

the_table_latex

 the_table_latex (table, file, caption='', label='', stand=False)

the_table_latex(the_table(data_results), "test_table.tex")

Table compare

source

table_compare

 table_compare (data, compare:str, y='rmse_stand', compare_ascending=True)

table_compare(k_results, 'type')

source

table_compare_latex

 table_compare_latex (table, file, caption='', label='')

Table Compare 3

same of above but for comparing 3 options instead of 2

source

table_compare3

 table_compare3 (data, compare:str, y='rmse_stand',
                 compare_ascending=True)

source

table_compare3_latex

 table_compare3_latex (table, file, caption='', label='')

Gap len table

g = gap_len.groupby(['var', 'gap_len']).agg({'rmse': ['mean']})

(g.droplevel(level=0, axis=1)
 .reset_index()
 .melt(id_vars=['var', 'gap_len'], var_name='rmse')
 .pivot(index = ['var', 'rmse'], columns=['gap_len'])
 .droplevel(level=0, axis=1)
)

source

table_gap_len

 table_gap_len (data, y='rmse_stand')

table_gap_len(gap_len)

source

table_gap_len_latex

 table_gap_len_latex (table, file, caption='', label='')

[f"{col:.0f}" for col in list(table_gap_len(gap_len).columns)]

from tempfile import tempdir
from pathlib import Path

table_gap_len_latex(table_gap_len(gap_len), Path(tempdir) / "test_table.tex")