Bin Data Examples


import packages

[1]:

import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from GPSat import get_data_path
from GPSat.bin_data import BinData
from GPSat.dataprepper import DataPrep
from GPSat.dataloader import DataLoader
from GPSat.utils import WGS84toEASE2, EASE2toWGS84, cprint, stats_on_vals
from GPSat.plot_utils import plot_wrapper

parameters

[2]:

val_col = 'z'
by_cols = ['t', 'source']
x_col = 'x'
y_col = 'y'
grid_res = 50_000
x_range = [-4_500_000.0, 4_500_000.0]
y_range = [-4_500_000.0, 4_500_000.0]
lat_0, lon_0 = 90, 0

# plotting
# extent = [lon min, lon max, lat min, lat max]
extent = [-180, 180, 60, 90]
# which projection to use: "north" or "south"
projection = "north"
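As a quick sanity check on the coordinate convention, the projection centre (lat_0, lon_0) should map to (approximately) the origin of the EASE2 plane. A minimal sketch, assuming WGS84toEASE2 accepts array inputs (it is passed DataFrame columns below):

[ ]:

# the projection centre should map to ~(0, 0) in EASE2 coordinates (sketch)
x0, y0 = WGS84toEASE2(lon=np.array([lon_0]), lat=np.array([lat_0]),
                      lat_0=lat_0, lon_0=lon_0)
print(x0, y0)  # expect values near (0, 0)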

read in raw data

in this case, from several CSV files

[3]:


df = DataLoader.read_flat_files(file_dirs=get_data_path("example"),
                                file_regex=r"_RAW\.csv$",
                                col_funcs={
                                    "source": {
                                        "func": lambda x: re.sub('_RAW.*$', '', os.path.basename(x)),
                                        "filename_as_arg": True
                                    }
                                })

# convert lon, lat, datetime to x, y, t - to be used as the coordinate space
# - these could be included in the col_funcs
df['x'], df['y'] = WGS84toEASE2(lon=df['lon'], lat=df['lat'], lat_0=lat_0, lon_0=lon_0)
df['t'] = df['datetime'].values.astype("datetime64[D]").astype(float)
----------------------------------------------------------------------------------------------------
reading files from:
/home/runner/work/GPSat/GPSat/data/example/
that match regular expression: _RAW\.csv$
'read_from_multiple_files': 0.923 seconds

Statistic on Values

it is useful to look at summary statistics of the values to get an idea of how the data should be processed

[4]:

sov = stats_on_vals(vals=df[val_col].values, name=val_col)

cprint("-" * 10, "BOLD")
cprint(f"Stats on '{val_col}' column", "OKCYAN")
cprint(sov, "OKBLUE")
'stats_on_vals': 0.330 seconds
----------
Stats on 'z' column
                      z
measure               z
size            1174848
num_not_nan     1174848
num_inf               0
min            -16.7965
mean           0.128416
max             16.7093
std            0.186566
skew         -10.912819
kurtosis     890.268297
q0.010          -0.3732
q0.050          -0.1387
q0.100          -0.0485
q0.200           0.0322
q0.300           0.0767
q0.400           0.1101
q0.500           0.1394
q0.600           0.1686
q0.700           0.2011
q0.800           0.2424
q0.900           0.3067
q0.950           0.3632
q0.990           0.4775
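The heavy tails (skew ≈ -10.9, kurtosis ≈ 890) suggest clipping extreme values before binning; the -0.35 to 0.65 range used in the next cell is broadly consistent with the q0.010 / q0.990 values above. A minimal sketch of deriving candidate bounds directly from quantiles:

[ ]:

# derive candidate clip bounds from quantiles (illustrative)
lo, hi = np.quantile(df[val_col].values, [0.01, 0.99])
print(f"candidate bounds: [{lo:.4f}, {hi:.4f}]")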

DataPrep.bin_data_by: 2d binning

[5]:


bin_ds = DataPrep.bin_data_by(df=df.loc[(df['z'] > -0.35) & (df['z'] < 0.65)],
                              by_cols=by_cols,
                              val_col=val_col,
                              x_col=x_col,
                              y_col=y_col,
                              grid_res=grid_res,
                              x_range=x_range,
                              y_range=y_range,
                              return_df=False)
'bin_data_by': 1.624 seconds
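With return_df=False, bin_data_by returns an xarray Dataset. A minimal inspection sketch, assuming the data variable is named after val_col and the dimensions follow the x, y and by_cols columns used above:

[ ]:

# inspect the structure of the returned Dataset (sketch)
print(bin_ds)
print(bin_ds[val_col].dims)  # expect dimensions covering y, x and the by_cols (t, source)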

plot results

[6]:

# bin_data_by returns a Dataset, unless return_df = True
# - drop nans and reset index
bin_df = bin_ds.to_dataframe().dropna().reset_index()

bin_df['lon'], bin_df['lat'] = EASE2toWGS84(bin_df['x'], bin_df['y'], lat_0=lat_0, lon_0=lon_0)

# plot the binned values for the median date
mid_t = np.median(bin_df['t'])
fig, stats_df = plot_wrapper(plt_df=bin_df.loc[bin_df['t'] == mid_t],
                             val_col=val_col,
                             max_obs=500_000,
                             vmin_max=[-0.1, 0.5],
                             projection=projection,
                             extent=extent)

plt.show()
'stats_on_vals': 0.003 seconds
plotting pcolormesh...
'plot_pcolormesh': 0.042 seconds
plotting hist (using all data)...
'plot_hist': 0.066 seconds
'plot_wrapper': 0.196 seconds
../_images/notebooks_bin_data_13_2.png

apply 1-d binning

demonstrated with a toy example

[7]:


# -
# generate toy data
# -

N = 10001
np.random.seed(0)

# Build inputs X
X = np.linspace(0, 4 * np.pi, N)[:, None]  # X must be of shape [N, 1]

# Deterministic functions in place of latent ones
f1 = np.sin
f2 = np.cos

# Use transform = exp to ensure positive-only scale values
transform = np.exp

# Compute loc and scale as functions of input X
loc = f1(X)
scale = transform(f2(X))

# Sample outputs Y from a Gaussian likelihood
# - scale is the standard deviation
Y = np.random.normal(loc, scale)

# store data in a DataFrame (bin_data_by currently expects a DataFrame)
# - 'by' is a dummy column, currently required
df_dummy = pd.DataFrame({"x": X[:, 0], "y": Y[:, 0], 'by': 1})

Bin 1d Data

[8]:

# TODO: just use bin_data? needs to be modified
bdf = DataPrep.bin_data_by(df=df_dummy,
                           x_col='x',
                           val_col='y',
                           by_cols='by',
                           # bin_statistic=[np.mean, np.var, len],
                           bin_statistic=["mean", "std", "count"],
                           x_range=[0, 4 * np.pi],
                           grid_res=0.1,
                           bin_2d=False,
                           return_df=True)

bdf.reset_index(inplace=True)
bdf.drop("by", axis=1, inplace=True)
'bin_data_by': 1.554 seconds
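Since bin_statistic included "count", sparsely-populated bins can be identified; mean and standard deviation estimates in such bins are noisier. A minimal sketch, assuming the count column is named 'y_count' (matching the 'y_mean' / 'y_std' naming used in the next cell):

[ ]:

# flag sparsely-populated bins (sketch) - statistics there are noisier
dense = bdf.loc[bdf['y_count'] >= 5]
print(f"{len(dense)} of {len(bdf)} bins have at least 5 observations")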

plot binned results with original obs

[9]:

# plot the binned mean with a +/- one standard deviation band,
# and the original observations underneath
plt.plot(bdf['x'], bdf['y_mean'])
plt.fill_between(bdf['x'],
                 bdf['y_mean'] + bdf['y_std'],
                 bdf['y_mean'] - bdf['y_std'],
                 alpha=0.5)
plt.scatter(X[:, 0], Y[:, 0], s=5, alpha=0.25)
plt.show()
../_images/notebooks_bin_data_19_0.png

BinData class

[10]:

# save the data as a parquet file
# - useful when a large amount of data is stored in a single file,
#   as parquet allows for reading data in by batches
parq_tmp = get_data_path("example", "tmp.parquet")

df['date'] = df['datetime'].astype('datetime64[D]')
df.to_parquet(parq_tmp, engine="fastparquet")
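To confirm the file can be read back in batches (the motivation for writing parquet here), a minimal sketch using fastparquet's row-group iteration directly; this is not a GPSat API:

[ ]:

# read the parquet file back one row group at a time (sketch)
from fastparquet import ParquetFile

pf = ParquetFile(parq_tmp)
for chunk in pf.iter_row_groups():
    print(f"read a chunk of {len(chunk)} rows")
    break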

bin config

these are the same parameters as used by DataPrep.bin_data_by

[11]:


bin_config = {
    'grid_res': 50000,
    'by_cols': ['source', 'date'],
    'val_col': val_col,
    'bin_statistic': 'mean',
    'row_select': [
        {'col': val_col, 'comp': '>=', 'val': -2.0},
        {'col': val_col, 'comp': '<=', 'val': 2.0}
    ],
    'x_col': 'x',
    'y_col': 'y',
    'x_range': [-4500000.0, 4500000.0],
    'y_range': [-4500000.0, 4500000.0]
}
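Each row_select entry is a column comparison applied when the data is loaded, and rows must satisfy all entries. For reference, a sketch of the equivalent pandas selection:

[ ]:

# equivalent pandas selection to the row_select above (illustrative)
keep = df.loc[(df[val_col] >= -2.0) & (df[val_col] <= 2.0)]
print(f"{len(keep)} of {len(df)} rows kept")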

bin data

NOTE: this class is currently a work in progress; it effectively acts as a wrapper around DataPrep.bin_data_by and stats_on_vals

[12]:

bd = BinData()

# if load_by is not specified, data will be read in by the unique by_cols (from bin_config)
bin_df, stats = bd.bin_data(source=parq_tmp,
                            batch=False,
                            bin_config=bin_config)
will bin data all at once
'data_select': 0.424 seconds
'load': 0.425 seconds
getting stats on column: z from data
'stats_on_vals': 0.255 seconds
binning data...
'data_select': 0.058 seconds
'bin_data_by': 1.771 seconds
'bin_data_all_at_once': 2.480 seconds
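Here batch=False reads and bins everything at once ("will bin data all at once" above). For inputs too large to hold in memory, the same call could be made in batches; a hedged sketch, based on the load_by comment above:

[ ]:

# batched variant (sketch): read and bin by unique by_cols combinations
# bin_df, stats = bd.bin_data(source=parq_tmp, batch=True, bin_config=bin_config)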

view results

[13]:

cprint("-" * 20, "BOLD") cprint("bin_df:") cprint("head", "OKCYAN") cprint(bin_df.head(5), "OKBLUE") cprint("dtypes", "OKCYAN") cprint(bin_df.dtypes, "OKBLUE") cprint("-" * 20, "BOLD") cprint("stats:") cprint(stats, "OKBLUE")

--------------------
bin_df:
head
           y          x source       date         z
0 -2475000.0  1125000.0      B 2020-03-06  0.181500
1 -2225000.0 -1525000.0      A 2020-03-10  0.198300
2 -2225000.0 -1425000.0      B 2020-03-01  0.231700
3 -2175000.0 -1625000.0      A 2020-03-06 -0.005200
4 -2175000.0 -1575000.0      A 2020-03-06 -0.052156
dtypes
y                float64
x                float64
source            object
date      datetime64[ns]
z                float64
dtype: object
--------------------
stats:
                      z
measure               z
size            1174848
num_not_nan     1174848
num_inf               0
min            -16.7965
mean           0.128416
max             16.7093
std            0.186566
skew         -10.912819
kurtosis     890.268297
q0.001        -1.087376
q0.010          -0.3732
q0.050          -0.1387
q0.100          -0.0485
q0.200           0.0322
q0.300           0.0767
q0.400           0.1101
q0.500           0.1394
q0.600           0.1686
q0.700           0.2011
q0.800           0.2424
q0.900           0.3067
q0.950           0.3632
q0.990           0.4775
q0.999         0.646315

write to file

[14]:

# bin_df.to_parquet("/path/to/binned_data.parquet", engine="fastparquet")