{ "cells": [ { "cell_type": "markdown", "id": "6807c98f", "metadata": { "lines_to_next_cell": 0 }, "source": [ "# Bin Data Examples" ] }, { "cell_type": "code", "execution_count": null, "id": "615f53c4", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "276a8bfc", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## import packages" ] }, { "cell_type": "code", "execution_count": null, "id": "e71dd415", "metadata": {}, "outputs": [], "source": [ "\n", "import os\n", "import re\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from GPSat import get_data_path\n", "from GPSat.bin_data import BinData\n", "from GPSat.dataprepper import DataPrep\n", "from GPSat.dataloader import DataLoader\n", "from GPSat.utils import WGS84toEASE2, EASE2toWGS84, cprint, stats_on_vals\n", "from GPSat.plot_utils import plot_wrapper" ] }, { "cell_type": "markdown", "id": "7a1abf19", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## parameters" ] }, { "cell_type": "code", "execution_count": null, "id": "6f5728d8", "metadata": {}, "outputs": [], "source": [ "\n", "val_col = 'z'\n", "\n", "by_cols = ['t', 'source']\n", "val_col = val_col\n", "x_col = 'x'\n", "y_col = 'y'\n", "grid_res = 50_000\n", "x_range = [-4_500_000.0, 4_500_000.0]\n", "y_range = [-4_500_000.0, 4_500_000.0]\n", "\n", "lat_0, lon_0 = 90, 0\n", "\n", "# plotting\n", "# extent = [lon min, lat max, lat min, lat max]\n", "extent = [-180, 180, 60, 90]\n", "\n", "# which projection to use: \"north\" or \"south\"\n", "projection = \"north\"" ] }, { "cell_type": "markdown", "id": "b45b087e", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## read in raw data\n", "\n", "in this case from several csv files" ] }, { "cell_type": "code", "execution_count": null, "id": "acfb0a53", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "df = DataLoader.read_flat_files(file_dirs=get_data_path(\"example\"),\n", " file_regex=\"_RAW\\.csv$\",\n", " col_funcs={\n", " \"source\": {\n", " \"func\": lambda x: re.sub('_RAW.*$', '', os.path.basename(x)),\n", " \"filename_as_arg\": True\n", " }\n", " })\n", "\n", "# convert lon, lat, datetime to x, y, t - to be used as the coordinate space\n", "# - these could be included in the col_funcs\n", "df['x'], df['y'] = WGS84toEASE2(lon=df['lon'], lat=df['lat'], lat_0=lat_0, lon_0=lon_0)\n", "df['t'] = df['datetime'].values.astype(\"datetime64[D]\").astype(float)" ] }, { "cell_type": "markdown", "id": "2ce98525", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## Statistic on Values\n", "it is useful to look at summary statistic on values to get an idea how it should be processed" ] }, { "cell_type": "code", "execution_count": null, "id": "7cbaeb3f", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "sov = stats_on_vals(vals=df[val_col].values, name=val_col)\n", "\n", "cprint(\"-\" * 10, \"BOLD\")\n", "cprint(f\"Stats on '{val_col}' column\", \"OKCYAN\")\n", "cprint(sov, \"OKBLUE\")" ] }, { "cell_type": "markdown", "id": "8f711fc1", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## DataPrep.bin_data_by: 2d binning\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0c691160", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "\n", "bin_ds = DataPrep.bin_data_by(df=df.loc[(df['z'] > -0.35) & (df['z'] < 0.65)],\n", " by_cols=by_cols,\n", " val_col=val_col,\n", " x_col=x_col,\n", " y_col=y_col,\n", " grid_res=grid_res,\n", " x_range=x_range,\n", " y_range=y_range,\n", " 
return_df=False)" ] }, { "cell_type": "markdown", "id": "8cdb51fb", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## plot results" ] }, { "cell_type": "code", "execution_count": null, "id": "22b7b4e8", "metadata": {}, "outputs": [], "source": [ "\n", "# bin_data_by returns a Dataset, unless return_df = True\n", "# - drop nans and reset index\n", "bin_df = bin_ds.to_dataframe().dropna().reset_index()\n", "\n", "# this will plot all observations, some on top of each other\n", "bin_df['lon'], bin_df['lat'] = EASE2toWGS84(bin_df['x'], bin_df['y'],\n", " lat_0=lat_0, lon_0=lon_0)\n", "\n", "mid_t = np.median(bin_df['t'])\n", "\n", "fig, stats_df = plot_wrapper(plt_df=bin_df.loc[bin_df['t'] == mid_t],\n", " val_col=val_col,\n", " max_obs=500_000,\n", " vmin_max=[-0.1, 0.5],\n", " projection=projection,\n", " extent=extent)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "f7e31b1f", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## apply 1-d binning\n", "demonstrated by a toy example" ] }, { "cell_type": "code", "execution_count": null, "id": "28f8f5f7", "metadata": {}, "outputs": [], "source": [ "\n", "\n", "# -\n", "# generate toy data\n", "# -\n", "\n", "N = 10001\n", "\n", "np.random.seed(0)\n", "# tf.random.set_seed(0)\n", "\n", "# Build inputs X\n", "X = np.linspace(0, 4 * np.pi, N)[:, None] # X must be of shape [N, 1]\n", "\n", "# Deterministic functions in place of latent ones\n", "f1 = np.sin\n", "f2 = np.cos\n", "\n", "# Use transform = exp to ensure positive-only scale values\n", "transform = np.exp\n", "\n", "# Compute loc and scale as functions of input X\n", "loc = f1(X)\n", "scale = transform(f2(X))\n", "\n", "# Sample outputs Y from Gaussian Likelihood\n", "# - scale is standard deviation\n", "Y = np.random.normal(loc, scale)\n", "\n", "# store data in DataFrame (bin_data_by expects DataFrame atm)\n", "# - by is a dummy column, currently need\n", "df_dummy = pd.DataFrame({\"x\": X[:,0], \"y\": Y[:,0], 'by': 1})" ] }, { "cell_type": "markdown", "id": "1f1431f0", "metadata": { "lines_to_next_cell": 0 }, "source": [ "Bin 1d Data" ] }, { "cell_type": "code", "execution_count": null, "id": "0ee90801", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "# TODO: just use bin_data ? 
{ "cell_type": "markdown", "id": "1f1431f0", "metadata": { "lines_to_next_cell": 0 }, "source": [ "bin 1-d data" ] },
{ "cell_type": "code", "execution_count": null, "id": "0ee90801", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "# TODO: just use bin_data? it needs to be modified first\n", "bdf = DataPrep.bin_data_by(\n", "    df=df_dummy,\n", "    x_col='x',\n", "    val_col='y',\n", "    by_cols='by',\n", "    # bin_statistic=[np.mean, np.var, len],\n", "    bin_statistic=[\"mean\", \"std\", \"count\"],\n", "    x_range=[0, 4 * np.pi],\n", "    grid_res=0.1,\n", "    bin_2d=False,\n", "    return_df=True)\n", "\n", "bdf.reset_index(inplace=True)\n", "bdf.drop(\"by\", axis=1, inplace=True)" ] },
{ "cell_type": "markdown", "id": "d6cade7d", "metadata": { "lines_to_next_cell": 0 }, "source": [ "plot binned results with the original observations" ] },
{ "cell_type": "code", "execution_count": null, "id": "ca458cda", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "# plot the binned mean with a +/- 1 standard deviation band, over the raw observations\n", "plt.plot(bdf['x'], bdf['y_mean'])\n", "plt.fill_between(\n", "    bdf['x'],\n", "    bdf['y_mean'] + bdf['y_std'],\n", "    bdf['y_mean'] - bdf['y_std'],\n", "    alpha=0.5)\n", "plt.scatter(X[:, 0], Y[:, 0], s=5, alpha=0.25)\n", "plt.show()" ] },
{ "cell_type": "markdown", "id": "23f48061", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## BinData class" ] },
{ "cell_type": "code", "execution_count": null, "id": "eb3d535f", "metadata": {}, "outputs": [], "source": [ "\n", "# save the example data as a parquet file, to be read back in by BinData below\n", "\n", "# BinData is useful when there is a large amount of data in a single file (e.g. HDF5 or, as here, parquet)\n", "# - it allows for reading the data in by batches\n", "\n", "parq_tmp = get_data_path(\"example\", \"tmp.parquet\")\n", "df['date'] = df['datetime'].astype('datetime64[D]')\n", "\n", "df.to_parquet(parq_tmp, engine=\"fastparquet\")" ] },
{ "cell_type": "markdown", "id": "9875f974", "metadata": { "lines_to_next_cell": 0 }, "source": [ "## bin config\n", "the same parameters as used by DataPrep.bin_data_by" ] },
{ "cell_type": "code", "execution_count": null, "id": "f30ff792", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "\n", "bin_config = {\n", "    'grid_res': 50000,\n", "    'by_cols': ['source', 'date'],\n", "    'val_col': val_col,\n", "    'bin_statistic': 'mean',\n", "    'row_select': [{'col': val_col, 'comp': '>=', 'val': -2.0},\n", "                   {'col': val_col, 'comp': '<=', 'val': 2.0}],\n", "    'x_col': 'x',\n", "    'y_col': 'y',\n", "    'x_range': [-4500000.0, 4500000.0],\n", "    'y_range': [-4500000.0, 4500000.0]\n", "}" ] },
{ "cell_type": "markdown", "id": "0a6fd607", "metadata": { "lines_to_next_cell": 0 }, "source": [ "bin data\n", "\n", "NOTE: this class is currently a work in progress; it effectively acts as a wrapper around DataPrep.bin_data_by and stats_on_vals" ] },
{ "cell_type": "code", "execution_count": null, "id": "364edf99", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "bd = BinData()\n", "\n", "# if load_by is not specified, data will be read in by the unique values of by_cols (from bin_config)\n", "bin_df, stats = bd.bin_data(\n", "    source=parq_tmp,\n", "    batch=False,\n", "    bin_config=bin_config)" ] },
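{ "cell_type": "markdown", "id": "e5c7a2f9", "metadata": { "lines_to_next_cell": 0 }, "source": [ "a sketch only: the call above reads everything in one go (batch=False); setting batch=True is intended to read the data in by batches, as mentioned in the BinData cell above - left commented out here as the class is still a work in progress" ] },
{ "cell_type": "code", "execution_count": null, "id": "f6d8b3a0", "metadata": {}, "outputs": [], "source": [ "\n", "# sketch only: bin the same parquet file while reading it in by batches\n", "# - assumes batch=True enables the batched reading mentioned above; uncomment to try\n", "# bin_df_batch, stats_batch = bd.bin_data(\n", "#     source=parq_tmp,\n", "#     batch=True,\n", "#     bin_config=bin_config)" ] },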
"cprint(bin_df.dtypes, \"OKBLUE\")\n", "\n", "cprint(\"-\" * 20, \"BOLD\")\n", "cprint(\"stats:\")\n", "cprint(stats, \"OKBLUE\")\n", "\n" ] }, { "cell_type": "markdown", "id": "13758eca", "metadata": { "lines_to_next_cell": 0 }, "source": [ "write to file" ] }, { "cell_type": "code", "execution_count": null, "id": "00371c15", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "\n", "# bin_df.to_parquet(\"/path/to/binned_data.parquet\", engine=\"fastparquet\")\n", "\n" ] } ], "metadata": { "jupytext": { "cell_metadata_filter": "-all", "main_language": "python", "notebook_metadata_filter": "-all" } }, "nbformat": 4, "nbformat_minor": 5 }