{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "18b18c53",
   "metadata": {
    "id": "460ff464-7812-41fb-bc5b-bc4f24e16499"
   },
   "source": [
    "# Loading Trajectory Data\n",
    "\n",
    "Mobility data comes in many formats: timestamps as Unix integers or ISO strings (with timezones),\n",
    "coordinates as latitude/longitude or in a projected system, and files as single CSVs or partitioned directories.\n",
    "\n",
    "`nomad.io.from_file` handles these cases with a single function call."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e3245095",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import pandas as pd\n",
    "import nomad.io.base as loader\n",
    "import nomad.data as data_folder\n",
    "from pathlib import Path\n",
    "\n",
    "data_dir = Path(data_folder.__file__).parent"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a48b37e7",
   "metadata": {},
   "source": [
    "## Pandas vs nomad.io for partitioned data\n",
    "\n",
    "Partitioned directories (e.g., `date=2024-01-01/`, `date=2024-01-02/`, ...) require a loop with pandas:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0a3a2a1b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pandas: 25835 rows\n",
      "user_id            object\n",
      "dev_lat           float64\n",
      "dev_lon           float64\n",
      "local_datetime     object\n",
      "dtype: object\n",
      "\n",
      "First few rows:\n",
      "             user_id    dev_lat    dev_lon                 local_datetime\n",
      "0    wizardly_joliot  38.321711 -36.667334  2024-01-01 14:29:00.000000000\n",
      "1    wizardly_joliot  38.321676 -36.667365  2024-01-01 14:35:00.000000000\n",
      "2  wonderful_swirles  38.321017 -36.667869  2024-01-01 15:06:00.000000000\n"
     ]
    }
   ],
   "source": [
    "csv_files = glob.glob(str(data_dir / \"partitioned_csv\" / \"*\" / \"*.csv\"))\n",
    "df_list = []\n",
    "for f in csv_files:\n",
    "    df_list.append(pd.read_csv(f))\n",
    "df_pandas = pd.concat(df_list, ignore_index=True)\n",
    "\n",
    "print(f\"Pandas: {len(df_pandas)} rows\")\n",
    "print(df_pandas.dtypes)\n",
    "print(\"\\nFirst few rows:\")\n",
    "print(df_pandas.head(3))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71eb4656",
   "metadata": {},
   "source": [
    "`nomad.io.from_file` handles partitioned directories in one line, plus automatic type casting and column mapping:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d48bf128",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nomad.io: 25835 rows\n",
      "user_id                   object\n",
      "dev_lat                  float64\n",
      "dev_lon                  float64\n",
      "local_datetime    datetime64[ns]\n",
      "dtype: object\n",
      "\n",
      "First few rows:\n",
      "          user_id    dev_lat    dev_lon      local_datetime\n",
      "0  admiring_curie  38.320444 -36.666827 2024-01-04 02:40:00\n",
      "1  admiring_curie  38.320438 -36.666755 2024-01-04 03:16:00\n",
      "2  admiring_curie  38.320434 -36.666877 2024-01-04 19:21:00\n",
      "\n",
      "Note: 'local_datetime' is now datetime64[ns], not object!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\pacob\\Desktop\\Brain\\Code Development\\nomad\\nomad\\io\\base.py:621: UserWarning: The 'local_datetime' column has timezone-naive records consider localizing or using unix timestamps.\n",
      "  warnings.warn(f\"The '{col}' column has timezone-naive records consider localizing or using unix timestamps.\")\n"
     ]
    }
   ],
   "source": [
    "traj_cols = {\"user_id\": \"user_id\",\n",
    "             \"latitude\": \"dev_lat\",\n",
    "             \"longitude\": \"dev_lon\",\n",
    "             \"datetime\": \"local_datetime\"}\n",
    "\n",
    "df = loader.from_file(data_dir / \"partitioned_csv\", format=\"csv\", traj_cols=traj_cols, parse_dates=True)\n",
    "print(f\"nomad.io: {len(df)} rows\")\n",
    "print(df.dtypes)\n",
    "print(\"\\nFirst few rows:\")\n",
    "print(df.head(3))\n",
    "print(\"\\nNote: 'local_datetime' is now datetime64[ns], not object!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20c187a2",
   "metadata": {},
   "source": [
    "The same pattern works for Parquet files. Type casting and processing rely on the `traj_cols` mapping, which tells `from_file` which columns in the data correspond to the default (canonical) spatio-temporal column names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4ca035ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 25835 rows\n",
      "uid           object\n",
      "timestamp      Int64\n",
      "latitude     float64\n",
      "longitude    float64\n",
      "date          object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "traj_cols = {\"user_id\": \"uid\", \"timestamp\": \"timestamp\", \n",
    "             \"latitude\": \"latitude\", \"longitude\": \"longitude\", \"date\": \"date\"}\n",
    "\n",
    "df = loader.from_file(data_dir / \"partitioned_parquet\", format=\"parquet\", traj_cols=traj_cols, parse_dates=True)\n",
    "print(f\"Loaded {len(df)} rows\")\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1e2f4479",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['user_id', 'latitude', 'longitude', 'datetime', 'start_datetime', 'end_datetime', 'start_timestamp', 'end_timestamp', 'timestamp', 'date', 'utc_date', 'x', 'y', 'geohash', 'tz_offset', 'duration', 'ha', 'h3_cell', 'location_id'])\n"
     ]
    }
   ],
   "source": [
    "# These are the default canonical column names\n",
    "from nomad.constants import DEFAULT_SCHEMA\n",
    "print(DEFAULT_SCHEMA.keys())"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "formats": "ipynb,py:percent"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}