{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "18b18c53",
   "metadata": {
    "id": "460ff464-7812-41fb-bc5b-bc4f24e16499"
   },
   "source": [
    "# Loading Trajectory Data\n",
    "\n",
    "Mobility data comes in many formats: timestamps as unix integers or ISO strings (with timezones), \n",
    "coordinates in lat/lon or projected, files as single CSVs or partitioned directories.\n",
    "\n",
    "`nomad.io.from_file` handles these cases with a single function call."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e3245095",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import nomad.data as data_folder\n",
    "import nomad.io.base as loader\n",
    "from nomad.constants import DEFAULT_SCHEMA\n",
    "\n",
    "# Directory of the installed nomad.data package; the example datasets are read from here.\n",
    "data_dir = Path(data_folder.__file__).parent"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a48b37e7",
   "metadata": {},
   "source": [
    "## Pandas vs nomad.io for partitioned data\n",
    "\n",
    "Partitioned directories (e.g., `date=2024-01-01/`, `date=2024-01-02/`, ...) require a loop with pandas:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0a3a2a1b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pandas: 25835 rows\n",
      "user_id object\n",
      "dev_lat float64\n",
      "dev_lon float64\n",
      "local_datetime object\n",
      "dtype: object\n",
      "\n",
      "First few rows:\n",
      " user_id dev_lat dev_lon local_datetime\n",
      "0 wizardly_joliot 38.321711 -36.667334 2024-01-01 14:29:00.000000000\n",
      "1 wizardly_joliot 38.321676 -36.667365 2024-01-01 14:35:00.000000000\n",
      "2 wonderful_swirles 38.321017 -36.667869 2024-01-01 15:06:00.000000000\n"
     ]
    }
   ],
   "source": [
    "# glob returns matches in arbitrary, filesystem-dependent order; sort them so the\n",
    "# concatenated row order (and the preview printed below) is reproducible.\n",
    "csv_files = sorted(glob.glob(str(data_dir / \"partitioned_csv\" / \"*\" / \"*.csv\")))\n",
    "df_list = []\n",
    "for f in csv_files:\n",
    "    df_list.append(pd.read_csv(f))\n",
    "df_pandas = pd.concat(df_list, ignore_index=True)\n",
    "\n",
    "print(f\"Pandas: {len(df_pandas)} rows\")\n",
    "print(df_pandas.dtypes)\n",
    "print(\"\\nFirst few rows:\")\n",
    "print(df_pandas.head(3))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71eb4656",
   "metadata": {},
   "source": [
    "`nomad.io.from_file` handles partitioned directories in one line, plus automatic type casting and column mapping:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d48bf128",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nomad.io: 25835 rows\n",
      "user_id object\n",
      "dev_lat float64\n",
      "dev_lon float64\n",
      "local_datetime datetime64[ns]\n",
      "dtype: object\n",
      "\n",
      "First few rows:\n",
      " user_id dev_lat dev_lon local_datetime\n",
      "0 admiring_curie 38.320444 -36.666827 2024-01-04 02:40:00\n",
      "1 admiring_curie 38.320438 -36.666755 2024-01-04 03:16:00\n",
      "2 admiring_curie 38.320434 -36.666877 2024-01-04 19:21:00\n",
      "\n",
      "Note: 'local_datetime' is now datetime64[ns], not object!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\pacob\\Desktop\\Brain\\Code Development\\nomad\\nomad\\io\\base.py:621: UserWarning: The 'local_datetime' column has timezone-naive records consider localizing or using unix timestamps.\n",
      " warnings.warn(f\"The '{col}' column has timezone-naive records consider localizing or using unix timestamps.\")\n"
     ]
    }
   ],
   "source": [
    "# Keys are nomad's canonical column names; values are the column names used in this dataset.\n",
    "traj_cols = {\n",
    "    \"user_id\": \"user_id\",\n",
    "    \"latitude\": \"dev_lat\",\n",
    "    \"longitude\": \"dev_lon\",\n",
    "    \"datetime\": \"local_datetime\",\n",
    "}\n",
    "\n",
    "df = loader.from_file(data_dir / \"partitioned_csv\", format=\"csv\", traj_cols=traj_cols, parse_dates=True)\n",
    "print(f\"nomad.io: {len(df)} rows\")\n",
    "print(df.dtypes)\n",
    "print(\"\\nFirst few rows:\")\n",
    "print(df.head(3))\n",
    "print(\"\\nNote: 'local_datetime' is now datetime64[ns], not object!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20c187a2",
   "metadata": {},
   "source": [
    "The same pattern works for Parquet files. Type casting and other processing rely on `traj_cols`, which tells the function which columns of the dataset correspond to the default (\"typical\") spatio-temporal column names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4ca035ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 25835 rows\n",
      "uid object\n",
      "timestamp Int64\n",
      "latitude float64\n",
      "longitude float64\n",
      "date object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Keys are nomad's canonical column names; values are the column names used in this dataset.\n",
    "traj_cols = {\n",
    "    \"user_id\": \"uid\",\n",
    "    \"timestamp\": \"timestamp\",\n",
    "    \"latitude\": \"latitude\",\n",
    "    \"longitude\": \"longitude\",\n",
    "    \"date\": \"date\",\n",
    "}\n",
    "\n",
    "df = loader.from_file(data_dir / \"partitioned_parquet\", format=\"parquet\", traj_cols=traj_cols, parse_dates=True)\n",
    "print(f\"Loaded {len(df)} rows\")\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1e2f4479",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['user_id', 'latitude', 'longitude', 'datetime', 'start_datetime', 'end_datetime', 'start_timestamp', 'end_timestamp', 'timestamp', 'date', 'utc_date', 'x', 'y', 'geohash', 'tz_offset', 'duration', 'ha', 'h3_cell', 'location_id'])\n"
     ]
    }
   ],
   "source": [
    "# These are the default canonical column names (the keys used in `traj_cols` above)\n",
    "print(DEFAULT_SCHEMA.keys())"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "formats": "ipynb,py:percent"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}