From c53210d3857924e266cc1a1479378e1a0aab3828 Mon Sep 17 00:00:00 2001
From: OuyangWenyu <hust2014owen@gmail.com>
Date: Mon, 20 Nov 2023 20:47:01 +0800
Subject: [PATCH 1/2] save attr str var to numeric (#4)

---
 hydrodataset/camels.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/hydrodataset/camels.py b/hydrodataset/camels.py
index 706bc80..88d680f 100644
--- a/hydrodataset/camels.py
+++ b/hydrodataset/camels.py
@@ -1,10 +1,10 @@
 """
 Author: Wenyu Ouyang
 Date: 2022-01-05 18:01:11
-LastEditTime: 2023-10-18 17:54:32
+LastEditTime: 2023-11-20 20:08:14
 LastEditors: Wenyu Ouyang
 Description: Read Camels Series ("AUStralia", "BRazil", "ChiLe", "GreatBritain", "UnitedStates") datasets
-FilePath: \hydrodataset\hydrodataset\camels.py
+FilePath: /hydrodataset/hydrodataset/camels.py
 Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved.
 """
 import json
@@ -1523,6 +1523,7 @@ def cache_attributes_xrdataset(self):
 
         attrs_df = pd.concat(attrs.values(), axis=1)
 
+        # fix station names
         def fix_station_nm(station_nm):
             name = station_nm.title().rsplit(" ", 1)
             name[0] = name[0] if name[0][-1] == "," else f"{name[0]},"
@@ -1535,6 +1536,17 @@ def fix_station_nm(station_nm):
         obj_cols = attrs_df.columns[attrs_df.dtypes == "object"]
         for c in obj_cols:
             attrs_df[c] = attrs_df[c].str.strip().astype(str)
+
+        # transform categorical variables to numeric
+        categorical_mappings = {}
+        for column in attrs_df.columns:
+            if attrs_df[column].dtype == "object":
+                attrs_df[column] = attrs_df[column].astype("category")
+                categorical_mappings[column] = dict(
+                    enumerate(attrs_df[column].cat.categories)
+                )
+                attrs_df[column] = attrs_df[column].cat.codes
+
         # unify id to basin
         attrs_df.index.name = "basin"
         # We use xarray dataset to cache all data
@@ -1605,6 +1617,12 @@ def fix_station_nm(station_nm):
         for var_name in units_dict:
             if var_name in ds_from_df.data_vars:
                 ds_from_df[var_name].attrs["units"] = units_dict[var_name]
+
+        # Assign categorical mappings to the variables in the Dataset
+        for column in ds_from_df.data_vars:
+            if column in categorical_mappings:
+                mapping_str = categorical_mappings[column]
+                ds_from_df[column].attrs["category_mapping"] = str(mapping_str)
         return ds_from_df
 
     def cache_streamflow_xrdataset(self):

From 5c0abd6027ab396d00c031a627f06f5b98d290fb Mon Sep 17 00:00:00 2001
From: ouyangwenyu <wenyuouyang@outlook.com>
Date: Tue, 21 Nov 2023 06:48:31 +0800
Subject: [PATCH 2/2] =?UTF-8?q?Bump=20version:=200.1.6=20=E2=86=92=200.1.7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 hydrodataset/__init__.py | 2 +-
 setup.cfg                | 2 +-
 setup.py                 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hydrodataset/__init__.py b/hydrodataset/__init__.py
index 090e46c..db495e2 100644
--- a/hydrodataset/__init__.py
+++ b/hydrodataset/__init__.py
@@ -12,7 +12,7 @@
 
 __author__ = """Wenyu Ouyang"""
 __email__ = "wenyuouyang@outlook.com"
-__version__ = '0.1.6'
+__version__ = '0.1.7'
 
 # we use a .hydrodataset dir to save the setting
 hydrodataset_setting_dir = Path.home().joinpath(".hydrodataset")
diff --git a/setup.cfg b/setup.cfg
index d8036a2..c03fae4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.6
+current_version = 0.1.7
 commit = True
 tag = True
 
diff --git a/setup.py b/setup.py
index 91f0953..856672a 100644
--- a/setup.py
+++ b/setup.py
@@ -65,6 +65,6 @@
     test_suite="tests",
     tests_require=test_requirements,
     url="https://github.com/OuyangWenyu/hydrodataset",
-    version='0.1.6',
+    version='0.1.7',
     zip_safe=False,
 )