Skip to content

Commit

Permalink
Update table management to allow using lists
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Jul 8, 2024
1 parent 627416f commit b113190
Showing 1 changed file with 30 additions and 7 deletions.
37 changes: 30 additions & 7 deletions src/data_common/dataset/table_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pandas as pd
from pandas.io.json import build_table_schema

import numpy as np
from data_common.db import duck_query


Expand All @@ -18,15 +18,29 @@ class SchemaValidator(TypedDict):
fields: list[TypedFieldSchema]


def expand_array(series: pd.Series) -> pd.Series:
    """
    Return a hashable stand-in for a series whose cells hold sequences.

    Despite the name, nothing is exploded into extra rows. If any value in
    the series is a list, tuple or numpy array, every value is replaced by
    its string form so that hash-based operations such as ``unique()`` do
    not raise a TypeError. A series containing no such values is returned
    as-is (the same object, not a copy).
    """

    def _to_text(value: object) -> str:
        # str() on a large ndarray abbreviates the middle with "...", which
        # makes distinct arrays look identical. Lifting the threshold to the
        # array's own size keeps the usual str() layout but prints every
        # element, so different arrays always give different text.
        if isinstance(value, np.ndarray) and value.ndim > 0:
            return np.array2string(value, threshold=value.size)
        return str(value)

    if any(isinstance(x, (list, tuple, np.ndarray)) for x in series):
        return series.apply(_to_text)  # type: ignore
    return series


def is_unique(series: pd.Series) -> bool:
    """
    Report whether all values in the series are distinct.

    The series is passed through ``expand_array`` first, which turns
    list/tuple/ndarray cells into strings, so that ``unique()`` can hash
    every value instead of raising a TypeError on sequence-valued columns.
    """
    # Single return only: the earlier plain `series.unique()` comparison is
    # dropped because it fails on unhashable cells and made the list-safe
    # comparison unreachable.
    return len(series) == len(expand_array(series).unique())


def get_example(series: pd.Series) -> str | int | float:
item = sorted(list(series.dropna()))
try:
item = sorted(list(series.dropna()))
except ValueError:
item = series
if len(item) == 0:
return ""
item = item[0]
Expand Down Expand Up @@ -117,13 +131,22 @@ def update_table_schema(
raise ValueError(f"Unsupported file type {path.suffix}")

# get columns that have less than 15 unique entries and have no blank entries
cols = df.apply(lambda x: x.nunique() < 15 and not x.isnull().any())

def safe_unique(col: pd.Series) -> bool:
    """
    Tell whether a column has fewer than 15 distinct values and no nulls.

    Columns holding list, tuple or numpy array cells are rejected outright
    (returns False) rather than counted. Any other unhashable cell type
    that makes ``nunique()`` raise a TypeError (for example dict or set)
    is rejected the same way instead of aborting the caller.
    """
    # Sequence-valued cells are never eligible; checking first also avoids
    # the TypeError that hashing a list or ndarray would raise.
    if any(isinstance(x, (list, tuple, np.ndarray)) for x in col):
        return False
    try:
        distinct = col.nunique()
    except TypeError:
        # Remaining unhashable values cannot be counted, so not eligible.
        return False
    return distinct < 15 and not col.isnull().any()

cols = df.apply(safe_unique)
low_count_cols = df.columns.to_series()[cols].to_list()

return Schema.get_table_schema(
df,
descriptions=get_descriptions_from_schema(existing_schema)
if existing_schema
else {},
descriptions=(
get_descriptions_from_schema(existing_schema) if existing_schema else {}
),
enums={x: Schema.USE_UNIQUE for x in low_count_cols},
)

0 comments on commit b113190

Please sign in to comment.