From 5416b7c444784b3d13470eee74816bc6cf9fbfea Mon Sep 17 00:00:00 2001 From: Diego Arenas Date: Mon, 23 Dec 2024 03:15:28 +0100 Subject: [PATCH] Add documentation with mkdocs --- .github/workflows/ci.yml | 22 ++++++ Pipfile | 4 ++ README.md | 67 +++++++++++++++--- docs/afe.md | 1 + docs/generate.md | 1 + docs/index.md | 146 +++++++++++++++++++++++++++++++++++++++ docs/profile.md | 1 + mkdocs.yml | 43 ++++++++++++ src/afes/afe.py | 17 +++++ 9 files changed, 291 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 docs/afe.md create mode 100644 docs/generate.md create mode 100644 docs/index.md create mode 100644 docs/profile.md create mode 100644 mkdocs.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c0fb34c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,22 @@ +name: ci +on: + push: + branches: + - main +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: 3.x + - uses: actions/cache@v2 + with: + key: ${{ github.ref }} + path: .cache + - run: pip install mkdocs-material + - run: pip install pillow cairosvg + - run: mkdocs gh-deploy --force diff --git a/Pipfile b/Pipfile index 0708dba..31baa16 100644 --- a/Pipfile +++ b/Pipfile @@ -28,6 +28,10 @@ pipfile = "*" pytest = "*" pytest-cov = "*" typing-extensions = "*" +mkdocs-material = "*" +mkdocs = "*" +mkdocstrings-python = "*" +mkdocstrings = {version = "*", extras = ["python"]} [requires] python_version = "3.10" diff --git a/README.md b/README.md index 78dd185..a1c8ff5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,8 @@ ![](https://img.shields.io/github/last-commit/darenasc/auto-fes) ![](https://img.shields.io/github/stars/darenasc/auto-fes?style=social) -Automated exploration of files in a folder structure to extract metadata and potential usage of information. 
+Automated exploration of files in a folder structure to extract metadata and +potential usage of information. If you have a bunch of sctructured data in plain files, this library is for you. @@ -51,8 +52,6 @@ flowchart LR ## Explore -You need to import the [auto_fe.py](code/auto_fe.py) file and call it as follows. - ```python from afes import afe @@ -64,13 +63,31 @@ df_files = afe.explore(TARGET_FOLDER) df_files ``` -Checkout the [example.py](src/example.py) file and then run it from a terminal with python as the following code, or using a Jupyter [notebook](src/notebook-example.ipynb). +The `df_files` dataframe will look like the following table, depending on the +files you plan to explore. + +``` +| | path | name | extension | size | human_readable | rows | separator | +| ---: | :------------------------------------------------ | :----------------------- | :-------- | ------: | :------------- | ----: | :-------- | +| 1 | /content/sample_data/auto_mpg.csv | auto_mpg | .csv | 20854 | 20.4 KiB | 399 | comma | +| 2 | /content/sample_data/car_evaluation.csv | car_evaluation | .csv | 51916 | 50.7 KiB | 1729 | comma | +| 3 | /content/sample_data/iris.csv | iris | .csv | 4606 | 4.5 KiB | 151 | comma | +| 4 | /content/sample_data/wine_quality.csv | wine_quality | .csv | 414831 | 405.1 KiB | 6498 | comma | +| 5 | /content/sample_data/california_housing_test.csv | california_housing_test | .csv | 301141 | 294.1 KiB | 3001 | comma | +| 6 | /content/sample_data/california_housing_train.csv | california_housing_train | .csv | 1706430 | 1.6 MiB | 17001 | comma | +``` + +Checkout the [example.py](src/example.py) file and then run it from a terminal +with python as the following code, or using a Jupyter +[notebook](src/notebook-example.ipynb). ## Generate code -Using the dataframe `df_files` generated in the explore phase, you can generate working python pandas code to be used. 
+Using the dataframe `df_files` generated in the explore phase, you can generate +working python pandas code to be used. -the function `generate()` will generate python code to load the files using `pandas`. +the function `generate()` will generate python code to load the files using +`pandas`. ```python from afes import afe @@ -83,13 +100,36 @@ df_files = afe.explore(TARGET_FOLDER) afe.generate(df_files) ``` -By default the code is printed to the standard output but also written by default to the `./code.txt` file. +The generated code will look like this: + +```bash +### Start of the code ### +import pandas as pd + +df_auto_mpg = pd.read_csv('/content/sample_data/auto_mpg.csv', sep = ',') +df_car_evaluation = pd.read_csv('/content/sample_data/car_evaluation.csv', sep = ',') +df_iris = pd.read_csv('/content/sample_data/iris.csv', sep = ',') +df_wine_quality = pd.read_csv('/content/sample_data/wine_quality.csv', sep = ',') +df_california_housing_test = pd.read_csv('/content/sample_data/california_housing_test.csv', sep = ',') +df_california_housing_train = pd.read_csv('/content/sample_data/california_housing_train.csv', sep = ',') + +### End of the code ### + +"code.txt" has the generated Python code to load the files. +``` + +By default the code is printed to the standard output but also written by +default to the `./code.txt` file. -> Note: you can replace the `.txt` extension by `.py` to make it a working Python script. +> Note: you can replace the `.txt` extension by `.py` to make it a working +> Python script. ### Profile -Using the dataframe `df_files` generated in the explore phase, the function `profile(df_files)` will automatically load and profiline the files using [ydata-profiling](https://github.com/ydataai/ydata-profiling) or [sweetviz](https://github.com/fbdesignpro/sweetviz). 
+Using the dataframe `df_files` generated in the explore phase, the function +`profile(df_files)` will automatically load and profile the files using +[ydata-profiling](https://github.com/ydataai/ydata-profiling) or +[sweetviz](https://github.com/fbdesignpro/sweetviz). ```python # Path to folder with files to be explored @@ -103,8 +143,13 @@ afe.profile(df_files, profile_tool="ydata-profiling", output_path=OUTPUT_FOLDER) afe.profile(df_files, profile_tool="sweetviz", output_path=OUTPUT_FOLDER) ``` -By default, it will process the files using `ydata-profiling` by size order starting with the smallest file. It will create the reports and export them in HTML format. It will store the reports in the same directory where the code is running or it save them in a given directory with the `output_path = ''` argument. +By default, it will process the files using `ydata-profiling` by size order +starting with the smallest file. It will create the reports and export them in +HTML format. It will store the reports in the same directory where the code is +running or save them in a given directory with the +`output_path = ''` argument. # Contributing -* Open an [issue](https://github.com/darenasc/auto-fes/issues) to request more functionalities or feedback. \ No newline at end of file +* Open an [issue](https://github.com/darenasc/auto-fes/issues) to request more + functionalities or feedback. 
\ No newline at end of file diff --git a/docs/afe.md b/docs/afe.md new file mode 100644 index 0000000..a542210 --- /dev/null +++ b/docs/afe.md @@ -0,0 +1 @@ +::: src.afes.afe \ No newline at end of file diff --git a/docs/generate.md b/docs/generate.md new file mode 100644 index 0000000..cddce6e --- /dev/null +++ b/docs/generate.md @@ -0,0 +1 @@ +::: src.afes.generate \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..d71ceac --- /dev/null +++ b/docs/index.md @@ -0,0 +1,146 @@ +# Automated File Exploration System + +Automated exploration of files in a folder structure to extract metadata and +potential usage of information. + +If you have a bunch of sctructured data in plain files, this library is for you. + +# Installation + +```bash +pip install -q git+https://github.com/darenasc/auto-fes.git +pip install -q ydata_profiling sweetviz # to make profiling tools work +``` + +## How to use it +```python +from afes import afe + +# Path to folder with files to be explored +TARGET_FOLDER = "" +OUTPUT_FOLDER = "" + +# Run exploration on the files +df_files = afe.explore(TARGET_FOLDER) + +# Generate pandas code to load the files +afe.generate(df_files) + +# Run profiling on each file +afe.profile(df_files, profile_tool="ydata-profiling", output_path=OUTPUT_FOLDER) +afe.profile(df_files, profile_tool="sweetviz", output_path=OUTPUT_FOLDER) +``` + +# What can you do with AFES + +* Explore +* Generate code +* Profile + +```mermaid +flowchart LR + Explore --> Generate + Explore --> Profile + Generate --> PandasCode + Profile --> ydata-profile@{ shape: doc } + Profile --> sweetviz@{ shape: doc } +``` + +## Explore + +```python +from afes import afe + +# Path to folder with files to be explored +TARGET_FOLDER = "" + +# Run exploration on the files +df_files = afe.explore(TARGET_FOLDER) +df_files +``` + +The `df_files` dataframe will look like the following table, depending on the +files you plan to explore. 
+ +``` +| | path | name | extension | size | human_readable | rows | separator | +| ---: | :------------------------------------------------ | :----------------------- | :-------- | ------: | :------------- | ----: | :-------- | +| 1 | /content/sample_data/auto_mpg.csv | auto_mpg | .csv | 20854 | 20.4 KiB | 399 | comma | +| 2 | /content/sample_data/car_evaluation.csv | car_evaluation | .csv | 51916 | 50.7 KiB | 1729 | comma | +| 3 | /content/sample_data/iris.csv | iris | .csv | 4606 | 4.5 KiB | 151 | comma | +| 4 | /content/sample_data/wine_quality.csv | wine_quality | .csv | 414831 | 405.1 KiB | 6498 | comma | +| 5 | /content/sample_data/california_housing_test.csv | california_housing_test | .csv | 301141 | 294.1 KiB | 3001 | comma | +| 6 | /content/sample_data/california_housing_train.csv | california_housing_train | .csv | 1706430 | 1.6 MiB | 17001 | comma | +``` + +## Generate code + +Using the dataframe `df_files` generated in the explore phase, you can generate +working python pandas code to be used. + +the function `generate()` will generate python code to load the files using +`pandas`. 
+ +```python +from afes import afe + +# Path to folder with files to be explored +TARGET_FOLDER = "" +OUTPUT_FOLDER = "" + +df_files = afe.explore(TARGET_FOLDER) +afe.generate(df_files) +``` + +The generated code will look like this: + +```bash +### Start of the code ### +import pandas as pd + +df_auto_mpg = pd.read_csv('/content/sample_data/auto_mpg.csv', sep = ',') +df_car_evaluation = pd.read_csv('/content/sample_data/car_evaluation.csv', sep = ',') +df_iris = pd.read_csv('/content/sample_data/iris.csv', sep = ',') +df_wine_quality = pd.read_csv('/content/sample_data/wine_quality.csv', sep = ',') +df_california_housing_test = pd.read_csv('/content/sample_data/california_housing_test.csv', sep = ',') +df_california_housing_train = pd.read_csv('/content/sample_data/california_housing_train.csv', sep = ',') + +### End of the code ### + +"code.txt" has the generated Python code to load the files. +``` + +By default the code is printed to the standard output but also written by +default to the `./code.txt` file. + +> Note: you can replace the `.txt` extension by `.py` to make it a working +> Python script. + +### Profile + +Using the dataframe `df_files` generated in the explore phase, the function +`profile(df_files)` will automatically load and profiline the files using +[ydata-profiling](https://github.com/ydataai/ydata-profiling) or +[sweetviz](https://github.com/fbdesignpro/sweetviz). + +```python +# Path to folder with files to be explored +TARGET_FOLDER = "" +OUTPUT_FOLDER = "" + +# Run exploration on the files +df_files = afe.explore(TARGET_FOLDER) + +afe.profile(df_files, profile_tool="ydata-profiling", output_path=OUTPUT_FOLDER) # or +afe.profile(df_files, profile_tool="sweetviz", output_path=OUTPUT_FOLDER) +``` + +By default, it will process the files using `ydata-profiling` by size order +starting with the smallest file. It will create the reports and export them in +HTML format. 
It will store the reports in the same directory where the code is +running or save them in a given directory with the +`output_path = ''` argument. + +# Contributing + +* Open an [issue](https://github.com/darenasc/auto-fes/issues) to request more functionalities or feedback. \ No newline at end of file diff --git a/docs/profile.md b/docs/profile.md new file mode 100644 index 0000000..f32e384 --- /dev/null +++ b/docs/profile.md @@ -0,0 +1 @@ +::: src.afes.profile \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..e5ffbc0 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,43 @@ +site_name: Auto File Exploration System + +theme: + name: "material" + features: + # - navigation.tabs + # - navigation.sections + # - toc.integrate + # - navigation.top + - search.suggest + # - search.highlight + # - content.tabs.link + - content.code.annotation + - content.code.copy + language: en + +plugins: + - search + - mkdocstrings + +extra: + social: + - icon: fontawesome/brands/github-alt + link: https://github.com/darenasc + - icon: fontawesome/brands/twitter + link: https://twitter.com/darenasc + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/in/darenasc/ +# markdown_extensions: +# - pymdownx.highlight: +# anchor_linenums: true +# - pymdownx.inlinehilite +# - pymdownx.snippets +# - admonition +# - pymdownx.arithmatex: +# generic: true +# - footnotes +# - pymdownx.details +# - pymdownx.superfences +# - pymdownx.mark +# - attr_list +copyright: | + © 2024 Diego Arenas diff --git a/src/afes/afe.py b/src/afes/afe.py index 244ad83..f0cbeb5 100644 --- a/src/afes/afe.py +++ b/src/afes/afe.py @@ -117,6 +117,14 @@ def generate( python_file: str = "code.txt", verbose: bool = True, ): + """Generate pandas code to load the files. + + Args: + df (pd.DataFrame): DataFrame with the explored files. + python_file (str, optional): Name of the file to save the code. + Defaults to "code.txt". 
+ verbose (bool, optional): Flag to print the code. Defaults to True. + """ generate_pandas_code(df, python_file=python_file, verbose=verbose) @@ -125,6 +133,15 @@ def profile( output_path: str | Path = ".", profile_tool: str = "ydata-profiling", ): + """Profile the structured data. + + Args: + df (pd.DataFrame): DataFrame with the files to be profiled. + output_path (str | Path, optional): Folder to save the HTML reports. + Defaults to ".". + profile_tool (str, optional): Select which profiling tool to use. + Defaults to "ydata-profiling". + """ output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) df.sort_values(by="size", inplace=True)