diff --git a/AUTHORS.rst b/AUTHORS.rst index abb15a5bd..a43bf0eb4 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,4 +2,4 @@ Contributors ============ -* Markus Binsteiner +* Markus Binsteiner diff --git a/dev/dev.ipynb b/dev/dev.ipynb index 6c8eac33c..814dfdde1 100644 --- a/dev/dev.ipynb +++ b/dev/dev.ipynb @@ -64,8 +64,8 @@ "outputs": [ { "data": { - "text/plain": " \n \u001B[3m \u001B[0m\u001B[3mDescription \u001B[0m\u001B[3m \u001B[0m Import a local folder and its metadata. \n \n \u001B[3m \u001B[0m\u001B[3mOrigin \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mAuthors\u001B[0m\u001B[3m \u001B[0m Markus Binsteiner (markus.binsteiner@uni.lu) \n \n \n \u001B[3m \u001B[0m\u001B[3mType context \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mTags \u001B[0m\u001B[3m \u001B[0m core, onboarding \n \u001B[3m \u001B[0m\u001B[3mLabels \u001B[0m\u001B[3m \u001B[0m \u001B[3mpackage\u001B[0m: kiara_modules.core \n \u001B[3m \u001B[0m\u001B[3mReferences\u001B[0m\u001B[3m \u001B[0m \u001B[3msource_repo\u001B[0m: \n https://github.com/DHARPA-Project/kiara_modules.core \n \u001B[3mdocumentation\u001B[0m: https://dharpa.org/kiara_modules.core/ \n \u001B[3mmodule_doc\u001B[0m: \n https://dharpa.org/kiara_modules.core/modules_list.html… \n \n \n \u001B[3m \u001B[0m\u001B[3mPython class \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mclass_name \u001B[0m\u001B[3m \u001B[0m ImportFolderModule \n \u001B[3m \u001B[0m\u001B[3mmodule_name\u001B[0m\u001B[3m \u001B[0m kiara_modules.core.onboarding.folder \n \u001B[3m \u001B[0m\u001B[3mfull_name \u001B[0m\u001B[3m \u001B[0m kiara_modules.core.onboarding.folder.ImportFolderModule \n \n \n \u001B[3m \u001B[0m\u001B[3mConfiguration\u001B[0m\u001B[3m \u001B[0m \u001B[38;2;248;248;242;49m{\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"constants\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n 
\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"defaults\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"source_is_immutable\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;102;217;239;49mfalse\u001B[0m \n \u001B[38;2;248;248;242;49m}\u001B[0m \n \n \u001B[3m \u001B[0m\u001B[3mInputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mRequired\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDefault \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mpath \u001B[0m\u001B[3m \u001B[0m string The path to \u001B[1myes\u001B[0m -- no default \n the folder. -- \n \u001B[3m \u001B[0m\u001B[3mincluded_files\u001B[0m\u001B[3m \u001B[0m list A list of no -- no default \n strings, -- \n include all \n files where \n the filename \n ends with that \n string. \n \u001B[3m \u001B[0m\u001B[3mexcluded_dirs \u001B[0m\u001B[3m \u001B[0m list A list of no -- no default \n strings, -- \n exclude all \n folders whose \n name ends with \n that string. \n \n \n \u001B[3m \u001B[0m\u001B[3mOutputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mfile_bundle\u001B[0m\u001B[3m \u001B[0m file_bundle The collection of files contained in the \n bundle. 
\n \n \n \u001B[3m \u001B[0m\u001B[3mSource code \u001B[0m\u001B[3m \u001B[0m def process(self, inputs: ValueSet, outputs: ValueSet) -> None: \n \n path = inputs.get_value_data(\"path\") \n \n included_files = inputs.get_value_data(\"included_files\") \n excluded_dirs = inputs.get_value_data(\"excluded_dirs\") \n \n import_config = FolderImportConfig( \n include_files=included_files, exclude_dirs=excluded_dirs \n ) \n \n bundle = FileBundleMetadata.import_folder( \n source=path, import_config=import_config \n ) \n \n outputs.set_values(file_bundle=bundle) \n \n \n", - "text/html": "
                                                                                             \n  Description     Import a local folder and its metadata.                                    \n                                                                                             \n  Origin                                                                                     \n                    Authors   Markus Binsteiner (markus.binsteiner@uni.lu)                   \n                                                                                             \n                                                                                             \n  Type context                                                                               \n                    Tags         core, onboarding                                            \n                    Labels       package: kiara_modules.core                                 \n                    References   source_repo:                                                \n                                 https://github.com/DHARPA-Project/kiara_modules.core        \n                                 documentation: https://dharpa.org/kiara_modules.core/       \n                                 module_doc:                                                 \n                                 https://dharpa.org/kiara_modules.core/modules_list.html…    \n                                                                                             \n                                                                                             \n  Python class                                                                               \n                    class_name    ImportFolderModule                                         \n                    module_name   kiara_modules.core.onboarding.folder                       \n                    full_name     kiara_modules.core.onboarding.folder.ImportFolderModule    \n     
                                                                                        \n                                                                                             \n  Configuration   {                                                                          \n                    \"constants\": {},                                                         \n                    \"defaults\": {},                                                          \n                    \"source_is_immutable\": false                                             \n                  }                                                                          \n                                                                                             \n  Inputs                                                                                     \n                    Field name       Type     Description      Required   Default            \n                   ───────────────────────────────────────────────────────────────────────   \n                    path             string   The path to      yes        -- no default      \n                                              the folder.                 --                 \n                    included_files   list     A list of        no         -- no default      \n                                              strings,                    --                 \n                                              include all                                    \n                                              files where                                    \n                                              the filename                                   \n                                              ends with that                                 \n                                              string.                                        
\n                    excluded_dirs    list     A list of        no         -- no default      \n                                              strings,                    --                 \n                                              exclude all                                    \n                                              folders whose                                  \n                                              name ends with                                 \n                                              that string.                                   \n                                                                                             \n                                                                                             \n  Outputs                                                                                    \n                    Field name    Type          Description                                  \n                   ───────────────────────────────────────────────────────────────────────   \n                    file_bundle   file_bundle   The collection of files contained in the     \n                                                bundle.                                      
\n                                                                                             \n                                                                                             \n  Source code     def process(self, inputs: ValueSet, outputs: ValueSet) -> None:            \n                                                                                             \n                      path = inputs.get_value_data(\"path\")                                   \n                                                                                             \n                      included_files = inputs.get_value_data(\"included_files\")               \n                      excluded_dirs = inputs.get_value_data(\"excluded_dirs\")                 \n                                                                                             \n                      import_config = FolderImportConfig(                                    \n                          include_files=included_files, exclude_dirs=excluded_dirs           \n                      )                                                                      \n                                                                                             \n                      bundle = FileBundleMetadata.import_folder(                             \n                          source=path, import_config=import_config                           \n                      )                                                                      \n                                                                                             \n                      outputs.set_values(file_bundle=bundle)                                 \n                                                                                             \n                                                                                             \n
\n" + "text/plain": " \n \u001B[3m \u001B[0m\u001B[3mDescription \u001B[0m\u001B[3m \u001B[0m Import a local folder and its metadata. \n \n \u001B[3m \u001B[0m\u001B[3mOrigin \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mAuthors\u001B[0m\u001B[3m \u001B[0m Markus Binsteiner (markus@frkl.io) \n \n \n \u001B[3m \u001B[0m\u001B[3mType context \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mTags \u001B[0m\u001B[3m \u001B[0m core, onboarding \n \u001B[3m \u001B[0m\u001B[3mLabels \u001B[0m\u001B[3m \u001B[0m \u001B[3mpackage\u001B[0m: kiara_modules.core \n \u001B[3m \u001B[0m\u001B[3mReferences\u001B[0m\u001B[3m \u001B[0m \u001B[3msource_repo\u001B[0m: \n https://github.com/DHARPA-Project/kiara_modules.core \n \u001B[3mdocumentation\u001B[0m: https://dharpa.org/kiara_modules.core/ \n \u001B[3mmodule_doc\u001B[0m: \n https://dharpa.org/kiara_modules.core/modules_list.html… \n \n \n \u001B[3m \u001B[0m\u001B[3mPython class \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mclass_name \u001B[0m\u001B[3m \u001B[0m ImportFolderModule \n \u001B[3m \u001B[0m\u001B[3mmodule_name\u001B[0m\u001B[3m \u001B[0m kiara_modules.core.onboarding.folder \n \u001B[3m \u001B[0m\u001B[3mfull_name \u001B[0m\u001B[3m \u001B[0m kiara_modules.core.onboarding.folder.ImportFolderModule \n \n \n \u001B[3m \u001B[0m\u001B[3mConfiguration\u001B[0m\u001B[3m \u001B[0m \u001B[38;2;248;248;242;49m{\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"constants\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"defaults\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"source_is_immutable\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m 
\u001B[0m\u001B[38;2;102;217;239;49mfalse\u001B[0m \n \u001B[38;2;248;248;242;49m}\u001B[0m \n \n \u001B[3m \u001B[0m\u001B[3mInputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mRequired\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDefault \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mpath \u001B[0m\u001B[3m \u001B[0m string The path to \u001B[1myes\u001B[0m -- no default \n the folder. -- \n \u001B[3m \u001B[0m\u001B[3mincluded_files\u001B[0m\u001B[3m \u001B[0m list A list of no -- no default \n strings, -- \n include all \n files where \n the filename \n ends with that \n string. \n \u001B[3m \u001B[0m\u001B[3mexcluded_dirs \u001B[0m\u001B[3m \u001B[0m list A list of no -- no default \n strings, -- \n exclude all \n folders whose \n name ends with \n that string. \n \n \n \u001B[3m \u001B[0m\u001B[3mOutputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mfile_bundle\u001B[0m\u001B[3m \u001B[0m file_bundle The collection of files contained in the \n bundle. 
\n \n \n \u001B[3m \u001B[0m\u001B[3mSource code \u001B[0m\u001B[3m \u001B[0m def process(self, inputs: ValueSet, outputs: ValueSet) -> None: \n \n path = inputs.get_value_data(\"path\") \n \n included_files = inputs.get_value_data(\"included_files\") \n excluded_dirs = inputs.get_value_data(\"excluded_dirs\") \n \n import_config = FolderImportConfig( \n include_files=included_files, exclude_dirs=excluded_dirs \n ) \n \n bundle = FileBundleMetadata.import_folder( \n source=path, import_config=import_config \n ) \n \n outputs.set_values(file_bundle=bundle) \n \n \n", + "text/html": "
                                                                                             \n  Description     Import a local folder and its metadata.                                    \n                                                                                             \n  Origin                                                                                     \n                    Authors   Markus Binsteiner (markus@frkl.io)                   \n                                                                                             \n                                                                                             \n  Type context                                                                               \n                    Tags         core, onboarding                                            \n                    Labels       package: kiara_modules.core                                 \n                    References   source_repo:                                                \n                                 https://github.com/DHARPA-Project/kiara_modules.core        \n                                 documentation: https://dharpa.org/kiara_modules.core/       \n                                 module_doc:                                                 \n                                 https://dharpa.org/kiara_modules.core/modules_list.html…    \n                                                                                             \n                                                                                             \n  Python class                                                                               \n                    class_name    ImportFolderModule                                         \n                    module_name   kiara_modules.core.onboarding.folder                       \n                    full_name     kiara_modules.core.onboarding.folder.ImportFolderModule    \n               
                                                                              \n                                                                                             \n  Configuration   {                                                                          \n                    \"constants\": {},                                                         \n                    \"defaults\": {},                                                          \n                    \"source_is_immutable\": false                                             \n                  }                                                                          \n                                                                                             \n  Inputs                                                                                     \n                    Field name       Type     Description      Required   Default            \n                   ───────────────────────────────────────────────────────────────────────   \n                    path             string   The path to      yes        -- no default      \n                                              the folder.                 --                 \n                    included_files   list     A list of        no         -- no default      \n                                              strings,                    --                 \n                                              include all                                    \n                                              files where                                    \n                                              the filename                                   \n                                              ends with that                                 \n                                              string.                                        
\n                    excluded_dirs    list     A list of        no         -- no default      \n                                              strings,                    --                 \n                                              exclude all                                    \n                                              folders whose                                  \n                                              name ends with                                 \n                                              that string.                                   \n                                                                                             \n                                                                                             \n  Outputs                                                                                    \n                    Field name    Type          Description                                  \n                   ───────────────────────────────────────────────────────────────────────   \n                    file_bundle   file_bundle   The collection of files contained in the     \n                                                bundle.                                      
\n                                                                                             \n                                                                                             \n  Source code     def process(self, inputs: ValueSet, outputs: ValueSet) -> None:            \n                                                                                             \n                      path = inputs.get_value_data(\"path\")                                   \n                                                                                             \n                      included_files = inputs.get_value_data(\"included_files\")               \n                      excluded_dirs = inputs.get_value_data(\"excluded_dirs\")                 \n                                                                                             \n                      import_config = FolderImportConfig(                                    \n                          include_files=included_files, exclude_dirs=excluded_dirs           \n                      )                                                                      \n                                                                                             \n                      bundle = FileBundleMetadata.import_folder(                             \n                          source=path, import_config=import_config                           \n                      )                                                                      \n                                                                                             \n                      outputs.set_values(file_bundle=bundle)                                 \n                                                                                             \n                                                                                             \n
\n" }, "execution_count": 2, "metadata": {}, @@ -103,8 +103,8 @@ "outputs": [ { "data": { - "text/plain": " \n \u001B[3m \u001B[0m\u001B[3mDescription \u001B[0m\u001B[3m \u001B[0m Create a table from a 'file_bundle'. \n \n \u001B[3m \u001B[0m\u001B[3mOrigin \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mAuthors\u001B[0m\u001B[3m \u001B[0m Markus Binsteiner (markus.binsteiner@uni.lu) \n \n \n \u001B[3m \u001B[0m\u001B[3mType context \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mTags \u001B[0m\u001B[3m \u001B[0m core, onboarding \n \u001B[3m \u001B[0m\u001B[3mLabels \u001B[0m\u001B[3m \u001B[0m \u001B[3mpackage\u001B[0m: kiara_modules.core \n \u001B[3m \u001B[0m\u001B[3mReferences\u001B[0m\u001B[3m \u001B[0m \u001B[3msource_repo\u001B[0m: \n https://github.com/DHARPA-Project/kiara_modules.core \n \u001B[3mdocumentation\u001B[0m: https://dharpa.org/kiara_modules.core/ \n \u001B[3mmodule_doc\u001B[0m: \n https://dharpa.org/kiara_modules.core/modules_list.html… \n \n \n \u001B[3m \u001B[0m\u001B[3mPython class \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mclass_name \u001B[0m\u001B[3m \u001B[0m CreateTableFromTextFilesModule \n \u001B[3m \u001B[0m\u001B[3mmodule_name\u001B[0m\u001B[3m \u001B[0m kiara_modules.core.table \n \u001B[3m \u001B[0m\u001B[3mfull_name \u001B[0m\u001B[3m \u001B[0m kiara_modules.core.table.CreateTableFromTextFilesModule \n \n \n \u001B[3m \u001B[0m\u001B[3mConfiguration\u001B[0m\u001B[3m \u001B[0m \u001B[38;2;248;248;242;49m{\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"constants\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"defaults\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n \u001B[38;2;248;248;242;49m 
\u001B[0m\u001B[38;2;249;38;114;49m\"columns\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m[\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"id\"\u001B[0m\u001B[38;2;248;248;242;49m,\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"rel_path\"\u001B[0m\u001B[38;2;248;248;242;49m,\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"file_name\"\u001B[0m\u001B[38;2;248;248;242;49m,\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"content\"\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m]\u001B[0m \n \u001B[38;2;248;248;242;49m}\u001B[0m \n \n \u001B[3m \u001B[0m\u001B[3mInputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mRequired\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDefault \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mfiles \u001B[0m\u001B[3m \u001B[0m file_bundle The files to \u001B[1myes\u001B[0m -- no default \n use for the -- \n table. \n \n \n \u001B[3m \u001B[0m\u001B[3mOutputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mtable \u001B[0m\u001B[3m \u001B[0m table A table with the index column 'id', a column \n 'rel_path' that indicates the relative path of \n the file in the bundle, and a column 'content' \n that holds the (text) content of every file. 
\n \n \n \u001B[3m \u001B[0m\u001B[3mSource code \u001B[0m\u001B[3m \u001B[0m def process(self, inputs: ValueSet, outputs: ValueSet) -> None: \n \n bundle: FileBundleMetadata = inputs.get_value_data(\"files\") \n \n columns = self.get_config_value(\"columns\") \n if not columns: \n columns = DEFAULT_COLUMNS \n \n if \"content\" in columns: \n file_dict = bundle.read_text_file_contents() \n else: \n file_dict = {} \n for rel_path in bundle.included_files.keys(): \n file_dict = None # type: ignore \n \n tabular: typing.Dict] = {} \n for column in columns: \n for index, rel_path in enumerate(sorted(file_dict.keys())): \n \n if column == \"content\": \n value: typing.Any = file_dict \n elif column == \"id\": \n value = index \n elif column == \"rel_path\": \n value = rel_path \n else: \n file_model = bundle.included_files \n value = getattr(file_model, column) \n \n tabular.setdefault(column, []).append(value) \n \n table = pa.Table.from_pydict(tabular) \n \n outputs.set_value(\"table\", table) \n \n \n", - "text/html": "
                                                                                             \n  Description     Create a table from a 'file_bundle'.                                       \n                                                                                             \n  Origin                                                                                     \n                    Authors   Markus Binsteiner (markus.binsteiner@uni.lu)                   \n                                                                                             \n                                                                                             \n  Type context                                                                               \n                    Tags         core, onboarding                                            \n                    Labels       package: kiara_modules.core                                 \n                    References   source_repo:                                                \n                                 https://github.com/DHARPA-Project/kiara_modules.core        \n                                 documentation: https://dharpa.org/kiara_modules.core/       \n                                 module_doc:                                                 \n                                 https://dharpa.org/kiara_modules.core/modules_list.html…    \n                                                                                             \n                                                                                             \n  Python class                                                                               \n                    class_name    CreateTableFromTextFilesModule                             \n                    module_name   kiara_modules.core.table                                   \n                    full_name     kiara_modules.core.table.CreateTableFromTextFilesModule    \n     
                                                                                        \n                                                                                             \n  Configuration   {                                                                          \n                    \"constants\": {},                                                         \n                    \"defaults\": {},                                                          \n                    \"columns\": [                                                             \n                      \"id\",                                                                  \n                      \"rel_path\",                                                            \n                      \"file_name\",                                                           \n                      \"content\"                                                              \n                    ]                                                                        \n                  }                                                                          \n                                                                                             \n  Inputs                                                                                     \n                    Field name   Type          Description      Required   Default           \n                   ───────────────────────────────────────────────────────────────────────   \n                    files        file_bundle   The files to     yes        -- no default     \n                                               use for the                 --                \n                                               table.                                        
\n                                                                                             \n                                                                                             \n  Outputs                                                                                    \n                    Field name   Type    Description                                         \n                   ───────────────────────────────────────────────────────────────────────   \n                    table        table   A table with the index column 'id', a column        \n                                         'rel_path' that indicates the relative path of      \n                                         the file in the bundle, and a column 'content'      \n                                         that holds the (text) content of every file.        \n                                                                                             \n                                                                                             \n  Source code     def process(self, inputs: ValueSet, outputs: ValueSet) -> None:            \n                                                                                             \n                      bundle: FileBundleMetadata = inputs.get_value_data(\"files\")            \n                                                                                             \n                      columns = self.get_config_value(\"columns\")                             \n                      if not columns:                                                        \n                          columns = DEFAULT_COLUMNS                                          \n                                                                                             \n                      if \"content\" in columns:                                               \n                          file_dict = bundle.read_text_file_contents()                      
 \n                      else:                                                                  \n                          file_dict = {}                                                     \n                          for rel_path in bundle.included_files.keys():                      \n                              file_dict = None  # type: ignore                               \n                                                                                             \n                      tabular: typing.Dict] = {}                                             \n                      for column in columns:                                                 \n                          for index, rel_path in enumerate(sorted(file_dict.keys())):        \n                                                                                             \n                              if column == \"content\":                                        \n                                  value: typing.Any = file_dict                              \n                              elif column == \"id\":                                           \n                                  value = index                                              \n                              elif column == \"rel_path\":                                     \n                                  value = rel_path                                           \n                              else:                                                          \n                                  file_model = bundle.included_files                         \n                                  value = getattr(file_model, column)                        \n                                                                                             \n                              tabular.setdefault(column, []).append(value)                   \n                                                                                           
  \n                      table = pa.Table.from_pydict(tabular)                                  \n                                                                                             \n                      outputs.set_value(\"table\", table)                                      \n                                                                                             \n                                                                                             \n
\n" + "text/plain": " \n \u001B[3m \u001B[0m\u001B[3mDescription \u001B[0m\u001B[3m \u001B[0m Create a table from a 'file_bundle'. \n \n \u001B[3m \u001B[0m\u001B[3mOrigin \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mAuthors\u001B[0m\u001B[3m \u001B[0m Markus Binsteiner (markus@frkl.io) \n \n \n \u001B[3m \u001B[0m\u001B[3mType context \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mTags \u001B[0m\u001B[3m \u001B[0m core, onboarding \n \u001B[3m \u001B[0m\u001B[3mLabels \u001B[0m\u001B[3m \u001B[0m \u001B[3mpackage\u001B[0m: kiara_modules.core \n \u001B[3m \u001B[0m\u001B[3mReferences\u001B[0m\u001B[3m \u001B[0m \u001B[3msource_repo\u001B[0m: \n https://github.com/DHARPA-Project/kiara_modules.core \n \u001B[3mdocumentation\u001B[0m: https://dharpa.org/kiara_modules.core/ \n \u001B[3mmodule_doc\u001B[0m: \n https://dharpa.org/kiara_modules.core/modules_list.html… \n \n \n \u001B[3m \u001B[0m\u001B[3mPython class \u001B[0m\u001B[3m \u001B[0m \n \u001B[3m \u001B[0m\u001B[3mclass_name \u001B[0m\u001B[3m \u001B[0m CreateTableFromTextFilesModule \n \u001B[3m \u001B[0m\u001B[3mmodule_name\u001B[0m\u001B[3m \u001B[0m kiara_modules.core.table \n \u001B[3m \u001B[0m\u001B[3mfull_name \u001B[0m\u001B[3m \u001B[0m kiara_modules.core.table.CreateTableFromTextFilesModule \n \n \n \u001B[3m \u001B[0m\u001B[3mConfiguration\u001B[0m\u001B[3m \u001B[0m \u001B[38;2;248;248;242;49m{\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"constants\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"defaults\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m{},\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;249;38;114;49m\"columns\"\u001B[0m\u001B[38;2;248;248;242;49m:\u001B[0m\u001B[38;2;248;248;242;49m 
\u001B[0m\u001B[38;2;248;248;242;49m[\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"id\"\u001B[0m\u001B[38;2;248;248;242;49m,\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"rel_path\"\u001B[0m\u001B[38;2;248;248;242;49m,\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"file_name\"\u001B[0m\u001B[38;2;248;248;242;49m,\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;230;219;116;49m\"content\"\u001B[0m \n \u001B[38;2;248;248;242;49m \u001B[0m\u001B[38;2;248;248;242;49m]\u001B[0m \n \u001B[38;2;248;248;242;49m}\u001B[0m \n \n \u001B[3m \u001B[0m\u001B[3mInputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mRequired\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDefault \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mfiles \u001B[0m\u001B[3m \u001B[0m file_bundle The files to \u001B[1myes\u001B[0m -- no default \n use for the -- \n table. \n \n \n \u001B[3m \u001B[0m\u001B[3mOutputs \u001B[0m\u001B[3m \u001B[0m \n \u001B[1m \u001B[0m\u001B[1mField name\u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mType \u001B[0m\u001B[1m \u001B[0m \u001B[1m \u001B[0m\u001B[1mDescription \u001B[0m\u001B[1m \u001B[0m \n ─────────────────────────────────────────────────────────────────────── \n \u001B[3m \u001B[0m\u001B[3mtable \u001B[0m\u001B[3m \u001B[0m table A table with the index column 'id', a column \n 'rel_path' that indicates the relative path of \n the file in the bundle, and a column 'content' \n that holds the (text) content of every file. 
\n \n \n \u001B[3m \u001B[0m\u001B[3mSource code \u001B[0m\u001B[3m \u001B[0m def process(self, inputs: ValueSet, outputs: ValueSet) -> None: \n \n bundle: FileBundleMetadata = inputs.get_value_data(\"files\") \n \n columns = self.get_config_value(\"columns\") \n if not columns: \n columns = DEFAULT_COLUMNS \n \n if \"content\" in columns: \n file_dict = bundle.read_text_file_contents() \n else: \n file_dict = {} \n for rel_path in bundle.included_files.keys(): \n file_dict = None # type: ignore \n \n tabular: typing.Dict] = {} \n for column in columns: \n for index, rel_path in enumerate(sorted(file_dict.keys())): \n \n if column == \"content\": \n value: typing.Any = file_dict \n elif column == \"id\": \n value = index \n elif column == \"rel_path\": \n value = rel_path \n else: \n file_model = bundle.included_files \n value = getattr(file_model, column) \n \n tabular.setdefault(column, []).append(value) \n \n table = pa.Table.from_pydict(tabular) \n \n outputs.set_value(\"table\", table) \n \n \n", + "text/html": "
                                                                                             \n  Description     Create a table from a 'file_bundle'.                                       \n                                                                                             \n  Origin                                                                                     \n                    Authors   Markus Binsteiner (markus@frkl.io)                   \n                                                                                             \n                                                                                             \n  Type context                                                                               \n                    Tags         core, onboarding                                            \n                    Labels       package: kiara_modules.core                                 \n                    References   source_repo:                                                \n                                 https://github.com/DHARPA-Project/kiara_modules.core        \n                                 documentation: https://dharpa.org/kiara_modules.core/       \n                                 module_doc:                                                 \n                                 https://dharpa.org/kiara_modules.core/modules_list.html…    \n                                                                                             \n                                                                                             \n  Python class                                                                               \n                    class_name    CreateTableFromTextFilesModule                             \n                    module_name   kiara_modules.core.table                                   \n                    full_name     kiara_modules.core.table.CreateTableFromTextFilesModule    \n               
                                                                              \n                                                                                             \n  Configuration   {                                                                          \n                    \"constants\": {},                                                         \n                    \"defaults\": {},                                                          \n                    \"columns\": [                                                             \n                      \"id\",                                                                  \n                      \"rel_path\",                                                            \n                      \"file_name\",                                                           \n                      \"content\"                                                              \n                    ]                                                                        \n                  }                                                                          \n                                                                                             \n  Inputs                                                                                     \n                    Field name   Type          Description      Required   Default           \n                   ───────────────────────────────────────────────────────────────────────   \n                    files        file_bundle   The files to     yes        -- no default     \n                                               use for the                 --                \n                                               table.                                        
\n                                                                                             \n                                                                                             \n  Outputs                                                                                    \n                    Field name   Type    Description                                         \n                   ───────────────────────────────────────────────────────────────────────   \n                    table        table   A table with the index column 'id', a column        \n                                         'rel_path' that indicates the relative path of      \n                                         the file in the bundle, and a column 'content'      \n                                         that holds the (text) content of every file.        \n                                                                                             \n                                                                                             \n  Source code     def process(self, inputs: ValueSet, outputs: ValueSet) -> None:            \n                                                                                             \n                      bundle: FileBundleMetadata = inputs.get_value_data(\"files\")            \n                                                                                             \n                      columns = self.get_config_value(\"columns\")                             \n                      if not columns:                                                        \n                          columns = DEFAULT_COLUMNS                                          \n                                                                                             \n                      if \"content\" in columns:                                               \n                          file_dict = bundle.read_text_file_contents()                      
 \n                      else:                                                                  \n                          file_dict = {}                                                     \n                          for rel_path in bundle.included_files.keys():                      \n                              file_dict = None  # type: ignore                               \n                                                                                             \n                      tabular: typing.Dict] = {}                                             \n                      for column in columns:                                                 \n                          for index, rel_path in enumerate(sorted(file_dict.keys())):        \n                                                                                             \n                              if column == \"content\":                                        \n                                  value: typing.Any = file_dict                              \n                              elif column == \"id\":                                           \n                                  value = index                                              \n                              elif column == \"rel_path\":                                     \n                                  value = rel_path                                           \n                              else:                                                          \n                                  file_model = bundle.included_files                         \n                                  value = getattr(file_model, column)                        \n                                                                                             \n                              tabular.setdefault(column, []).append(value)                   \n                                                                                           
  \n                      table = pa.Table.from_pydict(tabular)                                  \n                                                                                             \n                      outputs.set_value(\"table\", table)                                      \n                                                                                             \n                                                                                             \n
\n" }, "execution_count": 3, "metadata": {}, diff --git a/docs/architecture/assumptions.md b/docs/architecture/assumptions.md new file mode 100644 index 000000000..c87801f93 --- /dev/null +++ b/docs/architecture/assumptions.md @@ -0,0 +1,32 @@ +# Assumptions & considerations + +## Core assumptions + +I consider the following assumptions a given. They are not fuelled by user stories, but are the 'minimal' requirements +that emerged after initially presenting the 'open questions', and in other discussions with Sean and the team. If any +of those assumptions are wrong, some of the conclusions below will have to be adjusted. + +- our (only) target audience (for now) are digital historians (and maybe also other digital humanity researchers) who can't code themselves +- the most important outcome of our project is for our target audience to be able to execute workflows in order to explore, explain, transform or augment their data +- we want the creation of workflows to be as easy and frictionless as possible, although not at the expense of end-user usability +- we want our product to be used by all DH researchers around the world, independent of their affiliation(s) +- collaboration/sharing of data is not a priority, most of our target audience are either individuals or, sometimes, small teams (sharing of results and sharing of workflows are different issues, and not included in this assumption) + +## Considerations around adoption + +One way to look at how to prioritize and implement some of our user stories is through the lens of ease-of-adoption: +which characteristics make our application more likely to be adopted, by a larger group of researchers? 
+ +Those ones are obvious (at least to me) -- in no particular order: + + - ease of workflow use + - ease of file-management use + - ease of installation (if there is one involved) + - whether there is a login/account creation requirement + - how well it integrates and plays with tools researchers already use day to day + - provides relevant (to them) workflows + - the cheaper to use the better (free/monthly cost/pay-per-usage) + - stability / reliability + - performance (most importantly on the compute side, but also UI) + - how easy it is to create workflows, and what skills are necessary to do that (easier creation -> more workflows) + - whether and how easy it will be to share, re-use and adapt workflows (different to sharing data) diff --git a/docs/architecture/data/.ipynb_checkpoints/data_formats-checkpoint.ipynb b/docs/architecture/data/.ipynb_checkpoints/data_formats-checkpoint.ipynb new file mode 100644 index 000000000..9996a0c4f --- /dev/null +++ b/docs/architecture/data/.ipynb_checkpoints/data_formats-checkpoint.ipynb @@ -0,0 +1,608 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from dharpa.benchmarking.data import clear_system_cache, MemoryRecorder, get_example_file\n", + "\n", + "from rich.jupyter import print\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "This document is a primer on data formats and structures, and how and why those affect our project. I have no idea about how much of this is common knowledge, and how much is news to the majority. I have noticed a few common misconceptions and assumptions about some of the topics in here, so I figured it makes sense to try to get everyone on the same page. 
I've tried to keep this simple and short, so there are some things in here that are over-simplified bordering on incorrect.\n", + "\n", + "My educated guess is that in our project we will mostly be concerned about structured, tabular data, which is why I'll be focussing on that. I might add a companion document about 'binary-blob' data later on.\n", + "\n", + "## Data serialization and storage\n", + "\n", + "- data lives in memory or on disk\n", + "- lots of 0's and 1's -- binary format\n", + "- only 'decoding' gets you a useful representation\n", + "- 'text' is just an often used encoding format" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The natural habitat of (digital) data is computer memory or on disk. Data is always stored in binary form, and there is always some sort of decoding involved to make data usable in one way or another (with the exception of booleans maybe). Even when we talk about text files (seemingly implying that those are not binary since they are 'text'), we are dealing with binary data. It's just such a standard data encoding format that tools to decode that sort of data are available everywhere. Decoding text is by no means trivial, but luckily our tools have evolved so much by now -- and we have standards like utf-8 commonly available -- that we as users hardly ever come across decoding issues anymore. At least not to the extent we used to. It still happens, and I would imagine quite a bit more in the Digital Humanities than in your everyday 'business' use-case. 
So it helps to be aware of at least the basics involved in text encoding standards, and I would recommend anyone doing any sort of programming to read up on it.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Tabular data (serialization) formats\n", + "\n", + "- serialization/deserialization\n", + "- binary or not, here I come:\n", + " - avro, protobuf, pickle\n", + " - csv, json, yaml, xml\n", + "- 'structured formats' with schema, or without:\n", + " - avro, protobuf, thrift, flatbuffers, xml\n", + " - csv, json, yaml, messagepack, pickle\n", + "- zero-copy, memory-mapping?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The (arguably) most important type of data we'll be dealing with is structured, tabular data. So it pays to think about how its digital form is represented in memory, and what issues are involved when trying to store, manipulate and transfer it.\n", + "\n", + "In our context, tabular data is always a 2 dimensional matrix (rows, columns), where each column has the same number of items, and each column contains items of the same data type (or union of data types). Tabular data can have a column that can be used as index, but that is optional. Each table can be described with a schema, which is basically a list of column names, and the data types associated with each of those columns.\n", + "\n", + "### Serialization / deserialization\n", + "\n", + "Every programming language represents its basic data types differently in memory. That's especially true for the arguably most common data type: the string. 
Also, that in-memory representation is (almost) always different to the format of the (functionally) same data when exported into a file on disk.\n", + "\n", + "This means that, if we want to export our data in our code, we need to do a step that is called 'serializing' (or 'marshalling'): we convert the data into a commonly accepted representation of a commonly accepted set of data-types. This serialization is usually expensive, computationally speaking. We want to avoid it, if at all possible, or at least always postpone it until the last possible moment, when we are sure the data won't change anymore, so we only have to do it once.\n", + "The same goes for de-serializing data, just in the other direction: we only want to do it once, then keep it in memory in our native representation (if the size of the data allows it), so we don't have to read it again. Even if the content of a file is in the OS (page) cache (which would mean we don't actually have to read the file-content from disk) we'd still have to spend the cpu-cycles for de-serialization. So, big no-no, bad data-scientist!\n", + "\n", + "### Format types\n", + "\n", + "For serialization, we have two basic options: text, and binary (let's just not nitpick and assume that this distinction makes sense).\n", + "\n", + "#### Text-based formats\n", + "\n", + "Serializing into a text-based format usually means taking all the elements our tabular data consists of, one by one, then serializing each element into its textual representation (like for example ``\"hello world\"`` for a string, ``5`` for an integer, ``true`` for a boolean in json), and then assembling one big 'meta'-string out of all those sub-elements. For csv, that might include a header-row, and adding delimiters like ',' in between the elements. 
For json it would be adding list ('``[``', '``]``') or dictionary ('``{``', '``}``') indicators, as well as delimiters and other elements as outlined in the JSON specification.\n", + "\n", + "I haven't done any research on it, but I'd imagine csv would be one of the oldest widely-used data storage formats. Csv is a text based tabular format, and it allows you to specify an optional header to describe column names. It allows for different delimiters between row cells (whole rows are delimited by the end-of-line special character). Other commonly used text-based formats are json, yaml, toml, xml. Those are not strictly tabular data formats, they can also contain just scalars, dictionaries, or lists (tabular data is always a list of dictionaries of the same shape).\n", + "\n", + "#### Binary formats\n", + "\n", + "Serializing into a binary format is usually very specific to the format itself, so there are not really any common tools to read more than one of them (like there are for text-based formats, where you could use any text editor and at least display the content in a meaningful way), and different formats have different priorities (like small size of the resulting blob, quick random read/write access, suitability for streaming, etc). Binary formats often have compression built-in, whereas text formats never have (but can usually be compressed well enough by external tools due to certain characteristics of encoded strings). Also, they usually are a lot easier on the cpu for serialization/deserialization purposes, since it's easier to optimize them for that scenario.\n", + "\n", + "Binary formats existed for a long time, but in recent years they are used more widely again. Decentralized software architecture (microservices) as well as 'big(-ish) data' played a huge part in that. 
Because, as it turns out, while serializing a few items of data per second into json and back is not that much of a problem, doing the same thing for millions of large (or even small) chunks of data actually is. In some cases that serialization step can take more time than the actual computation that was done on the data. To counter that issue, people came up with formats like 'Avro', 'Thrift', 'ProtoBuf'. Python's 'pickle' can also be considered a binary serialization format.\n", + "\n", + "\n", + "### Schema\n", + "\n", + "Another useful way to separate data formats is to check whether they include a (native) schema that describes the value types of the data they hold, or not. If schema information is present, it can either be included in the resulting file, or be stored elsewhere.\n", + "\n", + "Having schema information for a dataset is highly desirable, because it tells us exactly what type of data we are dealing with (is it a string, integer, float? what precision?). Whereas most text-based data formats don't include a schema definition format, there are sometimes external efforts to remedy that (JSON-schema, for example). None of the text-based formats I can think of off the top of my head include the schema in a resulting file. This is important, because the complexity of tools that handle data increases if they need to worry about secondary, 'side-car' files for incoming data.\n", + "\n", + "#### Slight detour: csv\n", + "\n", + "Csv is a bit special in that it can contain a 'header' row in the first line, which can be used to determine column names. Since this row is optional, it is not always present which of course complicates the import process. Because csv files are so common in data science, most of the tools we use include some csv-import method that more or less smartly determines the (text-)encoding of the file, whether it has a header row, as well as the schema of the included data. 
This method usually serializes the data into the appropriate internal representations of the column types after it is reasonably sure the inferred schema is correct-ish. Without a definite, (externally provided) schema it is not possible to guess this reliably in every case, so a developer should always assert the correct types are present after such an import.\n", + "\n", + "### Streaming, zero-copy, memory-mapping\n", + "\n", + "One thing that everyone working semi-seriously in data science and handling big-ish data should be aware of is that in most OS'es you can read (or 'load') data in more ways than one. The 'one' way is usually something like:\n", + "\n", + "``` python\n", + "file = open('dataset.csv')\n", + "lines = file.read() # or file.readlines()\n", + "```\n", + "\n", + "When using Pandas, it'll probably take the form of:\n", + "``` python\n", + "import pandas as pd\n", + "pd.read_csv('dataset.csv')\n", + "```\n", + "\n", + "Both of those read the whole file into memory. Which will be fine if the dataset is small or there is a need for it to be\n", + "in memory in full. Depending on the situation, it might be wasteful, though. For example when calculating the mean for a column of integers. In that case it's a better strategy to read one line of the file, process the column we are interested in, eject the line from memory, then read the next line, only keeping the current total and the number of items we processed so far. That way we'll never allocate more memory than what is needed for a single line. We can even process datasets that are larger than the available memory of our workstation.\n", + "\n", + "As a matter of fact, we could do even better if we knew the offset and length of the column we are interested in. In that case, we would only have to read the bytes that hold the integer value we need, and could ignore the other cells of a row completely. 
Again, this might or might not be an issue depending on the size of the data in a row, but if we have a dataset with a lot of large columns, the I/O operations we would not have to do by only reading the exact data we need could improve the speed of processing considerably. Doing that doesn't really work for csv files, for example, since there is no good way for us to know the exact offset or length of the column we are interested in. There are data formats that support that kind of operation though.\n", + "\n", + "Along with those fairly simple strategies to deal with data efficiently, there are more advanced ones that also deal with data and how it is handled in system memory as well as on disk. For those of you who are interested, I would recommend looking up the terms 'memory-mapping', and 'zero-copy'.\n", + "\n", + "### Some random benchmarks, to illustrate" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading the whole file                                                                    │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This iterates through all lines in memory, keeping all of them in memory at the same      │\n",
+       "│ time.                                                                                     │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function  count_lines                                                         │\n",
+       "│ ╶───────────────────┼─────────────╴                                                       │\n",
+       "│          max memory │    53.75 MB                                                         │\n",
+       "│      execution time │      144 ms                                                         │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def count_lines(path):                                                                │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     f = open(path)                                                                    │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │     length = 0  # ignores '\\n' characters                                             │ │\n",
+       "│ │     lines = f.readlines()                                                             │ │\n",
+       "│ │     for line in lines:                                                                │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │         length = length + len(line)                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return {\"no_lines\": counter, \"size\": length}                                      │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "file_path = get_example_file()\n", + "\n", + "def count_lines(path):\n", + "\n", + " f = open(path)\n", + " counter = 0\n", + " length = 0 # ignores '\\n' characters\n", + " lines = f.readlines()\n", + " for line in lines:\n", + " counter = counter + 1\n", + " length = length + len(line)\n", + "\n", + " return {\"no_lines\": counter, \"size\": length}\n", + "\n", + "profile_count_lines = MemoryRecorder.profile_func(\"Reading the whole file\", \"This iterates through all lines in memory, keeping all of them in memory at the same time.\", False, count_lines, file_path)\n", + "print(profile_count_lines.report)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading the file line by line                                                             │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This allocates only very little memory, since once a line is read and processed, it will  │\n",
+       "│ be disregarded.                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function  count_lines                                                         │\n",
+       "│ ╶───────────────────┼─────────────╴                                                       │\n",
+       "│          max memory │      0.0 MB                                                         │\n",
+       "│      execution time │      115 ms                                                         │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def count_lines(f):                                                                   │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │     length = 0  # ignores '\\n' characters                                             │ │\n",
+       "│ │     for line in f:   # when using open like we do here, it returns an iterator not a  │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │         length = length + len(line)                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return {\"no_lines\": counter, \"size\": length}                                      │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "\n", + "file_obj = open(file_path, buffering=True)\n", + "def count_lines(f):\n", + " counter = 0\n", + " length = 0 # ignores '\\n' characters\n", + " for line in f: # when using open like we do here, it returns an iterator not a materialized list\n", + " counter = counter + 1\n", + " length = length + len(line)\n", + "\n", + " return {\"no_lines\": counter, \"size\": length}\n", + "\n", + "profile_count_lines = MemoryRecorder.profile_func(\"Reading the file line by line\", \"This allocates only very little memory, since once a line is read and processed, it will be disregarded.\", False, count_lines, file_obj)\n", + "file_obj.close()\n", + "print(profile_count_lines.report)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Structured (binary) data layout strategies\n", + "\n", + "- Row-based:\n", + " - (most commonly used dbs): sqlite, Postgres, MySQL, ...\n", + " - Avro\n", + "- Column-based\n", + " - OLAP dbs: duckdb, ClickHouse, BigQuery, Snowflake ...\n", + " - pandas dataframe (well, numpy)\n", + " - parquet, feather" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "In order to use the more advanced operations on data I described earlier, the data formats we use need to support them. None of the simple formats like csv, json, yaml do. There is one category of applications that had to deal with those things for decades though: databases. So I think it pays to look a bit at how they handle storing data, and what kind of trade-offs they are making. Basically, a database is a system that lets you persist (mostly) structured data on disk, and gives you an easy, memory- and processing-efficient way to query and retrieve it back. 
To do that, they have different ways to persist data, add indexes, cache 'hot' data, and so on. As it turns out, there are 2 main ways data can be stored on disk for efficient retrieval: row-based, and column-based (I'm ignoring document/'nosql' databases here, since -- for almost all practical use-cases -- they are inferior to relational ones).\n", + "\n", + "\n", + "\n", + "### Row-oriented databases\n", + "\n", + "The most common database type is 'row-oriented'. This means that data is stored in a way so that each row represents a contiguous block of disk (or memory). Data is quick and easy to read (if you are interested in a subset of rows) and it is very easy and fast to add new rows/records. This fits the most common requirements businesses have for a database, since new items are added constantly, which is why most databases we encounter in the wild are row-based. Examples of such databases are: Postgres, sqlite, MySQL.\n", + "\n", + "### Column-oriented databases\n", + "\n", + "Column-oriented databases have existed for a long time, but they are not as prevalent as their row-based cousins, and are often ignored by developers who haven't been exposed to them and their advantages. Instead of storing data row-by-row, they store data column-by-column. This means that column-cells are laid out next to each other on disk, and different columns occupy different regions of the storage (not necessarily close to each other at all). The querying logic is quite different for this type of database; the main advantage is that a certain type of analytical query is really fast (speedups of 10x or even 100x are quite possible). Also, it is very easy to request whole columns from such a database without it having to access any irrelevant parts of the data. Compressing data is also easier with column-oriented databases, so usually those occupy less disk space than their row-based peers. 
The disadvantage of those databases is that it's harder and slower to add new data, so they are more common for situations where one either has a fixed dataset, or updates are rare, and come in big(ger) batches. Also, certain types of queries are less suited for that layout, which makes it always a good idea to think about what you need out of your data before deciding on a database/database type.\n", + "\n", + "\n", + "### Row-based/column-based in data science\n", + "\n", + "How is this relevant? Well, because in data science we are dealing mostly with fixed datasets, and the queries we do on them are mostly analytical in a way that fits column-oriented data layouts; although exceptions to that rule are not uncommon. So it makes sense to depart from the 'common wisdom' of using a row-based approach. In fact, Numpy arrays and Pandas dataframes (which depend on them) are kept in memory using the column-based approach. This is important to know, because it helps us use and query data correctly in our code. For example, it's not a good idea and very slow to add 'rows' to a Pandas dataframe. Instead, we should initialize the Dataframe with the whole dataset once at the beginning, and then only add columns to it (which is very fast), but no new rows if at all possible.\n", + "\n", + "One issue with Numpy/Pandas is that commonly a dataset is loaded into memory as a whole. There are ways around that (for example by processing a csv file in batches), but very often those are not used. In reality, it's probably not that big an issue in the DH field, since datasets seem to be smaller on average. But it is still a good idea to be as efficient as possible in this regard, esp. for our purpose, since we won't have any knowledge or guarantees in advance about the data we'll be handling (which might very well be bigger than the available memory). 
Also, since we are building an interactive application, it makes a difference whether a query comes back within a second, or ten.\n", + "\n", + "### More random benchmarks, this time with Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading a csv file with Pandas as a whole.                                                │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This is ok, as long as the dataset is not too big.                                        │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function   load_csv                                                           │\n",
+       "│ ╶───────────────────┼───────────╴                                                         │\n",
+       "│          max memory │ 63.625 MB                                                           │\n",
+       "│      execution time │    342 ms                                                           │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def load_csv(path):                                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     df = pd.read_csv(path)                                                            │ │\n",
+       "│ │     for _ in df[\"year_month\"]:                                                        │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return counter                                                                    │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "import pandas as pd\n", + "\n", + "def load_csv(path):\n", + "\n", + " counter = 0\n", + "\n", + " df = pd.read_csv(path)\n", + " for _ in df[\"year_month\"]:\n", + " counter = counter + 1\n", + "\n", + " return counter\n", + "\n", + "profile_read_csv = MemoryRecorder.profile_func(\"Reading a csv file with Pandas as a whole.\", \"This is ok, as long as the dataset is not too big.\", False, load_csv, file_path)\n", + "print(profile_read_csv.report)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading a csv file with Pandas, iterating over rows                                       │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ As one can see, this is very very slow, and not a good idea at all to do in Pandas.       │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function   load_csv                                                           │\n",
+       "│ ╶───────────────────┼───────────╴                                                         │\n",
+       "│          max memory │ 80.875 MB                                                           │\n",
+       "│      execution time │  21704 ms                                                           │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def load_csv(path):                                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     df = pd.read_csv(path)                                                            │ │\n",
+       "│ │     for index, row in df.iterrows():                                                  │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return counter                                                                    │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "import pandas as pd\n", + "\n", + "def load_csv(path):\n", + "\n", + " counter = 0\n", + "\n", + " df = pd.read_csv(path)\n", + " for index, row in df.iterrows():\n", + " counter = counter + 1\n", + "\n", + " return counter\n", + "\n", + "profile_read_csv = MemoryRecorder.profile_func(\"Reading a csv file with Pandas, iterating over rows\", \"As one can see, this is very very slow, and not a good idea at all to do in Pandas.\", False, load_csv, file_path)\n", + "print(profile_read_csv.report)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading a csv file with Pandas, in chunks.                                                │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This is a good approach when dealing with a dataset that is large, and we don't need it   │\n",
+       "│ except for a single operation on a single column. We can optimize the                     │\n",
+       "│ execution-time/memory-usage by adjusting the 'chunksize' value.                           │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function  load_csv_in_chunks                                                  │\n",
+       "│ ╶───────────────────┼────────────────────╴                                                │\n",
+       "│          max memory │       6.0390625 MB                                                  │\n",
+       "│      execution time │             892 ms                                                  │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def load_csv_in_chunks(path):                                                         │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     chunksize=1000                                                                    │ │\n",
+       "│ │     with pd.read_csv(path, chunksize=chunksize) as reader:                            │ │\n",
+       "│ │         for chunk_df in reader:                                                       │ │\n",
+       "│ │             for _ in chunk_df[\"year_month\"]:                                          │ │\n",
+       "│ │                 counter = counter + 1                                                 │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return counter                                                                    │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def load_csv_in_chunks(path):\n", + "\n", + " counter = 0\n", + "\n", + " chunksize=1000\n", + " with pd.read_csv(path, chunksize=chunksize) as reader:\n", + " for chunk_df in reader:\n", + " for _ in chunk_df[\"year_month\"]:\n", + " counter = counter + 1\n", + "\n", + " return counter\n", + "\n", + "profile_read_csv = MemoryRecorder.profile_func(\"Reading a csv file with Pandas, in chunks.\", \"This is a good approach when dealing with a dataset that is large, and we don't need it except for a single operation on a single column. We can optimize the execution-time/memory-usage by adjusting the 'chunksize' value.\", False, load_csv_in_chunks, file_path)\n", + "print(profile_read_csv.report)\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/data/.ipynb_checkpoints/requirements-checkpoint.ipynb b/docs/architecture/data/.ipynb_checkpoints/requirements-checkpoint.ipynb new file mode 100644 index 000000000..d7373199e --- /dev/null +++ b/docs/architecture/data/.ipynb_checkpoints/requirements-checkpoint.ipynb @@ -0,0 +1,936 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import dharpa\n", + "from rich.jupyter import print\n", + "from dharpa import DHARPA_TOOLBOX_DEFAULT_WORKFLOWS_FOLDER\n", + "from dharpa.data.core import schemas_to_dict\n", + "from dharpa.graphs.utils 
import graph_to_image\n", + "from dharpa.utils import get_data_from_file" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Context & Requirements\n", + "\n", + "## Types of data\n", + "\n", + "- scalars (mostly user inputs, booleans, strings, enums, numbers)\n", + "- lists of items of the same type\n", + "- tabular data (a collection of lists of items of the same type, each with the same number of items, incl. schema)\n", + "- binary data: images, videos, audio files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In our application, we'll deal with a few basic types of data:\n", + "\n", + "- scalars (mostly user inputs, booleans, strings, enums, numbers)\n", + "- lists of items of the same type\n", + "- tabular data (a collection of lists of items of the same type, each with the same number of items, incl. schema)\n", + "- binary data: images, videos, audio files\n", + "\n", + "*Sidenote*: I consider every kind of user input as data, since it is conceptually the same thing and needs to be recorded and managed the same way.\n", + "\n", + "For our purpose, we can ignore scalars because they are easy and cheap to handle, and can be attached to any sort of data or metadata in a few different ways. Also, let's ignore binary data for now, while acknowledging that we will need a strategy to deal with efficiently, in a way that is not too different from how we deal with other types.\n", + "\n", + "Which leaves us with lists and tabular data. Those are different to scalars, because there is no telling in advance how many rows they will have, and how large its cells will be (aka 'how many bytes are we dealing with, KBs, MBs, GBs, TBs?'). List (arrays) will be our main data type, along with tables (dataframes) -- the latter are really just lists of lists (including a schema/description of the type of each list). 
In a lot of cases a module will receive a table, and the output will be a list of the same length as the table. When using Pandas, we usually assign dataframes to variables, this is handy because we have access to the whole dataset via a single variable, and can access the columns separately via their names. For our case, because we will have connected modules, we will probably deal with 2 scenarios:\n", + "\n", + "- a module changes the data in a dataframe in one or several columns: this will be rare, but in this case the result of such a module will be a new dataframe\n", + "- a module adds one or several columns to a dataset: this is much more common. It doesn't make much sense to have dataframes as outputs in this case, since those would contain the same data as the input. There is no need to allocate double the amount of memory, for an exact copy of something we already have available (for read purposes). So, in those cases the output will be one or several lists, with the same amount of rows as the input dataframe. Those lists can then be easily assembled into a dataframe at a later stage, if the need arises.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "## Requirements\n", + "\n", + "Since data will be the central object our application handles, we need to decide on an internal (as well as import/export) data format. The obvious thing to do would be to use the most common format (probably json), and just use that. For several reasons (laid out in the [data_formats document](data_formats.ipynb)), I don't think this is a good idea in our case. I think we can anticipate our main requirements on a data format before writing any code, which is why I created this document: to list those requirements, and to come up with a recommendation that is based upon them." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Technical requirements\n", + "\n", + "- schema-aware (ideally included in the format)\n", + "- binary format (performance, filesize)\n", + "- column-based (for tabular data -- analytics query performance)\n", + "- zero-copy, memory-mapping\n", + "- built-in compression (preferable)\n", + "- possible to use from different programming languages (at least Python & JS)\n", + "- as little cpu, memory, and disk utilization as possible\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The first group of requirements is technical: we are creating an interactive application, which means we should at least spend some time optimizing for speed (in those instances where it's possible). In addition: the more we know about our data and its 'shape', the less complex our code has to be, since that removes the need for investigating and validating data at multiple points along its journey.\n", + "\n", + "The latter can be achieved by using a data format that is schema aware (e.g. not csv), and ideally includes that schema as metadata in its specification, so we can query the data(-bytes) directly, without having to read separate, external specifications.\n", + "\n", + "For the performance requirements, it's fairly easy to see why we should be looking for a binary, column-based format, that ideally has extra features like memory-mapping and compression.\n", + "\n", + "Last but not least we want to be able to access our data from different programming languages. 
Python and JavaScript support will be mandatory, but being able to read from Julia and R would also be highly desirable.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### General requirements\n", + "\n", + "- option to attach metadata\n", + "- versioning of datasets\n", + "- versioning of metadata\n", + "- we want to be able to treat all data the same way, independent of size, format, other characteristics\n", + "- we want all of this to be more or less transparent to our end-users!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Because it's in the nature of our application that we won't exactly know how big and what shape the data we will be dealing with will have, we have to anticipate a wide range of types and sizes. In order to not have to deal with those differently each time, it would be highly advantageous if we can come up with a standard 'interface' for our datasets, that lets us, as a minimum, query minimal required metadata (schema, authors, size), and which allows us to forward the dataset to other parts of our application (other modules, frontend), without having to convert or serialize/deserialize it.\n", + "\n", + "Most importantly, we will have to figure out a way to make most of this transparent to users. This is probably nothing a data format can help us with directly, but there might be factors in the periphery which can make this easier, or harder (e.g.: how common is that data-format, how much tooling exists for it?)\n", + "\n", + "One of our main requirements is to be able to easily attach metadata to our datasets. In addition we want it to be as easy as possible to 'version' the contained data, as well as the attached metadata. Those requirements stem from the need for good research data practices, and should not need further explanation. 
Let's look at those two points in a bit more detail:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "#### Technical metadata (automatic)\n", + " - data type\n", + " - schema (if tabular data)\n", + " - statistics on columns, rows (number of rows, min in column, max in column)\n", + " - data specific indicators/summaries (e.g. geographic range, resolution, ...)\n", + " - digest / checksum (unique id!)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The most important metadata we'll be dealing with is the type of data, and its schema in case its in tabular form. As was mentioned above, ideally this would be forced and included by/in the data format we choose, so we can rely on it to be available, always.\n", + "In addition, in a lot of cases it aids performance if certain characteristics of a dataset are known without having to actually touch it. One example would be min/max values for numeric columns. Geographic range, resolution could be interesting for location data, creation date for photos, and so on.\n", + "A special item of metadata is a checksum: that enables us to confirm the bytes of a dataset haven't changed since we last checked, and it also makes things like caching or lookups easier.\n", + "All of those metadata items can be created more or less automatically, without any user input. This is important to differentiate, because that means we don't need to worry about providing a user-interface so they can be added/attached.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + " \n", + "#### Other metadata (requires user input)\n", + " - provenance / lineage / heritage\n", + " - author(s)/creator(s) incl. 
contact info\n", + " - creation / modification date\n", + " - comments, annotations\n", + " - \"ALL THE METADATA\" (Angela R. Cunningham, PhD)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The second category of metadata is defined by the necessity for manual user input (at least in parts). Which of course means we need to somehow provide a metadata editing facility for those items. Authorship information as well as the provenance-related metadata is arguably the most important one here. But I imagine we'll come up with quite a few more metadata fields we will want to be able to attach. It's probably a good idea to talk to our colleagues who develop Zotero and Tropy for some input in this regard." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Dataset versioning\n", + "\n", + "- versioning of the 'actual' data:\n", + " - new data added\n", + " - existing data changed/fixed\n", + " - existing data removed\n", + "- metadata versioning:\n", + " - independent of actual data changes (except for last modification dates, new authors added, checksum)\n", + " - new metadata added\n", + " - existing metadata changed/fixed\n", + " - metadata removed\n", + " - no new dataset version necessary\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Data versioning is usually a bit overlooked (although that seems to be changing now, and there are some 'git for data' services and tools cropping up). But it's crucial for good data practices.\n", + "\n", + "In order to always know how result data was created, we need to know exactly which inputs were used, and what exactly was done to them. 
If any of the inputs changes, and we don't record it, then there will be confusion later, when someone tries to recreate a result with the changed input.\n", + "\n", + "This implies we have a way to point to datasets using some sort of identifier, something like a DOI -- but it does not need to be globally unique, just locally (unless the data gets shared/exported).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Contexts in which we handle data\n", + "\n", + "- 'onboarding' data:\n", + " - external data gets fed into a workflow / into our app\n", + " - we store a copy internally (to prevent changes we are not aware of)\n", + " - some minimal metadata needs to be provided (but can be at least partly determined automatically)\n", + " - gets unique id / alias(es) & checksum & version '1'\n", + "- internal data transformation & transfer:\n", + " - each module processes input data and creates output data\n", + " - output data gets fed into the input of another module\n", + " - input/output data is requested by frontend for display purposes (viz, statistics, table-view, ...)\n", + "- exporting data:\n", + " - researcher needs data in a specific format (csv, Excel, json...) for further processing, publishing, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Along with listing requirements, it makes sense to think about in which contexts we deal with data, and how. I think we can seperate three main areas:\n", + "\n", + "- data onboarding\n", + "- internal data transformation & transfer\n", + "- data export\n", + "\n", + "For the first and last items the 'interface' of the data is important, which means we are concerned about how to translate external dataformat into our internal one, as well as the other way around. 
For the second item we only deal with our internal format, so performance and code complexity are more important considerations.\n", + "\n", + "For data onboarding, one thing is important is that we store a copy of the dataset the user points us to in a space where we can be sure the data doesn't get changed by external means. We would also add some automatic metadata, and might or might not require the user to provide some basic required metadata-fields manually. We would also give a newly onboarded dataset a version '1' (or maybe '1.0').\n", + "\n", + "Data export is the least problematic area: since we have a minimal set of required metadata for every piece of data we use internally, it should be fairly trivial to export it into any viable export format (csv, excel, json, parquet,...).\n", + "\n", + "Data onboarding and export could also be combined in some scenarios: for example if we don't provide a tool to 'clean' up data (or do something else that would require a version change on the dataset) and users would have to do it externally, we could export the dataset into a temporary folder, let the user do their thing, and then re-import the changed dataset into a new version of the existing, internal one, copying the existing metadata with some additions that describe what was done to the data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Solution proposal" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Apache Arrow\n", + "\n", + "- binary, column-based, language-independent in-memory format\n", + "- well defined schema and data types, rudimentary custom metadata support\n", + "- native support for 2 on-disk formats:\n", + " - feather (same as in-memory format), parquet\n", + "- client implementations for most relevant languages\n", + "- growing ecosystem:\n", + " - Arrow Flight (fast data transport framework)\n", + " - Plasma (In-Memory object store)\n", + " - Vaex (native support for memory-mapped feather files, memory-mapped querying)\n", + " - duckdb (column-based, python-native sql engine)\n", + " - easy import/export to NumPy/Pandas types (Arrays, DataFrames) -- still some serialization cost\n", + "- likely to be the standard format for data exchange in data science/data engineering in the future" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In my research, [Apache Arrow](https://arrow.apache.org/) came closest to match our technical requirements, and should let us implement most of the other ones too. It is a binary, column based in-memory format that comes with implementations in a number of programming languages (incl. the ones we are interested in).\n", + "\n", + "From the Arrow website:\n", + "\n", + "> Apache Arrow is a software development platform for building high performance applications that process and transport large data sets. 
It is designed to both improve the performance of analytical algorithms and the efficiency of moving data from one system or programming language to another.\n", + ">\n", + "> A critical component of Apache Arrow is its in-memory columnar format, a standardized, language-agnostic specification for representing structured, table-like datasets in-memory. This data format has a rich data type system (included nested and user-defined data types) designed to support the needs of analytic database systems, data frame libraries, and more.\n", + "\n", + "In addition to the efficient in-memory format, it supports 2 on-disk formats: feather & parquet. The former one is basically the same as the in-memory format (with all the advantages that come with that), and the latter is a fairly standard format to exchange large(-ish) datasets between processes and infrastructure components.\n", + "\n", + "In my opinion (and I'm not alone), Arrow will be the de-facto standard data format for tabular data in the future, in both data science and data engineering. It is well designed, and a lot of the reasons why it came about line up fairly well with our own requirements (although, at a different scale obviously). Because of that, there is a rich tooling ecosystem growing around Apache Arrow at the moment, which I think we can expect to satisfy most of our current and future needs in the near to medium-term future, if not already.\n", + "\n", + "Esp. [vaex](https://vaex.io/) and [duckdb](https://duckdb.org/) look like very interesting developments. Pandas and Numpy import/export is very well supported, and as well optimized as seems possible. [Apache Arrow Flight](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) and the [Plasma Object store](https://arrow.apache.org/docs/python/plasma.html) look like good contenders that could handle our potential data transport needs in the future." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Identifying and versioning datasets\n", + "\n", + "- every dataset gets it's unique id (uuid) as well as one or several user-defined and automatic aliases\n", + "- a new version of a dataset is created when its data content changes (content can be entirely different)\n", + "- a user can 'designate' a new version of data, in some cases it can be done by our application automatically\n", + "- versioning of metadata is independent of dataset version\n", + "- allows us to discover 'out-of-date' results (via their connected input-ids), and recreating them with updated input dataset\n", + "- frontend must be able to list, access and query datasets/inputs/outputs via unique id & version number\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "It should be obvious that and why we need some sort of (at least internal) unique identifier for each dataset. The main scenario where users will come in touch with such an identifier is when they are asked to choose an input dataset for a module/workflow. It's possible to make that 100% transparent to the user, and let them for example select a folder of csv files, which we would then copy into our internal data repository, assign it an id, and use that for our calculation. That would mean though, that the next time the user wants to use the same dataset again, we would do the same again, basically duplicating our internal dataset. 
We probably could be smart about it, and recognize those sort of duplicates, but that would involve fairly complex and fragile code I think we should rather avoid, and come up with an interface metaphor/language that makes users aware what is going on, and which empowers them with proper tooling to manage their research data according to best practices (metadata, versioning, etc.).\n", + "\n", + "So, I propose that we should have a 'data management' section in our application UI, which could be used to both 'onboard' and manage datasets independent of a workflow, but also within the context of a workflow (for example by re-using some of the file selection widgets and filling in a newly created dataset id into a workflow input, right after onboarding). How that would look like exactly, we'd have to figure out and I think it would be a work-item on itself.\n", + "\n", + "The same goes for dataset versioning. One way I can imagine this working is to have a ``.<major>.<minor>`` postfix to our unique dataset identifier, where the ``minor`` part gets incremented with every metadata version change, and the ``major`` part for when the actual data changes. Another point to consider is whether to only use version number increases, or also have a concept of 'branching', where the versions of datasets can diverge, from a common parent. I think there is a point to be made for not making things too complicated unless really necessary, so most of this can be solved with a simple versioning scheme, and assigning totally new datasets id if something significant changes in the data of a dataset (while potentially preserving the lineage information by storing the 'parent id' in the new dataset's metadata). But, as I said above, I think this would be a good item to investigate independently." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Storing module results\n", + "\n", + "- requirements: workflow history & snapshots & long running processes\n", + "- need for caching of at least the latest results\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "This section includes a quick recapitulation of how our workflows are described and managed by the backend, as well as an outline of how to handle and store temporary as well as final workflow outputs. This is important, because having access to already computed results is necessary for some of our requirements (derived from our user-stories):\n", + " - workflow history: enable the user to move back in the history of input sets of a workflow session\n", + " - snapshots: 'tag' certain input sets (basically creating a snapshot of that particular workflow state)\n", + " - support for long running processes: a user will want to have access to computational results, even if they had other workflow sessions in between (while a particularly long running job was running)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Quick recap: workflow modularity\n", + "\n", + "Every module has:\n", + " - one or several named inputs\n", + " - one or several named outputs\n", + " - as well as schema information for each input and output\n", + "\n", + "A workflow is a user-facing entity that:\n", + " - can also be used as a module (has inputs, outputs, schema)\n", + " - contains one or several modules\n", + " - where some inputs of some (internal) modules can be connected to an output of another (internal) module\n", + " - inputs of modules that are not connected to an output of another (internal) module are user inputs" + ] + 
}, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In this example we'll use a workflow that simulates a ``nand`` logic-gate. Such a logic gate can be created by using ``and`` and ``not`` logic gates one after the other. Below you can see a short description of the modules and their inputs, as well as how that would be configured in a workflow description json file. The important part is the ``modules`` value." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### example module: ``nand``\n", + "\n", + " - consists of two other modules: \n", + " - ``and``\n", + " - inputs: ``a`` & ``b`` (booleans)\n", + " - output: ``y`` (boolean - true if both inputs are true, otherwise false)\n", + " - ``not``:\n", + " - input: ``a`` (boolean - connected to ``y`` output of ``and``)\n", + " - output: ``y`` (boolean - negated input)\n", + " - two inputs: ``a`` & ``b`` (booleans, connect directly to ``and`` inputs)\n", + " - one output: ``y`` (false if 'a' & 'b' are true, otherwise true -- connects to ``y`` output of ``not`` module)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
Module description: nand\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{\n",
+       "    'modules': [\n",
+       "        {'module_type': 'and'},\n",
+       "        {'module_type': 'not', 'input_links': {'a': 'and.y'}}\n",
+       "    ],\n",
+       "    'input_aliases': {'and__a': 'a', 'and__b': 'b'},\n",
+       "    'output_aliases': {'not__y': 'y'},\n",
+       "    'module_type_name': 'nand',\n",
+       "    'meta': {'doc': \"Returns 'True' if both inputs are 'False'.\"}\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Module description: [b]nand[/b]\")\n", + "print(get_data_from_file(os.path.join(DHARPA_TOOLBOX_DEFAULT_WORKFLOWS_FOLDER, \"logic_gates\", \"nand.json\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "After creating the workflow description file, we create the workflow object in code, and for illustration purposes, we display the execution order and the state graph of the workflow (in its inital, stale state without any inputs)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAGwAAAD7CAYAAACPBXE2AAAABmJLR0QA/wD/AP+gvaeTAAANP0lEQVR4nO2dX0xb5RvHv4e2tKWwgo38byJZAleyRIMJMHQJyp+AKSwFDHZsiRrvFrZ4odmNicm8cDpnssSwK71QcSaQsBknIeVCKIl3Ji60ZrpIFUi7za4wykb3/C74UVda5lhP6XnOnk/yXvQ5b1+e93xy3vOmvOe8ChERBDbk5ToBYXeIMGaIMGYYtweCwSBmZ2dzkYuwjaamJlRXVycHaRujo6MEQIoGyujo6HY9lHKFPSByp0PCHqAoStq43MOYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcIeoLCwEAcPHsx1Gg9FhDFDhDFDM8LGx8ehKEqi+P1+9Pf3w+FwJGLhcBgAcOPGDZw8eRL79+9Hfn4+SkpK0NnZCa/Xm9Luo9Q9c+YMFEXB6uoqZmZmEn/PaNxxBUXu2GkRTq5wuVwEgF566SXyer20urpKc3NzZDAYKBQK0eLiItXU1FBZWRlNTExQJBIhv99Phw8fJkVR6MKFC4m2dlOXiMhms1Fzc/Nedzkt2GERjmaFff/992mPHzt2jADQ119/nRSPxWJUWVlJVquVlpaWdl2XiIcwzQyJ23nhhRfSxsfGxgAAXV1dSXGz2YzW1lasra3hypUru67LBc0Ks9lsKbH19XVEIhFYLBYUFRWlHC8rKwMALC0t7aouJzQrLB1msxl2ux2xWAzRaDTl+PLyMgCgvLx8V3W32GktoJZgJQwAent7AQCXL19Oiq+vr2NqagpWqxXt7e27rgsABQUFuHv3buJzXV0dRkZGstKPx2b7TU0rk461tbW0x7fP/G7fvp008xsZGXmsukREHR0dZLfb6c8//6TZ2VkyGo109erVrPZ3J6D1WaLP50u7vjwd4XCYhoeHqaamhkwmE9ntdmpvb6epqamM6s7Pz1NLSwvZbDZyOp10/vx51fv5qGhemJDMTsLY3cOedEQYMzIW9uDvf5mU999/X4XuZAct9THjXzfpCXiOTEt9lCGRGSKMGSKMGSKMGSKMGSKM
GSKMGSKMGSKMGSKMGSKMGSKMGSKMGTv+Wv/tt9/uZR7CI7KjsIGBgb3MQ3hEFNLSP3tUor+/H4A+Rwm5hzFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDGD/QN9X3zxBc6ePYt4PJ6ILS4uAgAqKioSMYPBgBMnTuDo0aN7nqOasBcWCARQV1f3SHX9fj9qa2uznFF2YT8k1tbWor6+/qHbcCiKgvr6evayAB0IA4ChoSEYDIYdjxuNRvZD4Rbsh0QA+Pvvv+F0OnH//v20xxVFwcLCAqqqqvY4M/XRxRVWWVmJpqYm5OWldicvLw/Nzc26kAXoRBgAHDlyJG1cURQMDQ3tcTbZQxdDIgDcunULpaWl2NjYSIobDAYsLy/D4XDkKDN10c0VVlJSgra2tqTJh8FgQEdHh25kAToSBgAejydp4kFE8Hg8OcxIfXQzJALAnTt34HA4EIvFAAAWiwWhUAiFhYU5zkw9dHWFFRQUwOVywWQywWg0oqenR1eyAJ0JA4DXX38dGxsbiMfjGBwczHU6qqPK3u0+nw8LCwtqNJUx8XgcVqsVRIRoNKqZN7o5nU40NjZm3pAa2/+53e60+1dK+be43W41TrV6Wyq63W7Q5p6aOS9erxfT09M5z2OruN1utU6zOkOi1njxxRdznULW0KWwdL8p6gX99kyniDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDAA33zzDRRFgaIosFgsuU7noYgwAK+99hqICK2trblO5T8RYcwQYcwQYczQpLCNjQ2Mjo7ilVdeQXl5OaxWK5599lmcO3cuaaHo+Ph4YrKgKAquX7+OgYEBFBcXw+FwoLu7G9euXUtpf35+Hj09PbDb7bDZbGhpacFPP/20l118fDJaEfJ/3G63aotMiIgmJiYIAJ0+fZpu3rxJoVCIPvvsM8rLy6N33nknpb7L5SIA5HK5aHZ2llZWVmhycpKsVis1NDQk1f3tt9+ouLiYqqqq6Mcff6RoNEq//PILtbW10TPPPENms1m1fmyh5vnRrLBDhw6lxD0eD5lMJopEIknxLWETExMpeQGgUCiUiPX19REA+u6775Lq/vXXX2Q2mzUvTJNDYnd3N7xeb0r8wIEDuHfvHn799de032toaEj67HQ6AWw+8LfFDz/8AABob29PqltZWcnikVpNLsKJRCL4+OOPMTY2hmAwiH/++Sfp+J07d9J+z263J33Oz88HgMR9b319HdFoFBaLJe0S7tLSUgQCATW6kDU0eYW9+uqr+OCDD/DWW28hEAjg/v37ICKcPXsWwOZTKY+D2WxGUVERYrEYVlZWUo7fvHkzo7z3As0Ji8fjmJmZQXl5OY4fP46nn3468YaAtbW1jNvv7OwE8O/QuEU4HIbf78+4/WyjOWEGgwGHDh3C0tISPvroI4TDYaytrcHr9eLzzz/PuP3Tp0/jqaeewvDwMCYnJ7GysoKrV6/C4/HweNJFjZmL2rPEUChEb7/9NjmdTjKZTFRWVkbHjh2jd999N7FW/fnnnyefz5eyhv3UqVNERCnxrq6uRPt+v596enpo3759ian/pUuXqLW1NVH/jTfeUK0/ap4fVR7o6+vrAwBcvHgx06Z0iZrnR3NDovBwRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzVFvmFgwGNfNuQq0RDAZRXV2tSluqCZubm8PAwIBazekOtV7Bp6uXNG/R398PALq84uUexgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgz2D/R9+eWX+OSTTxCP
xxOxxcVFAEBFRUUiZjAYcOLECRw9enTPc1QT9sICgQDq6uoeqa7f72exv8rDYD8k1tbW4sCBA4ndI9KhKArq6+vZywJ0IAwAhoaGYDAYdjxuNBrZD4VbsB8Sgc3tppxOZ9LufQ+iKAoWFhZQVVW1x5mpjy6usMrKSjQ1NSEvL7U7eXl5aG5u1oUsQCfCAODIkSNp44qiYGhoaI+zyR66GBIB4NatWygtLcXGxkZS3GAwYHl5GQ6HI0eZqYturrCSkhK0tbUlTT4MBgM6Ojp0IwvQkTAA8Hg8SRMPIoLH48lhRuqjmyER2Nwb0+FwIBaLAQAsFgtCoRCPjdweEV1dYQUFBejt7YXJZILRaERvb6+uZAE6EwYAg4ODuHfvHjY2NjA4OJjrdFRHldfv+Xw+LCwsqNFUxsTjcRQUFICIcPv2bc280c3pdKKxsTHzhtTYl3FrR3IpOxfN7ZTudrtBRJoo09PTmJ6eznkeW0Wtl1sCGt0pPVNaWlpynULW0KWwdL8p6gX99kyniDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmPNHCCgsLcfDgwVynsSueaGEcEWHM0Kyw8fFxKIqSKNevX8fAwACKi4vhcDjQ3d2Na9eupXzvxo0bOHnyJPbv34/8/HyUlJSgs7MTXq83UefMmTNQFAWrq6uYmZlJ/A2jkcH/czNbErKJ2+1WbZHJdlwuFwEgl8tFs7OztLKyQpOTk2S1WqmhoSGp7uLiItXU1FBZWRlNTExQJBIhv99Phw8fJkVR6MKFC0n1bTYbNTc3ZyXvB1Hz/Gj2CtvOm2++icbGRthsNrz88svo6urCzz//jHA4nKjz3nvv4Y8//sCnn36K7u5u7Nu3D7W1tfjqq69QUVGB48ePY3l5OYe9yBw2whoaGpI+O51OAJsP820xNjYGAOjq6kqqazab0drairW1NVy5ciXLmWYXNsLsdnvS5/z8fABIPPywvr6OSCQCi8WCoqKilO+XlZUBAJaWlrKcaXZhI+y/MJvNsNvtiMViiEajKce3hsLy8vJE7GEPsmsV3QgDgN7eXgDA5cuXk+Lr6+uYmpqC1WpFe3t7Il5QUIC7d+8mPtfV1WFkZGRvkn1MdCXsww8/RE1NDYaHh3Hp0iVEo1EEAgEMDg5icXER586dSwyNAPDcc88hEAhgYWEBPp8Pv//+u/YXoaox1czGtN7n86WsTz916hQRUUq8q6sr8b1wOEzDw8NUU1NDJpOJ7HY7tbe309TUVMrfmJ+fp5aWFrLZbOR0Oun8+fOq9mELNc+PKg/09fX1AQAuXryYaVO6RM3zo6sh8UlAhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDtbXJwWBQM+8m1BrBYBDV1dWqtKWasLm5OQwMDKjVnO5Q6xV8unpJ85OA3MOYIcKYIcKYYQQgiwkZ8T+HpntUF0NGGgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "workflow = dharpa.create_workflow(\"nand\")\n", + "graph_to_image(workflow.structure.execution_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + 
}, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph_to_image(workflow.create_state_graph(show_structure=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Now, we set the inputs (both ``True``, which means the end-result should be ``False``). As you can see from the state graph, the workflow inputs are directly connected to the module inputs of the ``and`` module." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "processing started: nand.nand\n", + "processing started: nand.and\n", + "processing finished: nand.and\n", + "processing started: nand.not\n", + "processing finished: nand.not\n", + "processing finished: nand.nand\n" + ] + } + ], + "source": [ + "workflow.inputs.a = True\n", + "workflow.inputs.b = True\n", + "\n", + "await workflow.process()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Again, lets look at the workflow state, this time we display it using a json data structure, not a network graph:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'alias': 'nand',\n",
+       "    'address': 'nand.nand',\n",
+       "    'type': 'nand',\n",
+       "    'is_pipeline': True,\n",
+       "    'state': 'results_ready',\n",
+       "    'inputs': {\n",
+       "        'a': {'schema': {'type': 'boolean', 'default': None}, 'value': True},\n",
+       "        'b': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "    },\n",
+       "    'outputs': {'y': {'schema': {'type': 'boolean', 'default': None}, 'value': False}},\n",
+       "    'execution_stage': None,\n",
+       "    'doc': \"Returns 'True' if both inputs are 'False'.\",\n",
+       "    'pipeline_structure': {\n",
+       "        'workflow_id': 'nand',\n",
+       "        'modules': [\n",
+       "            {\n",
+       "                'module': {\n",
+       "                    'alias': 'and',\n",
+       "                    'address': 'nand.and',\n",
+       "                    'type': 'and',\n",
+       "                    'is_pipeline': False,\n",
+       "                    'state': 'results_ready',\n",
+       "                    'inputs': {\n",
+       "                        'a': {'schema': {'type': 'boolean', 'default': None}, 'value': True},\n",
+       "                        'b': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "                    },\n",
+       "                    'outputs': {\n",
+       "                        'y': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "                    },\n",
+       "                    'execution_stage': 1,\n",
+       "                    'doc': \"Returns 'True' if both inputs are 'True'.\",\n",
+       "                    'pipeline_structure': None\n",
+       "                },\n",
+       "                'input_connections': {'a': '__parent__.a', 'b': '__parent__.b'},\n",
+       "                'output_connections': {'y': ['not.a']}\n",
+       "            },\n",
+       "            {\n",
+       "                'module': {\n",
+       "                    'alias': 'not',\n",
+       "                    'address': 'nand.not',\n",
+       "                    'type': 'not',\n",
+       "                    'is_pipeline': False,\n",
+       "                    'state': 'results_ready',\n",
+       "                    'inputs': {\n",
+       "                        'a': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "                    },\n",
+       "                    'outputs': {\n",
+       "                        'y': {'schema': {'type': 'boolean', 'default': None}, 'value': False}\n",
+       "                    },\n",
+       "                    'execution_stage': 2,\n",
+       "                    'doc': 'Negates the input.',\n",
+       "                    'pipeline_structure': None\n",
+       "                },\n",
+       "                'input_connections': {'a': 'and.y'},\n",
+       "                'output_connections': {'y': ['__parent__.y']}\n",
+       "            }\n",
+       "        ],\n",
+       "        'workflow_input_connections': {'a': ['and.a'], 'b': ['and.b']},\n",
+       "        'workflow_output_connections': {'y': 'not.y'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "state = workflow.to_dict(include_structure=True)\n", + "print(state)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### How to actually deal with workflow/module outputs?\n", + "\n", + "- why not store all results?\n", + "- smart way of storing/deleting/managing storage:\n", + " - compression\n", + " - efficient module design\n", + " - cleanup process\n", + " - only store results if good execution time/result size ratio, otherwise just re-process" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To satisfy the above mentioned requirements, my current plan is to just store all results of all module runs, instead of coming up with a complicated caching scheme. There will have to be some sort of 'result-cleaning' and consolidation, but I think if we are being smart about it this might be the most promising strategy, which will introduce the least amount of complexity.\n", + "\n", + "A folder structure to accomodate that would probably look something like this:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "\n", + "- each module has its own name/id, all results for a module will be stored under same folder\n", + "- 'result.feather' has one or several columns that represent output values\n", + "- also, one column with runtime metadata (execution time, version of workflow, etc.)\n", + "- this works well with the 'dataset' API in Apache Arrow: https://arrow.apache.org/docs/python/dataset.html (which means we can lazy-load all results of a workflow/module into the same dataframe, and do 'meta'-queries and -analysis on that if we choose to)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In order to not waste too much hard-disk space (which would be the most obvious concern here), I think we have a few different options. For one, we'd store all results with compression enabled. We would implement our modules in an efficient way that is aware of how we store results. We might have a cleanup process running in the background that is aware of how often a result is accessed, and how it's compute-time/result-size ratio is. In some cases where that ratio leans very much towards result-size, we might decide to not store those results at all, but re-process every time." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Streaming module results\n", + "\n", + "TBD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "This is an area I haven't done too much work on yet, but in general: we will want to have access to intermediate results (or, rather: partial results in real-time), so we can provide the user with information they can use to determine whether to cancel a running process or not. Even though we will probably not have that functionality available in our initial, first version, I think we should anticipate that requirement, and design our data management with it in mind, so it can be added later without having to re-write a lot of code." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Default data format (for import/export)\n", + "\n", + "- every result can be described by specifying:\n", + " - the input dataset(s) and other inputs\n", + " - the workflow (and workflow version) that was used to produce it\n", + " - -> theoretically, every (result) dataset can be described by very small json file/metadata set\n", + "- proposal: invent our own (small) set of file formats (including version-nr, metadata schema, payload)\n", + " - Apache Arrow based for tabular/scalar data\n", + " - folder/zip based for binary data\n", + " - all our import modules would create files in that format\n", + " - provide tooling (and modules) to convert/export those to all common data formats\n", + " - possibility of data registries:\n", + " - very simple implementation compared to products like dataverse, ckan\n", + " - high performance data transfer (using Apache Flight)\n", + " - different levels: local (within our app), organization-wide, global (aka default registry)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The last thing to decide is whether we want to provide a 'standard' data format for our application. This will be modelled closely upon the format we will use internally, but with some added metadata fields and possibly restrictions.\n", + "\n", + "This is mostly for the purpose of sharing, transferring, and publishing data. In principle, there is a really lightweight way to share our work: since we can describe everything we do by specifying the workflow, and listing all the inputs we use with it. Assuming all inputs are either scalars or, in case of datasets, available via download, this description could be very lightweight: it's just a json file containing the workflow structure (incl. 
maybe version information), and input-data urls. With that, everyone with access to the data can in theory replicate any end- and intermediate result.\n", + "\n", + "In theory, that json structure can also be attached to every result dataset, which means that our results will always come with information on how they were produced (and how to re-produce them).\n", + "\n", + "Since all this is very dependent on being able to have access to metadata alongside the 'actual' data, and because in my experience systems and architectures that store metadata separately from data are fairly complex, specific and hard to maintain, I would propose we come up with a way to package our data in a way that allows for our metadata to always be included, and where it's easy to access both data and metadata without having to open the whole file. Arrow gets us a long way toward that (for tabular data), the only thing that is missing is a standard way to include metadata. For that we have two options: use the Arrow 'metadata' field (which is fairly limited, it only takes encoded byte-arrays as keys/values), or store our metadata in a separate column. Currently, I'm leaning toward the latter option, but this is something we'll have to try out and play with to get a better idea of how feasible it is.\n", + "For other types of data (binary blobs, images, etc.), I propose we use an archive format (zip, tar, ...) with a json file at a standard location (e.g. './.metadata.json') that includes the same metadata schema a tabular dataset would use. That way our datasets always have the same 'interface'.
And we can provide a set of standard tools (which could be implemented as workflow modules and workflows) to import and export 'our' data from/to commonly used formats like csv, excel, etc (which in most cases would not include metadata at all).\n" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/data/data_centric_approach.ipynb b/docs/architecture/data/data_centric_approach.ipynb new file mode 100644 index 000000000..00858e863 --- /dev/null +++ b/docs/architecture/data/data_centric_approach.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# A (parallel?) 
data centric approach for kiara/lumy" + ] + }, + { + "cell_type": "markdown", + "source": [ + "- decision between Workflow creation and Workflow execution\n", + "-" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/architecture/data/data_formats.ipynb b/docs/architecture/data/data_formats.ipynb new file mode 100644 index 000000000..aac4f897f --- /dev/null +++ b/docs/architecture/data/data_formats.ipynb @@ -0,0 +1,608 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from dharpa.benchmarking.data import clear_system_cache, MemoryRecorder, get_example_file\n", + "\n", + "from rich.jupyter import print\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "This document is a primer on data formats and structures, and how and why those affect our project. I have no idea about how much of this is common knowledge, and how much is news to the majority. I have noticed a few common misconceptions and assumptions about some of the topics in here, so I figured it makes sense to try to get everyone on the same page. I've tried to keep this simple and short, so there are some things in here that are over-simplified bordering on incorrect.\n", + "\n", + "My educated guess is that in our project we will mostly be concerned about structured, tabular data, which is why I'll be focussing on that. 
I might add a companion document about 'binary-blob' data later on.\n", + "\n", + "## Data serialization and storage\n", + "\n", + "- data lives in memory or on disk\n", + "- lots of 0's and 1's -- binary format\n", + "- only 'decoding' gets you a useful representation\n", + "- 'text' is just an often used encoding format" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The natural habitat of (digital) data is computer memory or on disk. Data is always stored in binary form, and there is always some sort of decoding involved to make data usable in one way or another (with the exception of booleans maybe). Even when we talk about text files (seemingly implying that those are not binary since they are 'text'), we are dealing with binary data. It's just such a standard data encoding format that tools to decode that sort of data are available everywhere. Decoding text is by no means trivial, but luckily our tools have evolved so much by now -- and we have standards like utf-8 commonly available -- that we as users hardly ever come across decoding issues anymore. At least not to the extent we used to. It still happens, and I would imagine quite a bit more in the Digital Humanities than in your everyday 'business' use-case. 
So it helps to be aware of at least the basics involved in text encoding standards, and I would recommend anyone doing any sort of programming to read up on it.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Tabular data (serialization) formats\n", + "\n", + "- serialization/deserialization\n", + "- binary or not, here I come:\n", + " - avro, protobuf, pickle\n", + " - csv, json, yaml, xml\n", + "- 'structured formats' with schema, or without:\n", + " - avro, protobuf, thrift, flatbuffers, xml\n", + " - csv, json, yaml, messagepack, pickle\n", + "- zero-copy, memory-mapping?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The (arguably) most important type of data we'll be dealing with is structured, tabular data. So it pays to think about how its digital form is represented in memory, and what issues are involved when trying to store, manipulate and transfer it.\n", + "\n", + "In our context, tabular data is always a 2 dimensional matrix (rows, columns), where each column has the same number of items, and each column contains items of the same data type (or union of data types). Tabular data can have a column that can be used as index, but that is optional. Each table can be described with a schema, which is basically a list of column names, and the data types associated with each of those columns.\n", + "\n", + "### Serialization / deserialization\n", + "\n", + "Every programming language represents its basic data types differently in memory. That's especially true for the arguably most common data type: the string. 
Also, that in-memory representation is (almost) always different to the format of the (functionally) same data when exported into a file on disk.\n", + "\n", + "This means that, if we want to export our data in our code, we need to do a step that is called 'serializing' (or 'marshalling'): we convert the data into a commonly accepted representation of a commonly accepted set of data-types. This serialization is usually expensive, computationally speaking. We want to avoid it, if at all possible, or at least always postpone it until the last possible moment, when we are sure the data won't change anymore, so we only have to do it once.\n", + "The same goes for de-serializing data, just in the other direction: we only want to do it once, then keep it in memory in our native representation (if the size of the data allows it), so we don't have to read it again. Even if the content of a file is in the OS (page) cache (which would mean we don't actually have to read the file-content from disk) we'd still have to spend the cpu-cycles for de-serialization. So, big no-no, bad data-scientist!\n", + "\n", + "### Format types\n", + "\n", + "For serialization, we have two basic options: text, and binary (let's just not nitpick and assume that this distinction makes sense).\n", + "\n", + "#### Text-based formats\n", + "\n", + "Serializing into a text-based format usually means taking all the elements our tabular data consists of, one by one, then serializing each element into its textual representation (like for example ``\"hello world\"`` for a string, ``5`` for an integer, ``true`` for a boolean in json), and then assembling one big 'meta'-string out of all those sub-elements. For csv, that might include a header-row, and adding delimiters like ',' in between the elements.
For json it would be adding list ('``[``', '``]``') or dictionary ('``{``', '``}``') indicators, as well as delimiters and other elements as outlined in the JSON specification.\n", + "\n", + "I haven't done any research on it, but I'd imagine csv would be one of the oldest widely-used data storage formats. Csv is a text-based tabular format, and it allows you to specify an optional header to describe column names. It allows for different delimiters between row cells (whole rows are delimited by the end-of-line special character). Other commonly used text-based formats are json, yaml, toml, xml. Those are not strictly tabular data formats, they can also contain just scalars, dictionaries, or lists (tabular data is always a list of dictionaries of the same shape).\n", + "\n", + "#### Binary formats\n", + "\n", + "Serializing into a binary format is usually very specific to the format itself, so there are not really any common tools to read more than one of them (like there are for text-based formats, where you could use any text editor and at least display the content in a meaningful way), and different formats have different priorities (like small size of the resulting blob, quick random read/write access, suitability for streaming, etc). Binary formats often have compression built-in, whereas text formats never have (but can usually be compressed well enough by external tools due to certain characteristics of encoded strings). Also, they usually are a lot easier on the cpu for serialization/deserialization purposes, since it's easier to optimize them for that scenario.\n", + "\n", + "Binary formats have existed for a long time, but in recent years they are used more widely again. Decentralized software architecture (microservices) as well as 'big(-ish) data' played a huge part in that.
Because, as it turns out, while serializing a few items of data per second into json and back is not that much of a problem, doing the same thing for millions of large (or even small) chunks of data actually is. In some cases that serialization step can take more time than the actual computation that was done on the data. To counter that issue, people came up with formats like 'Avro', 'Thrift', 'ProtoBuf'. Python's 'pickle' can also be considered a binary serialization format.\n", + "\n", + "\n", + "### Schema\n", + "\n", + "Another useful way to separate data formats is to check whether they include a (native) schema that describes the value types of the data they hold, or not. If schema information is present, it can either be included in the resulting file, or be stored elsewhere.\n", + "\n", + "Having schema information for a dataset is highly desirable, because it tells us exactly what type of data we are dealing with (is it a string, integer, float? what precision?). Whereas most text-based data formats don't include a schema definition format, there are sometimes external efforts to remedy that (JSON-schema, for example). None of the text-based formats I can think of off the top of my head include the schema in a resulting file. This is important, because the complexity of tools that handle data increases if they need to worry about secondary, 'side-car' files for incoming data.\n", + "\n", + "#### Slight detour: csv\n", + "\n", + "Csv is a bit special in that it can contain a 'header' row in the first line, which can be used to determine column names. Since this row is optional, it is not always present, which of course complicates the import process. Because csv files are so common in data science, most of the tools we use include some csv-import method that more or less smartly determines the (text-)encoding of the file, whether it has a header row, as well as the schema of the included data.
This method usually deserializes the data into the appropriate internal representations of the column types after it is reasonably sure the inferred schema is correct-ish. Without a definite (externally provided) schema it is not possible to guess this reliably in every case, so a developer should always assert the correct types are present after such an import.\n", + "\n", + "### Streaming, zero-copy, memory-mapping\n", + "\n", + "One thing that everyone working semi-seriously in data science and handling big-ish data should be aware of is that in most OS'es you can read (or 'load') data in more ways than one. The 'one' way is usually something like:\n", + "\n", + "``` python\n", + "file = open('dataset.csv')\n", + "lines = file.read() # or file.readlines()\n", + "```\n", + "\n", + "When using Pandas, it'll probably take the form of:\n", + "``` python\n", + "import pandas as pd\n", + "pd.read_csv('dataset.csv')\n", + "```\n", + "\n", + "Both of those read the whole file into memory. Which will be fine if the dataset is small or there is a need for it to be\n", + "in memory in full. Depending on the situation, it might be wasteful, though. For example when calculating the mean for a column of integers. In that case it's a better strategy to read one line of the file, process the column we are interested in, eject the line from memory, then read the next line, only keeping the current total and the number of items we processed so far. That way we'll never allocate more memory than what is needed for a single line. We can even process datasets that are larger than the available memory of our workstation.\n", + "\n", + "As a matter of fact, we could do even better if we knew the offset and length of the column we are interested in: in that case, we would only have to read the bytes that hold the integer value we need, and could ignore the other cells of a row completely.
Again, this might or might not be an issue depending on the size of the data in a row, but if we have a dataset with a lot of large columns, the I/O operations we would not have to do by only reading the exact data we need could improve the speed of processing considerably. Doing that doesn't really work for csv files, for example, since there is no good way for us to know the exact offset or length of the column we are interested in. There are data formats that support that kind of operation though.\n", + "\n", + "Along with those fairly simple strategies to deal with data efficiently, there are more advanced ones that also deal with data and how it is handled in system memory as well as on disk. For those of you who are interested, I would recommend looking up the terms 'memory-mapping', and 'zero-copy'.\n", + "\n", + "### Some random benchmarks, to illustrate" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading the whole file                                                                    │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This iterates through all lines in memory, keeping all of them in memory at the same      │\n",
+       "│ time.                                                                                     │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function  count_lines                                                         │\n",
+       "│ ╶───────────────────┼─────────────╴                                                       │\n",
+       "│          max memory │    53.75 MB                                                         │\n",
+       "│      execution time │      144 ms                                                         │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def count_lines(path):                                                                │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     f = open(path)                                                                    │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │     length = 0  # ignores '\\n' characters                                             │ │\n",
+       "│ │     lines = f.readlines()                                                             │ │\n",
+       "│ │     for line in lines:                                                                │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │         length = length + len(line)                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return {\"no_lines\": counter, \"size\": length}                                      │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "file_path = get_example_file()\n", + "\n", + "def count_lines(path):\n", + "\n", + " f = open(path)\n", + " counter = 0\n", + " length = 0 # ignores '\\n' characters\n", + " lines = f.readlines()\n", + " for line in lines:\n", + " counter = counter + 1\n", + " length = length + len(line)\n", + "\n", + " return {\"no_lines\": counter, \"size\": length}\n", + "\n", + "profile_count_lines = MemoryRecorder.profile_func(\"Reading the whole file\", \"This iterates through all lines in memory, keeping all of them in memory at the same time.\", False, count_lines, file_path)\n", + "print(profile_count_lines.report)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading the file line by line                                                             │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This allocates only very little memory, since once a line is read and processed, it will  │\n",
+       "│ be disregarded.                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function  count_lines                                                         │\n",
+       "│ ╶───────────────────┼─────────────╴                                                       │\n",
+       "│          max memory │      0.0 MB                                                         │\n",
+       "│      execution time │      115 ms                                                         │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def count_lines(f):                                                                   │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │     length = 0  # ignores '\\n' characters                                             │ │\n",
+       "│ │     for line in f:   # when using open like we do here, it returns an iterator not a  │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │         length = length + len(line)                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return {\"no_lines\": counter, \"size\": length}                                      │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "\n", + "file_obj = open(file_path, buffering=True)\n", + "def count_lines(f):\n", + " counter = 0\n", + " length = 0 # ignores '\\n' characters\n", + " for line in f: # when using open like we do here, it returns an iterator not a materialized list\n", + " counter = counter + 1\n", + " length = length + len(line)\n", + "\n", + " return {\"no_lines\": counter, \"size\": length}\n", + "\n", + "profile_count_lines = MemoryRecorder.profile_func(\"Reading the file line by line\", \"This allocates only very little memory, since once a line is read and processed, it will be disregarded.\", False, count_lines, file_obj)\n", + "file_obj.close()\n", + "print(profile_count_lines.report)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Structured (binary) data layout strategies\n", + "\n", + "- Row-based:\n", + " - (most commonly used dbs): sqlite, Postgres, MySQL, ...\n", + " - Avro\n", + "- Column-based\n", + " - OLAP dbs: duckdb, ClickHouse, BigQuery, Snowflake ...\n", + " - pandas dataframe\n", + " - parquet, feather" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "In order to use the more advanced operations on data I described earlier, the data formats we use need to support them. None of the simple formats like csv, json, yaml do. There is one category of applications that had to deal with those things for decades though: databases. So I think it pays to look a bit at how they handle storing data, and what kind of trade-offs they are making. Basically, a database is a system that lets you persist (mostly) structured data on disk, and gives you an easy, memory- and processing-efficient way to query and retrieve it back. 
To do that, they have different ways to persist data, add indexes, cache 'hot' data, and so on. As it turns out, there are 2 main ways data can be stored on disk for efficient retrieval: row-based, and column-based (I'm ignoring document/'nosql' databases here, since -- for almost all practical use-cases -- they are inferior to relational ones).\n", + "\n", + "\n", + "\n", + "### Row-oriented databases\n", + "\n", + "The most common database type is 'row-oriented'. This means that data is stored in a way so that each row represents a contiguous block of disk (or memory). Data is quick and easy to read (if you are interested in a subset of rows) and it is very easy and fast to add new rows/records. This fits the most common requirements businesses have for a database, since new items are added constantly, which is why most databases we encounter in the wild are row-based. Examples for such databases are: Postgres, sqlite, MySQL.\n", + "\n", + "### Column-oriented databases\n", + "\n", + "Column-oriented databases have existed for a long time, but they are not as prevalent as their row-based cousins, and often ignored by developers who haven't been exposed to them and their advantages. Instead of storing data row-by-row, they store data column-by-column. This means that column-cells are laid out next to each other on disk, and different columns occupy different regions of the storage (not necessarily close to each other at all). The querying logic is quite different for this type of database, the main advantage is that a certain type of analytical query is really fast (speedups of 10x or even 100x are quite possible), also it is very easy to request whole columns from such a database without it having to access any irrelevant parts of the data. Compressing data is also easier with column-oriented databases, so usually those occupy less disk space than their row-based peers. 
The disadvantage of those databases is that it's harder and slower to add new data, so they are more common for situations where one either has a fixed dataset, or updates are rare, and come in big(er) batches. Also, certain types of queries are less suited for that layout, which makes it always a good idea to think about what you need out of your data before deciding on a database/database type.\n", + "\n", + "\n", + "### Row-based/column-based in data science\n", + "\n", + "How is this relevant? Well, because in data science we are dealing mostly with fixed datasets, and the queries we do on them are mostly analytical in a way that fits column-oriented data layouts; although exceptions from that rule are not uncommon. So it makes sense to depart from the 'common wisdom' of using a row-based approach. In fact, Numpy arrays and Pandas dataframes (which depend on them) are kept in memory using the column-based approach. This is important to know, because it helps us use and query data correctly in our code. For example, it's not a good idea and very slow to add 'rows' to a Pandas dataframe. Instead, we should initialize the Dataframe with the whole dataset once at the beginning, and then only add columns to it (which is very fast), but no new rows if at all possible.\n", + "\n", + "One issue with Numpy/Pandas is that commonly a dataset is loaded into memory as a whole. There are ways around that (for example by processing a csv file in batches), but very often those are not used. In reality, it's probably not that big an issue in the DH field, since datasets seem to be smaller on average. But it is still a good idea to be as efficient as possible in this regard, esp. for our purpose, since we won't have any knowledge or guarantees in advance about the data we'll be handling (which might very well be bigger than the available memory). 
Also, since we are building an interactive application, it makes a difference whether a query comes back within a second, or ten.\n", + "\n", + "### More random benchmarks, this time with Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading a csv file with Pandas as a whole.                                                │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This is ok, as long as the dataset is not too big.                                        │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function   load_csv                                                           │\n",
+       "│ ╶───────────────────┼───────────╴                                                         │\n",
+       "│          max memory │ 63.625 MB                                                           │\n",
+       "│      execution time │    342 ms                                                           │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def load_csv(path):                                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     df = pd.read_csv(path)                                                            │ │\n",
+       "│ │     for _ in df[\"year_month\"]:                                                        │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return counter                                                                    │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "import pandas as pd\n", + "\n", + "def load_csv(path):\n", + "\n", + " counter = 0\n", + "\n", + " df = pd.read_csv(path)\n", + " for _ in df[\"year_month\"]:\n", + " counter = counter + 1\n", + "\n", + " return counter\n", + "\n", + "profile_read_csv = MemoryRecorder.profile_func(\"Reading a csv file with Pandas as a whole.\", \"This is ok, as long as the dataset is not too big.\", False, load_csv, file_path)\n", + "print(profile_read_csv.report)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
-- system cache cleared --\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading a csv file with Pandas, iterating over rows                                       │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ As one can see, this is very very slow, and not a good idea at all to do in Pandas.       │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function   load_csv                                                           │\n",
+       "│ ╶───────────────────┼───────────╴                                                         │\n",
+       "│          max memory │ 80.875 MB                                                           │\n",
+       "│      execution time │  21704 ms                                                           │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def load_csv(path):                                                                   │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     df = pd.read_csv(path)                                                            │ │\n",
+       "│ │     for index, row in df.iterrows():                                                  │ │\n",
+       "│ │         counter = counter + 1                                                         │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return counter                                                                    │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "clear_system_cache()\n", + "import pandas as pd\n", + "\n", + "def load_csv(path):\n", + "\n", + " counter = 0\n", + "\n", + " df = pd.read_csv(path)\n", + " for index, row in df.iterrows():\n", + " counter = counter + 1\n", + "\n", + " return counter\n", + "\n", + "profile_read_csv = MemoryRecorder.profile_func(\"Reading a csv file with Pandas, iterating over rows\", \"As one can see, this is very very slow, and not a good idea at all to do in Pandas.\", False, load_csv, file_path)\n", + "print(profile_read_csv.report)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Reading a csv file with Pandas, in chunks.                                                │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│ This is a good approach when dealing with a dataset that is large, and we don't need it   │\n",
+       "│ except for a single operation on a single column. We can optimize the                     │\n",
+       "│ execution-time/memory-usage by adjusting the 'chunksize' value.                           │\n",
+       "│                                                                                           │\n",
+       "│                                                                                           │\n",
+       "│                     ╷                                                                     │\n",
+       "│   Profiled function  load_csv_in_chunks                                                  │\n",
+       "│ ╶───────────────────┼────────────────────╴                                                │\n",
+       "│          max memory │       6.0390625 MB                                                  │\n",
+       "│      execution time │             892 ms                                                  │\n",
+       "│                     ╵                                                                     │\n",
+       "│ ╭─ Code ────────────────────────────────────────────────────────────────────────────────╮ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │ def load_csv_in_chunks(path):                                                         │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     counter = 0                                                                       │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     chunksize=1000                                                                    │ │\n",
+       "│ │     with pd.read_csv(path, chunksize=chunksize) as reader:                            │ │\n",
+       "│ │         for chunk_df in reader:                                                       │ │\n",
+       "│ │             for _ in chunk_df[\"year_month\"]:                                          │ │\n",
+       "│ │                 counter = counter + 1                                                 │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ │     return counter                                                                    │ │\n",
+       "│ │                                                                                       │ │\n",
+       "│ ╰───────────────────────────────────────────────────────────────────────────────────────╯ │\n",
+       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def load_csv_in_chunks(path):\n", + "\n", + " counter = 0\n", + "\n", + " chunksize=1000\n", + " with pd.read_csv(path, chunksize=chunksize) as reader:\n", + " for chunk_df in reader:\n", + " for _ in chunk_df[\"year_month\"]:\n", + " counter = counter + 1\n", + "\n", + " return counter\n", + "\n", + "profile_read_csv = MemoryRecorder.profile_func(\"Reading a csv file with Pandas, in chunks.\", \"This is a good approach when dealing with a dataset that is large, and we don't need it except for a single operation on a single column. We can optimize the execution-time/memory-usage by adjusting the 'chunksize' value.\", False, load_csv_in_chunks, file_path)\n", + "print(profile_read_csv.report)\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/data/dev.ipynb b/docs/architecture/data/dev.ipynb new file mode 100644 index 000000000..06f3d5fc3 --- /dev/null +++ b/docs/architecture/data/dev.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + 
"nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/architecture/data/index.md b/docs/architecture/data/index.md new file mode 100644 index 000000000..3e484ed46 --- /dev/null +++ b/docs/architecture/data/index.md @@ -0,0 +1,71 @@ +From looking at the user stories, and after listening to the interviews Lorella conducted and also considering my own +personal experience in eResearch, I think its save to say that the central topic we are dealing with is data. Without data, +none of the other topics (workflows, visualisation, metadata...) would even exist. Because of its central nature I want to lay out the different forms it comes in, and which characteristics of it are +important in our context. + +## What's data? + +Data is created from sources. Sources come in different forms (analog, digital) and can be anything from handwritten +documents in an archive to a twitter feed. Photos, cave-paintings, what have you. I'm not websters dictionary, but I think +one usable working definition of data could be a 'materialized source', in our context 'materialized source in digital form'. +From here on out I'll assume we are talking about 'digital' data when I mention data. + +One thing I'll leave out in this discussion is what is usually called 'dirty data' in data engineering, although it is +an important topic. Most of the issues there map fairly well to the structured/unstructured thing below. There are a few +differences, but in the interest of clarity let's ignore those for now... + +## Structured data / Data transformations + +Important for us is that data can come in two different formats: unstructured, and, who'd have guessed... structured. 
The same +piece of data can be theoretically expressed in structured as well as unstructured form: the meaning to a researcher would +be 100% the same, but the ways to handle, digest and operate with the data can differ, and in most scenarios adding structure +opens up possibilities to work with the data that weren't there before. In my head I call those two forms 'useless', and +'useful' data, but researchers usually get a bit agitated when I do, so I have learned to not do that in public anymore. + +For researchers, the most (and arguably only) important feature of 'structure' is that it enables them to +do *more* with the data they already possess. By means of computation. I think it's fair to say that only structured data +can be used in a meaningful way in a computational context. With the exception that unstructured data is useful input to +create structured data. + +One more thing to mention is that the line between structured and un-structured is sometimes hard to draw, +and can depend entirely on context. "One person's structured data is another person's unstructured data.", something like that. +In addition, in some instances unstructured data can be converted to structured data trivially, meaning without much effort +or any user-interaction. I'd argue we can consider those sorts of datasets basically 'structured'. + +### Example + +Let's use a simple example to illustrate all that: *a digital image of a document*. + +Depending on what you are interested in, such an image might already be structured data. For example it could contain geo-tags, and a +timestamp, which are both digitally readable. If you want to visualize on a map where a document is from, you can do that instantly. +Structured data, yay! 
+ +Similarly, if you are interested in the color of the paper of the document (ok, I'm stretching my argument here as this seems fairly +unlikely, but this is really just to illustrate...), you might get the color histogram of the image (which is trivial to extract, +but needs some batch-computation), and for your purposes you would also consider the image file structured data. + +Now, if you are interested in the text content of the document, things get more interesting. You will have to jump +through some hoops, and feed the image file to an OCR pipeline that will spit out a text file for example. The data +itself would still be the same, but now computers can access not only some probably irrelevant metadata, but also the text content, +which, in almost all cases, is where the 'soul' of the data is. + +It could be argued that 'just' a text file is not actually structured. I'd say that groups of ascii-characters that +can be found in english-language dictionaries, separated by whitespaces and new-lines can be considered a structure, +even if only barely. The new format certainly allows the researcher to interact with the data in other ways (e.g. full-text search). + +We can go further, and might be interested in characteristics of the text content (language, topics, etc.). This is where +the actual magic happens, everything before that is just rote data preparation: turning unstructured (or 'other-ly' structured) +data into (meaningful) structured data... On a technical level, those two parts (preparation/computation) of a research workflow might look (or be) +the same, but I think there is a difference worth keeping in mind. If I don't forget I'll elaborate on that later. + +## 'Big-ish' data + +I'm not talking about real 'Big data'-big data here, just largish files, or lots of them, or both. 
I don't think we'll encounter many use-cases where we have to move +or analyze terabytes of data, but I wouldn't be surprised if we come across a few gigabytes worth of it every now and then. + +There are a few things we have to be prepared for, in those cases: + +- transferring that sort of data is not trivial (esp. from home internet connections with limited upload bandwidth) -- and we will most likely have to be able to offer some sort of resumable-upload (and download) option (in case of a hosted solution) +- if we offer a hosted service, we will have to take into account and plan for this, so we don't run out of storage space (we might have to impose quotas, for example) +- computation-wise, we need to make sure we are prepared for large datasets and handle that in a smart way (if we load a huge dataset into memory, it can crash the machine where that is done) +- similarly, when we feed large datasets into a pipeline, we might not be able to just duplicate and edit the dataset like we could do for small amounts of data (too expensive, storage-wise) -- so we might need to have different strategies in place on how to execute a workflow, depending on file sizes (for example some sort of copy-on-write) diff --git a/docs/architecture/data/persistence.md b/docs/architecture/data/persistence.md new file mode 100644 index 000000000..b97e2b3ea --- /dev/null +++ b/docs/architecture/data/persistence.md @@ -0,0 +1,124 @@ +# Data persistence + +This is a document to describe my plans for storing data (and metadata) in *kiara*. (Almost) nothing I describe here is implemented yet, so it only reflects my current thinking. I think the overall strategy will hold, but there might be changes here and there. + +## The problem + +*kiara*s main functionality centers around transforming input data sets to output data sets. Those outputs need to be stored, to be of any use later on. Obviously. 
When deciding how to do this, we must take into account concerns about performance, disk- and memory-usage, data versioning, which metadata to attach, in what way, how to deal with metadata schemas (and versioning of both), etc. + +## The solution + +Well, solution. This is my current thinking of how to tackle the problem in a way that takes into account all of the aspects described above, while still being flexible enough to hopefully be able to incorporate solutions for future unforeseen issues. + +I am having trouble coming up with a good structure for this document, so I think I'll just try to tell the story from the point of view of data. Starting from a state where data exists outside of *kiara*, to when it is in a state to be ready to be published. As with everything I'm writing here as an explanation of generic and abstract concepts, some of the technical details I'm describing might be simplified to the point of being incorrect... + +## The 7 stages of data + +One thing I'd like to say before I start to describe those stages: the transformation of a dataset, from one stage to the next, **always** **always** **always** happens by piping the dataset through a *kiara* module. At **absolutely** **no** point is this done without *kiara*s involvement and knowledge. The dataset is used as input for a module, and the result (technically a new dataset) is a representation of the dataset in its next stage. This is important to keep in mind, as it is crucial for us so we can track data lineage. I'll write more on the specifics of this below, where it makes more sense. + +### 1) Unmanaged + +At the beginning, there was csv. Whether I like it or not, csv is the most predominant form data comes in. Csv is bad in a lot of ways, but in my mind the worst thing about it is that it is schema-less. True, in some cases you have a header-line, which gives you column-names, but that's not a requirement. 
Also, in a lot of cases you can auto-determine the type of each column, and luckily libraries like Pandas or Apache Arrow solved that problem for us so we don't have to do it ourselves every time. But those auto-parsers are not fool-proof, and you end up with integers where you wanted floats (or doubles), or integers where you wanted strings, or vice versa. + +In some cases we get data in a form that includes at least a semi-schema. Like a sqlite database file (which is more 'strongly' typed). But it's a lucky day when we get data that contains metadata about authorship, how and when it was created, from what sources, etc. + +### 2) Onboarded + +This is the first thing we need to do to unmanaged data: we need to 'onboard' it, so *kiara* knows the data exists, and what exact bytes it consists of. This last thing is very important: we have to be able to make sure the data we are talking about is not being changed externally, a lot of things in *kiara*s approach to data depend on this. + +Practically, in most cases this means *kiara* will copy one or several files into a protected area that no other application can/should access. That way we always have a reference version of the dataset (the bytes) we are talking about. + +One thing *kiara* does at this stage is give the dataset a unique id, which can be used to reference it later (by users, or other objects/functions). Another thing is to collect some basic metadata: when the file/folder was imported, from what original path, what the filenames are, mime-type, size of files, original file attributes (creation date, permissions, etc.). This can all be captured automatically. We can also record who it was that imported the dataset, if we have some app-global configuration about the current user, like a full name and email-address. Note, that this might or might not be the person who created the dataset. 
+ +So, at this stage all we did was copy a file(set) into a protected area to sort of 'freeze' it, and augment it with very basic metadata. We don't know anything about the nature of the dataset yet, all we know is the bytes the dataset consists of. It is important to point out that we would not have to store those chunks of bytes as files again, using the same structure as the original set of files. The dataset ceased to be 'files' here for us, we are only interested in the chunks of bytes (and their meaning) from here on out. We could store the data in an object store, zipped, tarred and feathered (pun intended). Or as byte-stream directly on block storage, if we were crazy enough. + +A side-note that makes things a bit more complicated, but it is probably necessary to address potential concerns: no, we don't actually need to copy the files, and can leave them in place and only generate the metadata and id for them. This might be necessary in cases where the source data is very big (photos, movies, audio-files, other large datasets). I don't think we need to figure out how exactly we deal with this scenario right now, but it basically comes down to making the user aware of what is happening, and what the implications are if the source data is changed externally (inconsistent metadata and potential incorrect result data-sets further down the line). There are strategies to help prevent some of those potential issues (checksums, for example), but overall we have to acknowledge that working with large-sized datasets is always a challenge, and in some cases we might just have to say: "sorry, this is too big for us right now". + +### 3) Augmented with more (basic) metadata + +To recapitulate: at this stage we have data (chunks of bytes -- not files!!! hit yourself over the head twice with something semi-heavy if you are still thinking in terms of files from here on out!) in a protected area, some very basic metadata, and an id for each dataset. 
We might or might not have authorship metadata (arguably one of the most important pieces of metadata), depending on whether who 'onboarded' the dataset actually created it. + +So, as a first step and following good practice, at this stage we should try to get the user to tell us about authorship and other core metadata about our dataset (licensing, copyright, ...). I don't think we can make this step mandatory, in practice, but we should push fairly hard, even if that means a slight decrease in user experience. It is very important information to have... + +So, one thing we could do was to have a checkbox that lets the user confirm: I created the data (in which case we can just copy the 'imported-by' field). + + + +### 3) Typed + +Chunks of bytes are not very useful by itself. They need to be interpreted to be of use. This means: determining in some way what the structure of the chunks of bytes is, and then applying common conventions for that particular structure (aka data type/format) when reading the chunks of bytes. Therefore 'interpreting' the chunks of bytes. This is a very obvious thing that happens all the time we use computers, but I think it makes sense to point it out here, because usually this is transparent to the user when they click an 'Open file' button in an application, and even some developers are ignorant to the underlying concept (and can afford to be, since they usually work several abstraction layers above where that is happening). + +To encapsulate this concept, we will create a 'data type' for each important group of datasets that share some important characteristics. Examples for very simple data types are strings, integers, booleans. I'll ignore those, because those are trivial to use, and that triviality actually makes it harder to explain the concept I'm talking about. More relevant data types are: 'table', 'network graph', 'text corpus', 'photo collection'. 
Every data type inherently contains a description of, well, the 'type' of data represented by it, and, with that, information about how a user or code can access the actual data, and/or some of its properties. + +From here on out, I'll focus on tabular data in some form or other, since I expect that this will be one of our most important (base-) data types. I expect the reader to 'translate' whatever I'm saying below to other types, and extrapolate the practical differences. + +So, to explain this step I decided to look at three different use-cases (mostly because we use them in 2 of our example workflows, so people should be familiar with them): + +- a csv file with tabular data +- an imported folder of text files (a corpus) +- two imported csv files containing edge and node information to form a network graph + +#### Example: tabular data + +This is the simplest case, and very common: we have a csv file, and need to have some sort of tabular data structure that we can use to query and analyze the data contained in it. + +Let's assume we have onboarded a csv file using *kiara*, so we have a dataset id that we use to point to it. Technically, this dataset is already 'typed': it has the type 'file'. This is not a very useful type, all it can tell us is a file name (which in a way is metadata), and the file content. We can ask *kiara* to interpret the content of this file as a table, though, because we know it must be one. This means we 'overlay' a different, more specific data type on top of the same data. + +Under the hood, *kiara* will use the Apache Arrow [``read_csv``](TODO) helper method, which is very smart and fast, and it can create an Arrow Table object out of a csv file. It can figure out file encoding, column names (if present), column types, separator characters. This detection is not fool-proof, but should work well enough in practice that we don't need to worry about it here. 
What really happens here is that the ``read_csv`` method is not just reading our data, but also, at the same time, is adding some important metadata to our dataset. Pandas can do the same with its csv import method. Even though this adding of metadata is more or less transparent to the user -- so they are not really aware of it -- it happens, and it is a very important thing that must happen to make our dataset useful. In our application, we might or might not want to ask users whether the extracted column names and types are correct, but this is a UI-specific implementation detail. + +So, considering all this, the important point here is that at this stage we have actual 'table' data, there is no need for the original csv file anymore (except as a reference for data lineage purposes). Our dataset is now of the data type 'table'. Which means we have an assurance that we can query it for table metadata properties (number of rows, number and name of columns, column types, size in bytes, etc.). And we can apply functions against it that are commonly applied against tabular data (sql queries, filters, etc.). That means, a 'data type' is really just a convention, a marker, that tells us how the bytes are organized for a particular set of bytes, and how to interact with it. This is all users and 3rd party-code needs to worry about. Implementation details about how this data is stored or loaded are irrelevant on this level of abstraction. This reduces complexity for *kiara*s external facing API, while, of course, introducing some extra complexity internally. + + +#### Example: text corpus + +The source data comes as a folder of files, each file contains (just) text (not structured like json, csv, etc.). When we do the 'onboarding' step for this data, all we do is copy the files verbatim into a new location. There could be some metadata implicit in the relative paths of each file (e.g. 
languages -- files for the same language live in a subfolder named after the language), and there can also be some metadata in the file names. We preserve all that metadata by copying the folder one-to-one, without changing anything. But it is important to note that this metadata, as of yet, is still uncaptured. + +The 'soul' of this dataset (meaning: the properties of the dataset we are interested in and we want to use in our investigation, and which will hopefully answer our research question) is in the content of each text file (aka the unicode encoded chunks of bytes). It is important to say again: at this stage the dataset ceased to be a set of files! It is a dataset within *kiara* that has an id (a single one! not one for every text!), and it has a basic set of metadata fields (the ones we could collect automatically). Yes, the dataset is backed by a set of files in the *kiara* data store, but that is an implementation detail nobody needs to know about, and I think we should try hard to hide from users. If you haven't noticed so far: I strongly believe the file metaphor is a distraction, and not necessary for us, except when import/export is concerned. + +Anyway, *kiara* does not know much about the dataset at this stage. To be of actual use, we need to interpret the data. In this case, we know we want to interpret the data as a text corpus. + +The most basic shape we can imagine a text corpus to look like is a list of strings (an array, or a single-column table). For making it easier to work with the text corpus in the future, let's make up a convention to save in tabular form, and the column containing the text items is always named ``text_content``. If we use, for example Apache Arrow to store that table, it makes the stored chunks of data much smaller (in comparison to text files), and it also makes the whole thing easier (at least faster) to query. It also allows us to easily attach more (meta-)data to the dataset. + +!!! 
Note + The distinction between data and metadata becomes a bit blurry here. In a lot of cases, when I say metadata, it is metadata from the point of view of the research process, not metadata for *kiara*. I don't know how to make it clear which I'm talking about in each case without making this whole thing even more unreadable than it already is, so I will just have to ask you to figure it out yourself, in each case :-) + +Because we didn't lose any of the implied metadata when onboarding our folder of text files, it would be a shame if we didn't actually capture it. In this case, let's assume we didn't have any subfolders (so no metadata in their name), but our files are named in a special way: + +``` +[publication_id]_[date_of_publishing]_[other_stuff_we_are_not_interested_in] +``` + +!!! Note + The information about the format is important (in a way it is also an input) and we need to retrieve it somehow. This is a real problem that doesn't look like a big problem. But it is, for us. I'll ignore this here, because it would complicate things too much and is only of tangential relevance. + +This means, we can extract the publication id and the date of publishing with a simple regular expression, and we can add a column for each one to our table that so far only contains the actual text for each item. The publication id will be of type string (even though some of the ids might be integers -- we don't care), and the publication date will be a time format. Now we have a table with 3 columns, and we can already filter the texts by date easily, which is pretty useful! We wouldn't, strictly speaking, need those two additional columns to have a dataset of type 'text corpus' but it's much more useful that way. As a general rule: if we have metadata like that, it should be extracted and attached to the data in this stage. It's cheap to do in a lot of cases, and we never know when it will be needed later. 
+ +What we have at this stage is data that has the attributes of a table (columns with name and type info, as well as rows representing an item and its metadata). This is basically the definition of our 'text corpus' data type: something that allows us to access text content items (the actual important data) using a mandatory column named ``text_content``, and that has zero to N metadata properties for each of those text items. In addition, we can access other metadata that is inherited from the base type (table): number of rows, size in bytes, etc., as well as its lineage (via a reference to the original onboarded dataset). +Internally, we'll store this data type as an Arrow table, but again, this is an implementation detail, and neither user nor frontend needs to know about this (exceptions apply, of course, but let's not get bogged down by those just now -- none of them are deal-breakers, as far as I can see). + +#### Example: network graph data + +Similar to the text corpus case above, let's think about what a basic definition of a network graph data type would look like. It would have to include a list of nodes, and a list of edges (that tell us how those nodes are connected). Actually, the list of nodes is implied in the list of edges, so we don't need to provide that if we don't feel like it (although, that doesn't apply if we have nodes that are not part of any edge). In addition, both nodes and edges can have attributes, but those are optional. So, our network graph data type would, at a minimum, need to be able to give us all this information via its interface. [networkx](TODO) is one of the most used Python libraries in this space, so let's decide that internally, for us, a network graph is represented as an object of the [Graph](TODO) class, or one of its subclasses. 
+ +This class will give us a lot of useful methods and properties to access and query, one problem left is: how do we create an object of this class in a way that fits with our overall strategy? We can't save and load a networkx object directly ([pickling](TODO) would be a bad idea for several reasons), so we need to create (and re-create) it via some other way. + +For this, lets look at the constructor arguments of this class, as well as what sort of data types we have available that we can use to feed those arguments. [One option](TODO) apparently is to use a list of edges contained in a Pandas dataframe as input, along with a name of columns representing the names of source and target column name, something like: + +``` + graph: nx.DiGraph = nx.from_pandas_edgelist( + pandas_table, + source_column, + target_column, + ) +``` + +This could work for us: as in the other example, we can use a table as the 'backing' data type for our graph object. Considering a graph without any node attributes, we can have a table with a minimum of two columns, and via a convention that we just made up, we say that the source column should be called ``edge_source``, and the target column ```edge_target```. We wrap all this in an Arrow table again, and save it as such. And later load it again, assuming the same convention (which, basically, saves us from asking for 2 column names every time). If our graph also includes node attributes, all we do is extend the implementation of our network graph data type to create a second table with a required column ``node_id``, and one or several more columns that hold node attributes, similar to the metadata in our 'text corpus' example from above. + + +### 4) Transformed + +With all that out of the way, we can finally do something interesting with the data. Everything up to this point was more or less housekeeping: importing, tagging, marking, organizing datasets. 
We still are operating on the same actual data as was contained in the original files (whatever type they were). But we now know exactly what we can do with it without having to ask questions. + +Using the 3 examples from above, we now know we have 3 datasets: one table, one text corpus (which is also a table, but a more specific one), and a network graph. And each of those datasets also comes with metadata, and we know what metadata fields are available for what data types, and what the metadata means in each context. + +A first thing we can do is automatically match datasets to available workflows: we know what input types a workflow takes (that is included in each workflow's metadata). So all we need to do is check the input types of each available workflow against the type of a dataset. This works even with different specificity: give me all workflows that take as input a generic graph. Or: give me all workflows that take a directed graph as input (this is information that is included in the metadata of each network graph dataset). 
diff --git a/docs/architecture/data/requirements.ipynb b/docs/architecture/data/requirements.ipynb new file mode 100644 index 000000000..68a0e7ddc --- /dev/null +++ b/docs/architecture/data/requirements.ipynb @@ -0,0 +1,936 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import dharpa\n", + "from rich.jupyter import print\n", + "from dharpa import DHARPA_TOOLBOX_DEFAULT_WORKFLOWS_FOLDER\n", + "from dharpa.graphs.utils import graph_to_image\n", + "from dharpa.utils import get_data_from_file" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Context & Requirements\n", + "\n", + "## Types of data\n", + "\n", + "- scalars (mostly user inputs, booleans, strings, enums, numbers)\n", + "- lists of items of the same type\n", + "- tabular data (a collection of lists of items of the same type, each with the same number of items, incl. schema)\n", + "- binary data: images, videos, audio files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In our application, we'll deal with a few basic types of data:\n", + "\n", + "- scalars (mostly user inputs, booleans, strings, enums, numbers)\n", + "- lists of items of the same type\n", + "- tabular data (a collection of lists of items of the same type, each with the same number of items, incl. schema)\n", + "- binary data: images, videos, audio files\n", + "\n", + "*Sidenote*: I consider every kind of user input as data, since it is conceptually the same thing and needs to be recorded and managed the same way.\n", + "\n", + "For our purpose, we can ignore scalars because they are easy and cheap to handle, and can be attached to any sort of data or metadata in a few different ways. 
Also, let's ignore binary data for now, while acknowledging that we will need a strategy to deal with it efficiently, in a way that is not too different from how we deal with other types.\n", + "\n", + "Which leaves us with lists and tabular data. Those are different to scalars, because there is no telling in advance how many rows they will have, and how large their cells will be (aka 'how many bytes are we dealing with, KBs, MBs, GBs, TBs?'). Lists (arrays) will be our main data type, along with tables (dataframes) -- the latter are really just lists of lists (including a schema/description of the type of each list). In a lot of cases a module will receive a table, and the output will be a list of the same length as the table. When using Pandas, we usually assign dataframes to variables, this is handy because we have access to the whole dataset via a single variable, and can access the columns separately via their names. For our case, because we will have connected modules, we will probably deal with 2 scenarios:\n", + "\n", + "- a module changes the data in a dataframe in one or several columns: this will be rare, but in this case the result of such a module will be a new dataframe\n", + "- a module adds one or several columns to a dataset: this is much more common. It doesn't make much sense to have dataframes as outputs in this case, since those would contain the same data as the input. There is no need to allocate double the amount of memory, for an exact copy of something we already have available (for read purposes). So, in those cases the output will be one or several lists, with the same number of rows as the input dataframe. 
Those lists can then be easily assembled into a dataframe at a later stage, if the need arises.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "## Requirements\n", + "\n", + "Since data will be the central object our application handles, we need to decide on an internal (as well as import/export) data format. The obvious thing to do would be to use the most common format (probably json), and just use that. For several reasons (laid out in the [data_formats document](data_formats.ipynb)), I don't think this is a good idea in our case. I think we can anticipate our main requirements on a data format before writing any code, which is why I created this document: to list those requirements, and to come up with a recommendation that is based upon them." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Technical requirements\n", + "\n", + "- schema-aware (ideally included in the format)\n", + "- binary format (performance, filesize)\n", + "- column-based (for tabular data -- analytics query performance)\n", + "- zero-copy, memory-mapping\n", + "- compression built-in (preferable)\n", + "- possible to use from different programming languages (at least Python & JS)\n", + "- as little cpu, memory, and disk utilization as possible\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The first group of requirements is technical: we are creating an interactive application, which means we should at least spend some time optimizing for speed (in those instances where it's possible). 
In addition: the more we know about our data and its 'shape', the less complex our code has to be, since that removes the need for investigating and validating data at multiple points along its journey.\n", + "\n", + "The latter can be achieved by using a data format that is schema-aware (e.g. not csv), and ideally includes that schema as metadata in its specification, so we can query the data(-bytes) directly, without having to read separate, external specifications.\n", + "\n", + "For the performance requirements, it's fairly easy to see why we should be looking for a binary, column-based format, that ideally has extra features like memory-mapping and compression.\n", + "\n", + "Last but not least we want to be able to access our data from different programming languages. Python and JavaScript support will be mandatory, but being able to read from Julia and R would also be highly desirable.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### General requirements\n", + "\n", + "- option to attach metadata\n", + "- versioning of datasets\n", + "- versioning of metadata\n", + "- we want to be able to treat all data the same way, independent of size, format, other characteristics\n", + "- we want all of this to be more or less transparent to our end-users!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Because it's in the nature of our application that we won't exactly know how big and what shape the data we will be dealing with will have, we have to anticipate a wide range of types and sizes. 
In order to not have to deal with those differently each time, it would be highly advantageous if we can come up with a standard 'interface' for our datasets, that lets us, as a minimum, query minimal required metadata (schema, authors, size), and which allows us to forward the dataset to other parts of our application (other modules, frontend), without having to convert or serialize/deserialize it.\n", + "\n", + "Most importantly, we will have to figure out a way to make most of this transparent to users. This is probably nothing a data format can help us with directly, but there might be factors in the periphery which can make this easier, or harder (e.g.: how common is that data-format, how much tooling exists for it?)\n", + "\n", + "One of our main requirements is to be able to easily attach metadata to our datasets. In addition we want it to be as easy as possible to 'version' the contained data, as well as the attached metadata. Those requirements stem from the need for good research data practices, and should not need further explanation. Let's look at those two points in a bit more detail:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "#### Technical metadata (automatic)\n", + " - data type\n", + " - schema (if tabular data)\n", + " - statistics on columns, rows (number of rows, min in column, max in column)\n", + " - data specific indicators/summaries (e.g. geographic range, resolution, ...)\n", + " - digest / checksum (unique id!)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The most important metadata we'll be dealing with is the type of data, and its schema in case it's in tabular form. 
As was mentioned above, ideally this would be forced and included by/in the data format we choose, so we can rely on it to be available, always.\n", + "In addition, in a lot of cases it aids performance if certain characteristics of a dataset are known without having to actually touch it. One example would be min/max values for numeric columns. Geographic range, resolution could be interesting for location data, creation date for photos, and so on.\n", + "A special item of metadata is a checksum: that enables us to confirm the bytes of a dataset haven't changed since we last checked, and it also makes things like caching or lookups easier.\n", + "All of those metadata items can be created more or less automatically, without any user input. This is important to differentiate, because that means we don't need to worry about providing a user-interface so they can be added/attached.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + " \n", + "#### Other metadata (requires user input)\n", + " - provenance / lineage / heritage\n", + " - author(s)/creator(s) incl. contact info\n", + " - creation / modification date\n", + " - comments, annotations\n", + " - \"ALL THE METADATA\" (Angela R. Cunningham, PhD)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The second category of metadata is defined by the necessity for manual user input (at least in parts). Which of course means we need to somehow provide a metadata editing facility for those items. Authorship information as well as the provenance-related metadata is arguably the most important one here. But I imagine we'll come up with quite a few more metadata fields we will want to be able to attach. 
It's probably a good idea to talk to our colleagues who develop Zotero and Tropy for some input in this regard." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Dataset versioning\n", + "\n", + "- versioning of the 'actual' data:\n", + " - new data added\n", + " - existing data changed/fixed\n", + " - existing data removed\n", + "- metadata versioning:\n", + " - independent of actual data changes (except for last modification dates, new authors added, checksum)\n", + " - new metadata added\n", + " - existing metadata changed/fixed\n", + " - metadata removed\n", + " - no new dataset version necessary\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Data versioning is usually a bit overlooked (although that seems to be changing now, and there are some 'git for data' services and tools cropping up). But it's crucial for good data practices.\n", + "\n", + "In order to always know how result data was created, we need to know exactly which inputs were used, and what exactly was done to them. 
If any of the inputs changes, and we don't record it, then there will be confusion later, when someone tries to recreate a result with the changed input.\n", + "\n", + "This implies we have a way to point to datasets using some sort of identifier, something like a DOI -- but it does not need to be globally unique, just locally (unless the data gets shared/exported).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Contexts in which we handle data\n", + "\n", + "- 'onboarding' data:\n", + " - external data gets fed into a workflow / into our app\n", + " - we store a copy internally (to prevent changes we are not aware of)\n", + " - some minimal metadata needs to be provided (but can be at least partly determined automatically)\n", + " - gets unique id / alias(es) & checksum & version '1'\n", + "- internal data transformation & transfer:\n", + " - each module processes input data and creates output data\n", + " - output data gets fed into the input of another module\n", + " - input/output data is requested by frontend for display purposes (viz, statistics, table-view, ...)\n", + "- exporting data:\n", + " - researcher needs data in a specific format (csv, Excel, json...) for further processing, publishing, etc." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Along with listing requirements, it makes sense to think about in which contexts we deal with data, and how. I think we can separate three main areas:\n", + "\n", + "- data onboarding\n", + "- internal data transformation & transfer\n", + "- data export\n", + "\n", + "For the first and last items the 'interface' of the data is important, which means we are concerned about how to translate external data formats into our internal one, as well as the other way around. 
For the second item we only deal with our internal format, so performance and code complexity are more important considerations.\n", + "\n", + "For data onboarding, one important thing is that we store a copy of the dataset the user points us to in a space where we can be sure the data doesn't get changed by external means. We would also add some automatic metadata, and might or might not require the user to provide some basic required metadata-fields manually. We would also give a newly onboarded dataset a version '1' (or maybe '1.0').\n", + "\n", + "Data export is the least problematic area: since we have a minimal set of required metadata for every piece of data we use internally, it should be fairly trivial to export it into any viable export format (csv, excel, json, parquet,...).\n", + "\n", + "Data onboarding and export could also be combined in some scenarios: for example if we don't provide a tool to 'clean' up data (or do something else that would require a version change on the dataset) and users would have to do it externally, we could export the dataset into a temporary folder, let the user do their thing, and then re-import the changed dataset into a new version of the existing, internal one, copying the existing metadata with some additions that describe what was done to the data." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Solution proposal" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Apache Arrow\n", + "\n", + "- binary, column-based, language-independent in-memory format\n", + "- well defined schema and data types, rudimentary custom metadata support\n", + "- native support for 2 on-disk formats:\n", + " - feather (same as in-memory format), parquet\n", + "- client implementations for most relevant languages\n", + "- growing ecosystem:\n", + " - Arrow Flight (fast data transport framework)\n", + " - Plasma (In-Memory object store)\n", + " - Vaex (native support for memory-mapped feather files, memory-mapped querying)\n", + " - duckdb (column-based, python-native sql engine)\n", + " - easy import/export to NumPy/Pandas types (Arrays, DataFrames) -- still some serialization cost\n", + "- likely to be the standard format for data exchange in data science/data engineering in the future" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In my research, [Apache Arrow](https://arrow.apache.org/) came closest to match our technical requirements, and should let us implement most of the other ones too. It is a binary, column based in-memory format that comes with implementations in a number of programming languages (incl. the ones we are interested in).\n", + "\n", + "From the Arrow website:\n", + "\n", + "> Apache Arrow is a software development platform for building high performance applications that process and transport large data sets. 
It is designed to both improve the performance of analytical algorithms and the efficiency of moving data from one system or programming language to another.\n", + ">\n", + "> A critical component of Apache Arrow is its in-memory columnar format, a standardized, language-agnostic specification for representing structured, table-like datasets in-memory. This data format has a rich data type system (included nested and user-defined data types) designed to support the needs of analytic database systems, data frame libraries, and more.\n", + "\n", + "In addition to the efficient in-memory format, it supports 2 on-disk formats: feather & parquet. The former one is basically the same as the in-memory format (with all the advantages that come with that), and the latter is a fairly standard format to exchange large(-ish) datasets between processes and infrastructure components.\n", + "\n", + "In my opinion (and I'm not alone), Arrow will be the de-facto standard data format for tabular data in the future, in both data science and data engineering. It is well designed, and a lot of the reasons why it came about line up fairly well with our own requirements (although, at a different scale obviously). Because of that, there is a rich tooling ecosystem growing around Apache Arrow at the moment, which I think we can expect to satisfy most of our current and future needs in the near to medium-term future, if not already.\n", + "\n", + "Esp. [vaex](https://vaex.io/) and [duckdb](https://duckdb.org/) look like very interesting developments. Pandas and Numpy import/export is very well supported, and as well optimized as seems possible. [Apache Arrow Flight](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) and the [Plasma Object store](https://arrow.apache.org/docs/python/plasma.html) look like good contenders that could handle our potential data transport needs in the future." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Identifying and versioning datasets\n", + "\n", + "- every dataset gets its unique id (uuid) as well as one or several user-defined and automatic aliases\n", + "- a new version of a dataset is created when its data content changes (content can be entirely different)\n", + "- a user can 'designate' a new version of data, in some cases it can be done by our application automatically\n", + "- versioning of metadata is independent of dataset version\n", + "- allows us to discover 'out-of-date' results (via their connected input-ids), and recreating them with updated input dataset\n", + "- frontend must be able to list, access and query datasets/inputs/outputs via unique id & version number\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "It should be obvious that and why we need some sort of (at least internal) unique identifier for each dataset. The main scenario where users will come in touch with such an identifier is when they are asked to choose an input dataset for a module/workflow. It's possible to make that 100% transparent to the user, and let them for example select a folder of csv files, which we would then copy into our internal data repository, assign it an id, and use that for our calculation. That would mean though, that the next time the user wants to use the same dataset again, we would do the same again, basically duplicating our internal dataset.
We probably could be smart about it, and recognize those sort of duplicates, but that would involve fairly complex and fragile code I think we should rather avoid, and come up with an interface metaphor/language that makes users aware what is going on, and which empowers them with proper tooling to manage their research data according to best practices (metadata, versioning, etc.).\n", + "\n", + "So, I propose that we should have a 'data management' section in our application UI, which could be used to both 'onboard' and manage datasets independent of a workflow, but also within the context of a workflow (for example by re-using some of the file selection widgets and filling in a newly created dataset id into a workflow input, right after onboarding). What that would look like exactly, we'd have to figure out and I think it would be a work-item in itself.\n", + "\n", + "The same goes for dataset versioning. One way I can imagine this working is to have a ``.<major>.<minor>`` postfix to our unique dataset identifier, where the ``minor`` part gets incremented with every metadata version change, and the ``major`` part for when the actual data changes. Another point to consider is whether to only use version number increases, or also have a concept of 'branching', where the versions of datasets can diverge, from a common parent. I think there is a point to be made for not making things too complicated unless really necessary, so most of this can be solved with a simple versioning scheme, and assigning a totally new dataset id if something significant changes in the data of a dataset (while potentially preserving the lineage information by storing the 'parent id' in the new dataset's metadata). But, as I said above, I think this would be a good item to investigate independently."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Storing module results\n", + "\n", + "- requirements: workflow history & snapshots & long running processes\n", + "- need for caching of at least the latest results\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "This section includes a quick recapitulation of how our workflows are described and managed by the backend, as well as an outline of how to handle and store temporary as well as final workflow outputs. This is important, because having access to already computed results is necessary for some of our requirements (derived from our user-stories):\n", + " - workflow history: enable the user to move back in the history of input sets of a workflow session\n", + " - snapshots: 'tag' certain input sets (basically creating a snapshot of that particular workflow state)\n", + " - support for long running processes: a user will want to have access to computational results, even if they had other workflow sessions in between (while a particularly long running job was running)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Quick recap: workflow modularity\n", + "\n", + "Every module has:\n", + " - one or several named inputs\n", + " - one or several named outputs\n", + " - as well as schema information for each input and output\n", + "\n", + "A workflow is a user-facing entity that:\n", + " - can also be used as a module (has inputs, outputs, schema)\n", + " - contains one or several modules\n", + " - where some inputs of some (internal) modules can be connected to an output of another (internal) module\n", + " - inputs of modules that are not connected to an output of another (internal) module are user inputs" + ] +
}, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In this example we'll use a workflow that simulates a ``nand`` logic-gate. Such a logic gate can be created by using ``and`` and ``not`` logic gates one after the other. Below you can see a short description of the modules and their inputs, as well as how that would be configured in a workflow description json file. The important part is the ``modules`` value." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### example module: ``nand``\n", + "\n", + " - consists of two other modules: \n", + " - ``and``\n", + " - inputs: ``a`` & ``b`` (booleans)\n", + " - output: ``y`` (boolean - true if both inputs are true, otherwise false)\n", + " - ``not``:\n", + " - input: ``a`` (boolean - connected to ``y`` output of ``and``)\n", + " - output: ``y`` (boolean - negated input)\n", + " - two inputs: ``a`` & ``b`` (booleans, connect directly to ``and`` inputs)\n", + " - one output: ``y`` (false if 'a' & 'b' are true, otherwise true -- connects to ``y`` output of ``not`` module)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
Module description: nand\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{\n",
+       "    'modules': [\n",
+       "        {'module_type': 'and'},\n",
+       "        {'module_type': 'not', 'input_links': {'a': 'and.y'}}\n",
+       "    ],\n",
+       "    'input_aliases': {'and__a': 'a', 'and__b': 'b'},\n",
+       "    'output_aliases': {'not__y': 'y'},\n",
+       "    'module_type_name': 'nand',\n",
+       "    'meta': {'doc': \"Returns 'True' if both inputs are 'False'.\"}\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Module description: [b]nand[/b]\")\n", + "print(get_data_from_file(os.path.join(DHARPA_TOOLBOX_DEFAULT_WORKFLOWS_FOLDER, \"logic_gates\", \"nand.json\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "After creating the workflow description file, we create the workflow object in code, and for illustration purposes, we display the execution order and the state graph of the workflow (in its inital, stale state without any inputs)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAGwAAAD7CAYAAACPBXE2AAAABmJLR0QA/wD/AP+gvaeTAAANP0lEQVR4nO2dX0xb5RvHv4e2tKWwgo38byJZAleyRIMJMHQJyp+AKSwFDHZsiRrvFrZ4odmNicm8cDpnssSwK71QcSaQsBknIeVCKIl3Ji60ZrpIFUi7za4wykb3/C74UVda5lhP6XnOnk/yXvQ5b1+e93xy3vOmvOe8ChERBDbk5ToBYXeIMGaIMGYYtweCwSBmZ2dzkYuwjaamJlRXVycHaRujo6MEQIoGyujo6HY9lHKFPSByp0PCHqAoStq43MOYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcKYIcIeoLCwEAcPHsx1Gg9FhDFDhDFDM8LGx8ehKEqi+P1+9Pf3w+FwJGLhcBgAcOPGDZw8eRL79+9Hfn4+SkpK0NnZCa/Xm9Luo9Q9c+YMFEXB6uoqZmZmEn/PaNxxBUXu2GkRTq5wuVwEgF566SXyer20urpKc3NzZDAYKBQK0eLiItXU1FBZWRlNTExQJBIhv99Phw8fJkVR6MKFC4m2dlOXiMhms1Fzc/Nedzkt2GERjmaFff/992mPHzt2jADQ119/nRSPxWJUWVlJVquVlpaWdl2XiIcwzQyJ23nhhRfSxsfGxgAAXV1dSXGz2YzW1lasra3hypUru67LBc0Ks9lsKbH19XVEIhFYLBYUFRWlHC8rKwMALC0t7aouJzQrLB1msxl2ux2xWAzRaDTl+PLyMgCgvLx8V3W32GktoJZgJQwAent7AQCXL19Oiq+vr2NqagpWqxXt7e27rgsABQUFuHv3buJzXV0dRkZGstKPx2b7TU0rk461tbW0x7fP/G7fvp008xsZGXmsukREHR0dZLfb6c8//6TZ2VkyGo109erVrPZ3J6D1WaLP50u7vjwd4XCYhoeHqaamhkwmE9ntdmpvb6epqamM6s7Pz1NLSwvZbDZyOp10/vx51fv5qGhemJDMTsLY3cOedEQYMzIW9uDvf5mU999/X4XuZAct9THjXzfpCXiOTEt9lCGRGSKMGSKMGSKMGSKMGSKM
GSKMGSKMGSKMGSKMGSKMGSKMGSKMGTv+Wv/tt9/uZR7CI7KjsIGBgb3MQ3hEFNLSP3tUor+/H4A+Rwm5hzFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDGD/QN9X3zxBc6ePYt4PJ6ILS4uAgAqKioSMYPBgBMnTuDo0aN7nqOasBcWCARQV1f3SHX9fj9qa2uznFF2YT8k1tbWor6+/qHbcCiKgvr6evayAB0IA4ChoSEYDIYdjxuNRvZD4Rbsh0QA+Pvvv+F0OnH//v20xxVFwcLCAqqqqvY4M/XRxRVWWVmJpqYm5OWldicvLw/Nzc26kAXoRBgAHDlyJG1cURQMDQ3tcTbZQxdDIgDcunULpaWl2NjYSIobDAYsLy/D4XDkKDN10c0VVlJSgra2tqTJh8FgQEdHh25kAToSBgAejydp4kFE8Hg8OcxIfXQzJALAnTt34HA4EIvFAAAWiwWhUAiFhYU5zkw9dHWFFRQUwOVywWQywWg0oqenR1eyAJ0JA4DXX38dGxsbiMfjGBwczHU6qqPK3u0+nw8LCwtqNJUx8XgcVqsVRIRoNKqZN7o5nU40NjZm3pAa2/+53e60+1dK+be43W41TrV6Wyq63W7Q5p6aOS9erxfT09M5z2OruN1utU6zOkOi1njxxRdznULW0KWwdL8p6gX99kyniDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDAA33zzDRRFgaIosFgsuU7noYgwAK+99hqICK2trblO5T8RYcwQYcwQYczQpLCNjQ2Mjo7ilVdeQXl5OaxWK5599lmcO3cuaaHo+Ph4YrKgKAquX7+OgYEBFBcXw+FwoLu7G9euXUtpf35+Hj09PbDb7bDZbGhpacFPP/20l118fDJaEfJ/3G63aotMiIgmJiYIAJ0+fZpu3rxJoVCIPvvsM8rLy6N33nknpb7L5SIA5HK5aHZ2llZWVmhycpKsVis1NDQk1f3tt9+ouLiYqqqq6Mcff6RoNEq//PILtbW10TPPPENms1m1fmyh5vnRrLBDhw6lxD0eD5lMJopEIknxLWETExMpeQGgUCiUiPX19REA+u6775Lq/vXXX2Q2mzUvTJNDYnd3N7xeb0r8wIEDuHfvHn799de032toaEj67HQ6AWw+8LfFDz/8AABob29PqltZWcnikVpNLsKJRCL4+OOPMTY2hmAwiH/++Sfp+J07d9J+z263J33Oz88HgMR9b319HdFoFBaLJe0S7tLSUgQCATW6kDU0eYW9+uqr+OCDD/DWW28hEAjg/v37ICKcPXsWwOZTKY+D2WxGUVERYrEYVlZWUo7fvHkzo7z3As0Ji8fjmJmZQXl5OY4fP46nn3468YaAtbW1jNvv7OwE8O/QuEU4HIbf78+4/WyjOWEGgwGHDh3C0tISPvroI4TDYaytrcHr9eLzzz/PuP3Tp0/jqaeewvDwMCYnJ7GysoKrV6/C4/HweNJFjZmL2rPEUChEb7/9NjmdTjKZTFRWVkbHjh2jd999N7FW/fnnnyefz5eyhv3UqVNERCnxrq6uRPt+v596enpo3759ian/pUuXqLW1NVH/jTfeUK0/ap4fVR7o6+vrAwBcvHgx06Z0iZrnR3NDovBwRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzRBgzVFvmFgwGNfNuQq0RDAZRXV2tSluqCZubm8PAwIBazekOtV7Bp6uXNG/R398PALq84uUexgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgwRxgz2D/R9+eWX+OSTTxCP
xxOxxcVFAEBFRUUiZjAYcOLECRw9enTPc1QT9sICgQDq6uoeqa7f72exv8rDYD8k1tbW4sCBA4ndI9KhKArq6+vZywJ0IAwAhoaGYDAYdjxuNBrZD4VbsB8Sgc3tppxOZ9LufQ+iKAoWFhZQVVW1x5mpjy6usMrKSjQ1NSEvL7U7eXl5aG5u1oUsQCfCAODIkSNp44qiYGhoaI+zyR66GBIB4NatWygtLcXGxkZS3GAwYHl5GQ6HI0eZqYturrCSkhK0tbUlTT4MBgM6Ojp0IwvQkTAA8Hg8SRMPIoLH48lhRuqjmyER2Nwb0+FwIBaLAQAsFgtCoRCPjdweEV1dYQUFBejt7YXJZILRaERvb6+uZAE6EwYAg4ODuHfvHjY2NjA4OJjrdFRHldfv+Xw+LCwsqNFUxsTjcRQUFICIcPv2bc280c3pdKKxsTHzhtTYl3FrR3IpOxfN7ZTudrtBRJoo09PTmJ6eznkeW0Wtl1sCGt0pPVNaWlpynULW0KWwdL8p6gX99kyniDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmiDBmPNHCCgsLcfDgwVynsSueaGEcEWHM0Kyw8fFxKIqSKNevX8fAwACKi4vhcDjQ3d2Na9eupXzvxo0bOHnyJPbv34/8/HyUlJSgs7MTXq83UefMmTNQFAWrq6uYmZlJ/A2jkcH/czNbErKJ2+1WbZHJdlwuFwEgl8tFs7OztLKyQpOTk2S1WqmhoSGp7uLiItXU1FBZWRlNTExQJBIhv99Phw8fJkVR6MKFC0n1bTYbNTc3ZyXvB1Hz/Gj2CtvOm2++icbGRthsNrz88svo6urCzz//jHA4nKjz3nvv4Y8//sCnn36K7u5u7Nu3D7W1tfjqq69QUVGB48ePY3l5OYe9yBw2whoaGpI+O51OAJsP820xNjYGAOjq6kqqazab0drairW1NVy5ciXLmWYXNsLsdnvS5/z8fABIPPywvr6OSCQCi8WCoqKilO+XlZUBAJaWlrKcaXZhI+y/MJvNsNvtiMViiEajKce3hsLy8vJE7GEPsmsV3QgDgN7eXgDA5cuXk+Lr6+uYmpqC1WpFe3t7Il5QUIC7d+8mPtfV1WFkZGRvkn1MdCXsww8/RE1NDYaHh3Hp0iVEo1EEAgEMDg5icXER586dSwyNAPDcc88hEAhgYWEBPp8Pv//+u/YXoaox1czGtN7n86WsTz916hQRUUq8q6sr8b1wOEzDw8NUU1NDJpOJ7HY7tbe309TUVMrfmJ+fp5aWFrLZbOR0Oun8+fOq9mELNc+PKg/09fX1AQAuXryYaVO6RM3zo6sh8UlAhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDhDFDtbXJwWBQM+8m1BrBYBDV1dWqtKWasLm5OQwMDKjVnO5Q6xV8unpJ85OA3MOYIcKYIcKYYQQgiwkZ8T+HpntUF0NGGgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "workflow = dharpa.create_workflow(\"nand\")\n", + "graph_to_image(workflow.structure.execution_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + 
}, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph_to_image(workflow.create_state_graph(show_structure=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Now, we set the inputs (both ``True``, which means the end-result should be ``False``). As you can see from the state graph, the workflow inputs are directly connected to the module inputs of the ``and`` module." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "processing started: nand.nand\n", + "processing started: nand.and\n", + "processing finished: nand.and\n", + "processing started: nand.not\n", + "processing finished: nand.not\n", + "processing finished: nand.nand\n" + ] + } + ], + "source": [ + "workflow.inputs.a = True\n", + "workflow.inputs.b = True\n", + "\n", + "await workflow.process()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Again, lets look at the workflow state, this time we display it using a json data structure, not a network graph:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'alias': 'nand',\n",
+       "    'address': 'nand.nand',\n",
+       "    'type': 'nand',\n",
+       "    'is_pipeline': True,\n",
+       "    'state': 'results_ready',\n",
+       "    'inputs': {\n",
+       "        'a': {'schema': {'type': 'boolean', 'default': None}, 'value': True},\n",
+       "        'b': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "    },\n",
+       "    'outputs': {'y': {'schema': {'type': 'boolean', 'default': None}, 'value': False}},\n",
+       "    'execution_stage': None,\n",
+       "    'doc': \"Returns 'True' if both inputs are 'False'.\",\n",
+       "    'pipeline_structure': {\n",
+       "        'workflow_id': 'nand',\n",
+       "        'modules': [\n",
+       "            {\n",
+       "                'module': {\n",
+       "                    'alias': 'and',\n",
+       "                    'address': 'nand.and',\n",
+       "                    'type': 'and',\n",
+       "                    'is_pipeline': False,\n",
+       "                    'state': 'results_ready',\n",
+       "                    'inputs': {\n",
+       "                        'a': {'schema': {'type': 'boolean', 'default': None}, 'value': True},\n",
+       "                        'b': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "                    },\n",
+       "                    'outputs': {\n",
+       "                        'y': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "                    },\n",
+       "                    'execution_stage': 1,\n",
+       "                    'doc': \"Returns 'True' if both inputs are 'True'.\",\n",
+       "                    'pipeline_structure': None\n",
+       "                },\n",
+       "                'input_connections': {'a': '__parent__.a', 'b': '__parent__.b'},\n",
+       "                'output_connections': {'y': ['not.a']}\n",
+       "            },\n",
+       "            {\n",
+       "                'module': {\n",
+       "                    'alias': 'not',\n",
+       "                    'address': 'nand.not',\n",
+       "                    'type': 'not',\n",
+       "                    'is_pipeline': False,\n",
+       "                    'state': 'results_ready',\n",
+       "                    'inputs': {\n",
+       "                        'a': {'schema': {'type': 'boolean', 'default': None}, 'value': True}\n",
+       "                    },\n",
+       "                    'outputs': {\n",
+       "                        'y': {'schema': {'type': 'boolean', 'default': None}, 'value': False}\n",
+       "                    },\n",
+       "                    'execution_stage': 2,\n",
+       "                    'doc': 'Negates the input.',\n",
+       "                    'pipeline_structure': None\n",
+       "                },\n",
+       "                'input_connections': {'a': 'and.y'},\n",
+       "                'output_connections': {'y': ['__parent__.y']}\n",
+       "            }\n",
+       "        ],\n",
+       "        'workflow_input_connections': {'a': ['and.a'], 'b': ['and.b']},\n",
+       "        'workflow_output_connections': {'y': 'not.y'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "state = workflow.to_dict(include_structure=True)\n", + "print(state)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### How to actually deal with workflow/module outputs?\n", + "\n", + "- why not store all results?\n", + "- smart way of storing/deleting/managing storage:\n", + " - compression\n", + " - efficient module design\n", + " - cleanup process\n", + " - only store results if good execution time/result size ratio, otherwise just re-process" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To satisfy the above mentioned requirements, my current plan is to just store all results of all module runs, instead of coming up with a complicated caching scheme. There will have to be some sort of 'result-cleaning' and consolidation, but I think if we are being smart about it this might be the most promising strategy, which will introduce the least amount of complexity.\n", + "\n", + "A folder structure to accomodate that would probably look something like this:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "\n", + "- each module has its own name/id, all results for a module will be stored under same folder\n", + "- 'result.feather' has one or several columns that represent output values\n", + "- also, one column with runtime metadata (execution time, version of workflow, etc.)\n", + "- this works well with the 'dataset' API in Apache Arrow: https://arrow.apache.org/docs/python/dataset.html (which means we can lazy-load all results of a workflow/module into the same dataframe, and do 'meta'-queries and -analysis on that if we choose to)\n", + "- debatable whether 'workflow-results' 
have to be stored at all, since they are just copies of 'module-results'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In order to not waste too much hard-disk space (which would be the most obvious concern here), I think we have a few different options. For one, we'd store all results with compression enabled. We would implement our modules in an efficient way that is aware of how we store results. We might have a cleanup process running in the background that is aware of how often a result is accessed, and what its compute-time/result-size ratio is. In some cases where that ratio leans very much towards result-size, we might decide to not store those results at all, but re-process every time." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Streaming module results\n", + "\n", + "TBD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "This is an area I haven't done too much work on yet, but in general: we will want to have access to intermediate results (or, rather: partial results in real-time), so we can provide the user with information they can use to determine whether to cancel a running process or not. Even though we will probably not have that functionality available in our initial, first version, I think we should anticipate that requirement, and design our data management with it in mind, so it can be added later without having to re-write a lot of code."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Default data format (for import/export)\n", + "\n", + "- every result can be described by specifying:\n", + " - the input dataset(s) and other inputs\n", + " - the workflow (and workflow version) that was used to produce it\n", + " - -> theoretically, every (result) dataset can be described by very small json file/metadata set\n", + "- proposal: invent our own (small) set of file formats (including version-nr, metadata schema, payload)\n", + " - Apache Arrow based for tabular/scalar data\n", + " - folder/zip based for binary data\n", + " - all our import modules would create files in that format\n", + " - provide tooling (and modules) to convert/export those to all common data formats\n", + " - possibility of data registries:\n", + " - very simple implementation compared to products like dataverse, ckan\n", + " - high performance data transfer (using Apache Flight)\n", + " - different levels: local (within our app), organization-wide, global (aka default registry)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The last thing to decide is whether we want to provide a 'standard' data format for our application. This will be modelled closely upon the format we will use internally, but with some added metadata fields and possibly restrictions.\n", + "\n", + "This is mostly for the purpose of sharing, transferring, and publishing data. In principle, there is a really lightweight way to share our work: since we can describe everything we do by specifying the workflow, and listing all the inputs we use with it. Assuming all inputs are either scalars or, in case of datasets, available via download, this description could be very lightweight: it's just a json file containing the workflow structure (incl. 
maybe version information), and input-data urls. With that, everyone with access to the data can in theory replicate any end- and intermediate result.\n", + "\n", + "In theory, that json structure can also be attached to every result dataset, which means that our results will always come with information on how they were produced (and how to re-produce them).\n", + "\n", + "Since all this is very dependent on being able to have access to metadata alongside the 'actual' data, and because in my experience systems and architectures that store metadata separately from data are fairly complex, specific and hard to maintain, I would propose we come up with a way to package our data in a way that allows for our metadata to always be included, and where it's easy to access both data and metadata without having to open the whole file. Arrow gets us a long way toward that (for tabular data), the only thing that is missing is a standard way to include metadata. For that we have two options: use the Arrow 'metadata' field (which is fairly limited, it only takes encoded byte-arrays as keys/values), or store our metadata in a separate column. Currently, I'm leaning toward the latter option, but this is something we'll have to try out and play with to get a better idea how feasible it is.\n", + "For other types of data (binary blobs, images, etc.), I propose we use an archive format (zip, tar, ...) with a json file at a standard location (e.g. './.metadata.json') that includes the same metadata schema a tabular dataset would use. That way our datasets always have the same 'interface'.
And we can provide a set of standard tools (which could be implemented as workflow modules and workflows) to import and export 'our' data from/to commonly used formats like csv, excel, etc (which in most cases would not include metadata at all).\n" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/data/result_tree.png b/docs/architecture/data/result_tree.png new file mode 100644 index 000000000..98472cd2f Binary files /dev/null and b/docs/architecture/data/result_tree.png differ diff --git a/docs/architecture/decisions.md b/docs/architecture/decisions.md new file mode 100644 index 000000000..8ac420cc9 --- /dev/null +++ b/docs/architecture/decisions.md @@ -0,0 +1,70 @@ +# Decisions + +This page lists a few of the main decisions that were taken, what the considerations around them were, their impact, as well as why they were made. + +## Supporting two sorts of modules: 'core', and 'pipeline' modules + +When starting to write code, we didn't have yet many examples of modules and how specific/broad they would be in their utility. +I think we should have done a better job gathering those and coming up with a set of modules that would be sufficient for +our first 10 or so workflows before writing any code, but alas, we didn't. One way I saw to lower the risk of us implementing +us ourselves into a corner was to make our modules as flexible, re-usable and 'combinable' as possible. 
And the best way +I could think of to do that was to have a very simple interface for each module (each module has a defined set of input/output fields, and one main function to transform inputs into outputs), and to allow several modules to be combined into a 'new' +module that has those same characteristics/interface. + +Advantages: +- easy to declaratively create re-usable modules with just json/yaml +- easy to re-use modules/other pipelines for UI-specific subtasks (like data previews/querying) +- in most cases, the higher-level backend code does not know about core- and pipeline- modules, since they can be treated the same + +Disadvantages: +- the lower-level backend code needs to implement two different ways to assemble/create modules, depending on whether it's a core-module, or a pipeline + + +## Use of subclassing in general + +Across *kiara*, I'm using subclassing and inheritance in some instances, esp. important base classes are [KiaraModule][kiara.module.KiaraModule] and [PipelineController][kiara.pipeline.controller.PipelineController]. I'm aware that this is considered bad practice in a lot of cases, and I have [read](https://www.sicpers.info/2018/03/why-inheritance-never-made-any-sense/) [my](https://python-patterns.guide/gang-of-four/composition-over-inheritance/) [share](https://python-patterns.guide/gang-of-four/composition-over-inheritance/#problem-the-subclass-explosion) of opinions and thoughts about the matter. In principle I agree, and I'm not 100% happy with every decision I made (or thought I had to make) in this area for *kiara*, but overall I decided to allow for some inheritance and class-based code sharing in the code, partly to speed up my implementation work, partly because I thought some of the disadvantages (like having to search base classes for some function definitions) are not as bad in some contexts as in others.
I can totally see how others would disagree here, though, and there are a few things I would like to change/improve later on, if I find the time. + +One of the main advantages I get out of using inheritance is being able to automatically discover subclasses of a base class. This is done for multiple of those, like: + +- [KiaraModule][kiara.module.KiaraModule] +- [ValueType][kiara.data.type.ValueType] +- [MetadataModel][kiara.metadata.MetadataModel] + +Using auto-discovery in a Python virtualenv removes the need for workflow/module developers to understand Python packaging and entry_points. I've written a [project template](https://github.com/DHARPA-Project/kiara_modules.project_template) that sets up all the basics, and developers focus on creating new classes (basically plugins), with no extra registration work to be done. I hope this will aid adoption. And that I've managed to design those base classes well enough so that they are easy to use and understand, so that some of the main drawbacks of subclassing won't matter all that much. + + +## Requiring to subclass an abstract base class when creating a module + +The main class that uses a subclassing-strategy is [KiaraModule][kiara.module.KiaraModule]. At its heart, it's +basically just a wrapper around a pure function, with some utility methods describing its input and output. One reason +I decided to not just create a decorator that wraps any function was the need to be able to describe the input the +function takes, and the output it produces in a stricter way than would have been possible with just type hints. +Another reason is that this way it is possible to add configuration to a module object, which should make module +code much more flexible and re-usable, and developers do not have to implement separate modules for just slightly different +use-cases. + +This design decision does not prevent us from allowing more 'loose' implementations of a module, like the above mentioned +function with a decorator.
Those would be dynamically converted into a ``KiaraModule`` subclass/object, with potential +downsides of not being able to version control it properly (or as easily). The point is, though, that the default +way of doing things will give us the best guarantees (and metadata). + +Advantages: +- relatively easy to manage 'plugin-like' architecture, discovery of modules +- being able to describe module input/output fields in detail +- module versioning: by requiring the subclassing of a base class, and also having to add modules as entry_points, it will be possible to describe exactly which version of the module was used in a workflow (as well as which version of the base class) + +Disadvantages: +- more abstraction layers than strictly necessary +- other, usual disadvantages associated with subclassing/inheritance + +## Separating data from the Python objects that describe them / Data registry + +TBD + +Advantages: +- efficiency, option to save on memory and IO +- (hopefully) decrease of complexity for non trivial scenarios like multi-process or remote job execution + +Disadvantages: +- extra level of abstraction +- increase in complexity (at least for simple use-cases) diff --git a/docs/architecture/index.md b/docs/architecture/index.md new file mode 100644 index 000000000..448dcb3a2 --- /dev/null +++ b/docs/architecture/index.md @@ -0,0 +1,5 @@ +# Architecture documents + +This section contains architecture-related documents for the *Kiara* project. + +Not all of those might be up-to-date, but they should help to understand certain design decisions, and why they were made. diff --git a/docs/architecture/metadata.md b/docs/architecture/metadata.md new file mode 100644 index 000000000..6aad7ad29 --- /dev/null +++ b/docs/architecture/metadata.md @@ -0,0 +1,42 @@ +## Metadata + +Metadata is more important in research than in other fields.
Metadata can be used to, among other things, track provenance of data, +describe authorship, time of creation, location of creation, and describe the 'shape' of data (schemas, etc.). + +In some cases it's not easy to determine what's data and what's metadata. Sometimes metadata becomes data ("One person's metadata..."). +Handling metadata is difficult, and it regularly gets lost somewhere in the process. Creating metadata in the first place can be +very time-consuming; I would wager that is more true in the digital humanities than in the harder sciences. + +With the growing popularity of the open data movement, people are getting more aware of the importance of metadata, and +there is a growing infrastructure and services around all of this (DOIs, RDF, 'linked data', Dublin Core, ...). None +of it is easy or intuitive to use, but I guess that's just the nature of the beast. + +I think it is safe to say that whatever we come up with has to be able to create and handle metadata in some way or form, +and personally, I think we should 'bake' metadata handling in from the beginning. Looking at the user-stories it's quite +clear that this is an important topic. How exactly that will look, I think there is some leeway, but all architecture proposals +should at least include some indication on how this would be handled. + +### Schema information + +One important piece of metadata is often schema information: what exactly is the shape of the data, how can I read it? +In some cases this can be inferred from the data easily, sometimes it's even obvious. But often that is not the case at all, +which makes things like creating generic data exploration tools very hard, if not impossible. +We would have, if we choose to create and attach it, all that information available, always, which would mean it would be easy +to create generic, peripheral tools like a generic data explorer.
It will, of course, also make it easier to re-use such data in other workflows, +because users would not have to explicitly specify what their data is; we could infer that from the attached schema. + +### Workflow metadata + +One thing that is specific to our application is that we have full control over every part of the data-flow. So, we can +attach metadata of all inputs and previous steps to each result (or intermediate result) along the way. Which is quite a +unique opportunity; this is often not available at all, or has to be done manually by the researcher. + +There is a lot that can be done with such annotated (result-)data. For example, each data set can include pointers to all +the original data that was involved in creating it (or it could even include that data itself), as well as a description +of all the transformation steps it went through. This means that one could potentially create a full visual representation of +what happened to the data since it was created, just by looking at the attached metadata. This is usually impossible, because +there is never a sort of 'unbroken cold-chain' of metadata available. Of course, this would also help with reproducibility and +related issues. + +This possibility is something I'm particularly excited about, even though it does not directly appear in any of our user +stories (so would not be a core requirement). But it's one of the things I would have liked to have available often in the past. diff --git a/docs/architecture/workflows/index.md b/docs/architecture/workflows/index.md new file mode 100644 index 000000000..bb15e236c --- /dev/null +++ b/docs/architecture/workflows/index.md @@ -0,0 +1,224 @@ +If we accept the premise that in the computational context we are really only interested in structured data, it follows +that there must be also 'things' that do stuff to our structured data. Let's call those things 'workflows'.
+ +## Definition + +I will concede that 'doing stuff to data' although entirely accurate is probably not the most useful of definitions. +Not Websters, all right? Well, how about: + + "A workflow is a tool to transform data into more structured data." + +"more" can be read in one or all of those ways: + + - 'more data' -- we'll create what can be considered 'new' data out of the existing set + - 'better structured' -- improve (and replace) the current structure (fix errors, etc.) + - 'more structure' -- augment existing data with additional structure + +In our context, workflows can also have secondary outcomes: + + - present data in different, more intuitive ways (e.g. visualisations), which researchers can use to get different ideas about the data, or new research questions + - convert structured data into equivalent structured data, just a different format (e.g csv to Excel spreadsheet) + - ... (I'm sure there's more, just can't think of anything important right now) + +## Deconstructing a workflow + +I've written more about it [here](https://github.com/DHARPA-Project/architecture-documents/blob/master/workflow-modularity/workflow-modularity.ipynb), but +conceptually, every data workflow is a collection of interconnected modules, where outputs of some modules are connected +to inputs of some other modules. The resulting network graph of modules defines a workflow. That's even the case for Jupyter +notebooks (which are really just fancy Python/R/Julia scripts); if you squint a bit you can see it: +the modules are functions that you call with some inputs, and you use the outputs of those functions (stored in variables) as inputs to +other functions. Move along, nothing to see here: this is really just how (most) programs work. 
+ +As I see it, there are three main differences to programs that are written in 'normal' computer engineering: + +- the complexity of the resulting interconnected 'network graph' (the interconnection of functions) is usually lower +- it's a tad easier (or at least possible) to define, separate and re-use the building blocks needed in a majority of workflows (an example would be Numpy or the Pandas libraries, which are basically implementations of abstract problems that crop up often in this domain) +- it is possible to create workflows entirely out of modules that were previously created, with no or almost no other customization (normally, that customization would be very prominent in a program) -- often-times only some 'glue' code is needed + +This means that data engineering workflows could be considered relatively simple script-like applications, where advanced +concepts like Object-Oriented-Design, Encapsulation, DRY, YAGNI, ... are not necessary or relevant (in most cases they wouldn't +hurt though). + +## Data engineering + +This way of looking at workflows is nothing new, there are quite a few tools and projects in the data engineering space +which deal with workflows in one level of abstraction or another. + +As I'll point out below, the main difference to what we try to implement is that we'll add an element of 'interactivity'. +But I believe we can still learn a whole lot by looking at some aspects of those other tools. +I encourage everyone remotely interested to look up some of those projects, and maybe not read the whole documentation, +but at least the 'Why-we-created-yet-another-data-orchestrator', 'Why-we-are-better-than-comparable-projects' as well as +'What-we-learned'-type documentation pages you come across. 'I-tried-project-XXX-and-it-is-crap'-blog posts +as well as hackernews comment-threads related to those projects are usually also interesting. The '/r/dataengineering' and +'/r/datascience' sub-reddits are ok. 
But they are on Reddit, so, signal-to-noise is a bit, well.. + +Among others, interesting projects include: + +- [dagster](https://github.com/dagster-io/dagster) +- [prefect](https://www.prefect.io/) +- [airflow](https://airflow.apache.org/) +- [luigi](https://github.com/spotify/luigi) + +- also relevant, but less data-engineering-y: Node-RED, Apache NiFi, IFTTT, Zapier, Huginn, ... + +## The 'workflow lifecycle' + +One thing that I think is also worth considering is the different stages in the lifecycle of a workflow. For illustration, +I'll describe how each of those stages relates to the way data science is currently done with Jupyter, which is probably the most used tool +in this space at the moment. + +### Workflow creation + +This is the act of designing and implementing a new workflow transformed into one or a set of defined outcomes (which can be new data, or just a visualization, doesn't matter). +The actual creation of the workflow is similar to developing a script or application, and offers some freedom on how to implement it (e.g. which supporting +libraries to choose, whether and which defaults to set, ...). + +In the Jupyter-case, this would be the iterative development of a Jupyter notebook, with one cell added after the other. One thing that is different for us +is that we will have a much stricter definition of the desired outcome of our workflow, whereas the creation of a Jupyter notebook is typically way more open-ended, +and a researcher would easily be able to 'follow some leads' they come across while working on the whole thing. This is a very important distinction that pays to +keep in mind, and I can't emphasize this enough: the workflows we are dealing with are a lot more 'static' than typical Jupyter notebooks, because we have decided in +advance which ones to implement, and how to implement them. There is not much we can do about this, and it's a trade-off with very little room to negotiate. 
This +has a few important implications on how our product is different from how data science is done by Jupyter users currently. I will probably mention this again +and again, because it is not intuitive at first, but has a big impact on how we view what we are building! + +As per our core assumptions, end-users won't create new workflows, this is done by a group with a yet-to-be-determined 'special' skill set. + +### Workflow execution + +This is when a 'finished' workflow gets run, with a set of inputs that are chosen by the user. The schema/type of those inputs is a requirement +that has to be considered by the user. It's possible that a workflow allows for inputs to be in multiple formats, to make the users life easier (e.g. allow both '.csv' as well as '.json' formats), +but that also has to be documented and communicated to users. It is not possible to add elements to a workflow, and make it do different things +than it was designed to do. Our workflows are static, they never change (except in an 'iterative-development' sense where we release new versions)! + +Compare that to a researcher who created their own Jupyter notebook: they will have run the workflow itself countless times by then, while developing it. +The execution phase is really only that last run that achieves the desired outcome, and which will 'fill' the notebook output cells with +the final results. That notebook state is likely to be attached to a publication. Often the data is 'hardcoded' into the notebook itself (for example +by adding the data itself in the git repo, and using a relative path to point to it in a notebook). +It is also possible, although not as common (as far as I've seen -- I might be wrong here) that researchers spend a bit more time on the notebook and +make the inputs easier to change, in order to be able to execute it with different parameters, quickly. 
This is more like what we will end up with, +although I'd argue that the underlying workflow is still much easier to change, fix, and adapt than will be the case with our application. + +One difference between workflow creation and execution is that the creation part is more common for 'data scientists', and the execution part is a bigger +concern for 'data engineers' (both do both, of course). I think our specific problem sits more in the data engineering than data science space (because +our main products are 'fixed'/'static' workflows), which is why I tend to look more for the tools used in that domain (data orchestrators, ...) than in the other +(Jupyter, ...) when I look for guidance. + + +### Workflow publication + +Once a workflow is run with a set of inputs that yield a meaningful outcome for a researcher, it can be attached to a publication in some way. +This has one main purpose: to document and explain the research methodologies that were used, on a different level than 'just' plain language. + +There is a long-term, idealistic goal of being able to replicate results, but the general sentiment is that it is unrealistic to attempt that at +this stage. It doesn't hurt to consider it a sort of 'guiding light', though. + +It is getting more and more common for researchers to attach Jupyter notebooks to a publication. Jupyter notebooks are a decent fit for this +purpose, because they contain plain-text documentation, the actual code, as well as the output(s) of the code in a single file, that has a +predictable, well-specified format (json, along with a required document schema). As our colleagues at the DHJ project have discovered, it's +not a perfect fit, but it can be bent to serve as the basis for a formal, digital publication. + +In our case, it is my understanding that we would like to have an artefact like this too, and even though it's not one of the 'core' requirements, +it would be a very nice thing to have.
One strong option is for us to re-use Jupyter notebooks for that. Depending on how we implement our +solution, we might already have one as our core element that 'holds' a workflow, in which case this is a non-issue. +Or, if that is not the case, we could 'render' a notebook from the metadata we have available, which should also not be too hard to do since the target +(the notebook) is well spec'ed. If that's the case, there is one thing I'd like to investigate before we commit though: what characteristics exactly are the +ones that make notebooks a good choice for that, and which one are detrimental? As I've mentioned, the DHJ project uses notebooks as the base +for creating article-(web)pages, and they came across some issues along the way. So I wonder: is there a better way to achieve the 'document and +explain research methodologies' goal than by using a Jupyter notebook? How would that look in a perfect world? How much effort would be involved? + + +## Interactivity / Long(-ish) running computations + +One component that is different in our scenario to other implementations is the requirement for interactivity. In data-engineering, +this is never an issue, you describe your pipeline, then you or someone else uses that with a set of inputs, and off it goes, +without any further interaction. *Plomp*, notification, results, rinse, repeat. + +For us that will be different, because we are creating a graphical user interface that reflects the workflow, and its state. +By definition, graphical user interfaces are interactive, and when a user triggers an action, they expect that to kick off +some instant response in the UI (maybe the change in a visualization, or a progress indicator, whatever). 
+ +### Computationally trivial/non-trivial + +One main difficulty will be to find a good visual way to express what is happening to the user, ideally in the same way +for 2 different scenarios: + +- computations that are computationally trivial, and will return a result back in a few seconds at most +- computations that take longer + +In our workflows, I can see a few different ways those interactions can play out, depending on the characteristics of any particular workflow. + +So, in the case where a user 'uploads' data or changes a setting: + + - *if the whole workflow is trivial, computationally*: + - this triggers the whole workflow to execute and return with a new state/result immediately, and the output elements reflect the new state without any noticeable delay + + - *if only some (or no) components of the workflows are trivial, computationally*: + - this triggers the execution of only parts of the workflow immediately (from the point of user input to the next non-trivial step of the workflow). + - all computationally non-trivial parts of the workflow will have some sort of "Process" button that users have to click manually to kick off those parts of the workflow (otherwise the UI would be locked for an undefined amount of time after every user input -- which would result in a very bad UX). + - alternatively, workflows with computationally non-trivial parts could have one 'global' "Process" button, which would trigger the execution of the whole workflow with all current inputs/settings. + +There will also be inputs that don't directly kick off any processing (like for example control buttons in a visualisation). I +think we can ignore those for now, because this is what UIs usually do, and this does not present a difficulty in terms of +the overall UI/UX (just like the 'computationally trivial' workflow scenario).
+ +### UI representations for the current workflow state + +#### tldr; + +In some cases it will be impossible for users to use a workflow fully interactively, because one or all workflow steps +will take too much time, which means the interactive session has to be interrupted. In those cases (depending on our setup +and other circumstances) we might need to add a 'job-management'/'queue' component to our application, which matches +running/completed jobs to users and 'sessions'/'experiments' (for lack of a better word). +We need to find visual metaphors for workflows and workflow steps to make that intuitive, ideally in a way so that those scenarios are not +handled too differently in comparison to how our 100%-interactive workflows are used and executed. +In addition, we have to make sure our backend can deal with all the scenarios we want to support. + +#### Details, skip if you want + +I'll include some suggestions on how all this could look visually, but those are in no way to be taken as gospel. Just +the most obvious (to me) visual elements to use, which I hope will make it easier to get my point across. +It's probably obvious that the important cases we have to care about are the ones where there is non-trivial computation.
+I think we can roughly divide them into 4 categories: + + - *execution time of a few seconds*: + - in this case a 'spinning-wheel'-progress indicator is probably enough + - backend-wise, we (probably) don't have to worry (although, it's not a given this will not crash a hosted app if we have too many users and computations are executed 'in-line') + - *execution time of a few minutes*: + - not long enough so that for example a browser session would expire + - in this case it would be good UX-wise to have a semi-exact progress indicator that either shows a 'done'-percentage, or remaining time + - on the backend-side, we need to separate three scenarios: + - local app: + - the computation can happen locally, either in a new thread, or a different process (we can also make use of multiple cores if available) + - hosted jupyter in some form or other: + - the running Jupyter kernel can execute the computation, which is probably a good enough separation to not affect the hosted UI + - hosted web app: + - there needs to exist some infrastructure we can use to offload the computation, it can't run on the same host as our service (which means a lot of added complexity) + - there is no need yet for authentication apart from that we need to be able to assign the result of the computation to individual sessions + - *execution time of a few hours*: + - long enough that a user will have left the computer in between, or closed a browser, etc.
+ - now the separation of backend-scenarios kicks in earlier, and also affects the front-end: + - local app: + - as in the case before, the UI would display a progress-indicator of some sort + - the computation would happen as a background process, and as long as the user does not shut down or restart the + computer there is no issue (the job should even survive a suspend/hibernate) + - hosted jupyter: + - difficult to say, the computation could either still happen in the running Jupyter kernel, or would have to be farmed out to an external service + - one issue to be aware of is that, depending on how it is configured, Jupyter might or might not kill a notebook process (and underlying kernel) + if there has been no activity in the browser for a while. We'd have to make sure this does not happen, or that we have some sort of user session + management (which should be entirely possible -- but of course increases complexity by quite a bit). The latter will also be necessary if a user + comes back to their session after having been disconnected in some way, because otherwise they'd lose their result. + - ui-wise there needs to be session and compute-job management, and a list of currently running and past jobs and links to the experiments that produced them + - hosted web app: + - as with the jupyter case, we'll need session as well as job management + - *execution time of more than a few hours (days, weeks)*: + - in all cases the computation now needs to be outsourced, and submitted to a compute service (cloud, HPC, local dask-cluster, whatever...) + - all cases need to implement some sort of session authentication and job management (which would probably be a bit more transparent to the user in the local case, but overall it would be implemented in a similar way in each scenario) + + +### Externally running computations + +One thing to stress is that 'outsourcing' computationally intensive tasks comes with a considerable amount of complexity.
+Nothing that can't be implemented, and there are several ways I can think of to do this. I'd still advise to be very aware of the +cost and complexity this incurs. I do believe we will have to add that in some form at some stage though, if we are in +any way successful and have people adopting our solution. Which means we have to include the issue in our architecture +design, even if we only plan to implement it later. diff --git a/docs/architecture/workflows/modularity/.ipynb_checkpoints/modularity-checkpoint.ipynb b/docs/architecture/workflows/modularity/.ipynb_checkpoints/modularity-checkpoint.ipynb new file mode 100644 index 000000000..18908b8c3 --- /dev/null +++ b/docs/architecture/workflows/modularity/.ipynb_checkpoints/modularity-checkpoint.ipynb @@ -0,0 +1,800 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "slideshow": { + "slide_type": "skip" + }, + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "IPython.notebook.set_autosave_interval(0)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Autosave disabled\n" + ] + } + ], + "source": [ + "%autosave 0\n", + "\n", + "import os\n", + "from rich.jupyter import print\n", + "from dharpa_toolbox.modules.utils import list_available_module_names, describe_module, print_module_desc, load_workflows, create_module\n", + "from dharpa_toolbox.utils import print_file_content, graph_to_image\n", + "from dharpa_toolbox.modules.workflows import DharpaWorkflow\n", + "from dharpa_toolbox.rendering.jupyter.renderer import PlainJupyterWorkflowRenderer, ModuleJupyterWorkflowRenderer\n", + "\n", + "base_path = os.path.abspath(\".\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## What's a workflow, really?\n", + "\n", + "- Jupyter is a very 
good tool to create non-trivial exploratory workflows\n", + "- there's a difference between 'dynamic' workflows, and 'static' ones\n", + "- Jupyter is usually used to create workflows in a 'dynamic' way\n", + "- also important (for us): interactivity\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Currently, Jupyter is one of the most used technologies in digital research to create workflows. Although there are exceptions,\n", + "in most cases it is used to explore a very specific research question. Jupyter is exceptionally good at that, which is the\n", + "reason it is so successful.\n", + "\n", + "From a computer-engineering perspective, Jupyter notebooks are 'just' simple scripts, and often they include anti-patterns like\n", + "global- as well as hard-coded variables, little to no encapsulation of functionality, etc. Which means that typically,\n", + "Jupyter notebooks have (relatively) little value to other researchers, and re-usability is low. This is an acceptable\n", + "trade-off though, because the problems they are solving are (usually) very niche and specific, so there is little downside\n", + "to tailor the code to the exact problem one is having. In addition, Jupyter notebooks are very good to document the workflow\n", + "itself, and communicate what is happening to the data (which is important for publication).\n", + "\n", + "If we want to create a tool that lets users run pre-created workflows, that equation changes though. Because, now the assumption\n", + "is that the (comparatively few) workflows we create will be useful in not just a very specific way. The goal is to identify\n", + "areas where people have (roughly) the same problem, and to solve that problem in a generic way that is useful to a\n", + "larger group of people. 
The workflow will typically be less important in relation to the overall research project a\n", + "researcher is working on (compared to a tailored, specific one), but from the perspective of a researcher it will also be\n", + "much less hassle and less expensive to use, since they don't have to create the workflow themselves, and someone else already\n", + "has thought about all the options and parameters that make sense, has done the validation and testing, etc. Also, they\n", + "don't have to learn programming if they don't already know it...\n", + "\n", + "This means that we are dealing now with a very 'static' workflow, compared to the 'dynamic' ones researchers with programming\n", + "skills can create and change themselves very easily. Everything that can happen in a workflow is known in advance, and\n", + "even though there can be 'forks' in the flow of data, those have to be defined, implemented and documented in advance.\n", + "And that difference is why we should not assume that Jupyter notebooks are as good a vessel to implement such a workflow\n", + "as they are in the other case, where all that can happen 'on the go'. It's still possible notebooks are a good fit here too,\n", + "but we can't use our normal experience with -- and intuition about -- Jupyter to make that case.\n", + "\n", + "One other point that is important to note is user interactivity. Usually, when developing a Jupyter notebook inputs (data as well\n", + "as parameters) are either hardcoded, or factored out into a variable that is changed on top of the notebook (or in some cells\n", + "further down). And by running or re-running certain cells, those variables are re-set or changed. This works fine for\n", + "dynamically creating a workflow (although, it's sometimes confusing, and one of the main criticisms against the Jupyter notebook\n", + "approach).
But, in a 'static' workflow, we need to make sure that a user can set or change all those inputs at any time, while\n", + "making sure that the 'internal' state of our workflow is known to our engine. At a minimum, we need to know that our state\n", + "is currently inconsistent after a user-input, and have a way to communicating that to the user so they can kick off\n", + "some re-processing manually, to make it consistent again. Jupyter supports interactivity via widgets, but the 'cell-based'\n", + "approach in notebooks is not a very good fit for that, because it forces a very simple one-after-the other processing model,\n", + "that would make it hard to implement the efficient execution of even remotely non-trivial workflows (for example having\n", + "parallel execution of some cells, or skipping the execution of parts that don't need to be executed currently, etc.)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Prior art\n", + "\n", + "- workflow/pipeline modelling and execution is a solved problem in programming:\n", + " - [flow-based programming (FBP)](https://en.wikipedia.org/wiki/Flow-based_programming)\n", + " - requires well defined, modular entities (with 'ports': input and output values)\n", + "- lots of (partial) implementations in data engineering:\n", + " - [airflow](https://airflow.apache.org/)\n", + " - [luigi](https://github.com/spotify/luigi)\n", + " - [dagster](https://github.com/dagster-io/dagster)\n", + " - [prefect](https://www.prefect.io/)\n", + " - many others: Node-RED, Apache NiFi, IFTTT, Zapier, Huginn, ...\n", + "- one subtle (although important) difference with our project, again: interactivity" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "There is a form of programming that fits our problem space 
fairly well: [flow-based programming (FBP)](https://en.wikipedia.org/wiki/Flow-based_programming).\n", + "Like functional programming, it's probably older than all of us, and it is gaining some notable traction again in recent years\n", + "(although with much less hype around it, and without being explicitly mentioned by name). A lot of the data orchestration\n", + "tools and frameworks that cropped up in recent years use some form or aspects of FBP, for example:\n", + "\n", + " - [airflow](https://airflow.apache.org/)\n", + " - [luigi](https://github.com/spotify/luigi)\n", + " - [dagster](https://github.com/dagster-io/dagster)\n", + " - [prefect](https://www.prefect.io/)\n", + "\n", + "One thing that FBP requires are well defined entities ('modules', 'nodes'), that have 'ports' (meaning: known inputs, and outputs).\n", + "A Jupyter notebook for example does not typically have that, which makes it hard to 'combine' notebooks in an FBP-like\n", + "manner. There are attempts to 'formalize' Jupyter notebooks in a way that would make them better fits in such scenarios\n", + "([papermill](https://papermill.readthedocs.io/en/latest/), [orchest](https://www.orchest.io/)), but in my opinion, although\n", + " they kind of work, those attempts are a bit clunky, and not very user-friendly (because they try to bend Jupyter into\n", + " something it was not designed to do). Also, they typically only deal with inputs; outputs are not very well defined at all.\n", + " Compare that for example with how a 'proper' data-orchestration tool like dagster handles [inputs and outputs](https://docs.dagster.io/tutorial/basics_solids),\n", + "which should make clear how many more options someone who implements a workflow execution and rendering framework (which\n", + "is basically what we are building) has when that sort of metadata is available.\n", + "\n", + "\n", + "As was the case in the section above, one difference in our case is interactivity. 
Most tools in that space assume they'll\n", + "get the input values for a workflow execution at the start, and then they can proceed to go through the workflow, batch-processing\n", + "style (meaning, no further user input half-way through). This is different for us, since we want users to be able to\n", + "interactively explore their data (within the limits of a 'static' workflow). This means we will have to consider how\n", + "to deal with long-running computations whose results will be available after minutes, hours, or weeks. The good thing is though,\n", + "whatever we come up with, we'll get a 'traditional workflow execution engine' for free, because every workflow that can\n", + "be executed interactively, will also be able to run 'batch-style'. This will let us re-use and 'move' our workflows to\n", + "other execution environments (HPC clusters, 'the cloud', ...) and do other interesting things with them if the need arises\n", + "(monte-carlo style experiments, automated-testing of workflows and modules, ...)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Modelling workflows\n", + "\n", + "- research data is more useful when it's structured, so why would workflow definitions be different?\n", + "- so: can we model a workflow as code, or even better: as data?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "So, assuming everyone agrees this is a reasonable avenue to explore, we have to think about how we want to model our\n", + "workflows.
We should definitely look at how other, similar frameworks do this, but I think one approach is very tempting:\n", + "\n", + "> ***Describe workflows as structured data!***\n", + "\n", + "There are several reasons why I think this would be a good idea:\n", + "\n", + "- structured data can be processed by every programming language in existence\n", + " - we would have one 'main' library that does the actual workflow execution/data processing (probably in Python)\n", + " - we could use other languages to do different other things in our 'ecosystem': e.g. JavaScript for dynamically rendering a frontend\n", + "- we can (largely) work independently of each other, the only thing to consult about is the schema of the workflow data\n", + "- such structured data can be displayed as a network graph, which is much easier to grasp than code\n", + "- automated testing of every workflow and model is easy, can be done in CI/CD\n", + "- Jupyter notebooks are, as I've explained above, pretty good at creating and manipulating structured data\n", + "- there are a lot of researchers out there who know how to use Jupyter: those could all be potential \"DHARPA-workflow\" creators\n", + "- in addition to that, we can decide to create a visual 'workflow editor/creator', that is independent from the 'workflow executor' part, and 100% optional\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Code!\n", + "\n", + "The following is using prototype-quality code to illustrate what a 'workflow-as-data' model could look like in practice. 
Only a few modules are implemented, the goal is to recreate the first part of the 'Topic-modelling' workflow: load some text files, tokenize them, then do some processing (lowercasing, removal of stopwords).\n", + "\n", + "### Definitions\n", + "\n", + "- ***module***: a module is an atomic entity that contains a fixed set of defined inputs and outputs, as well as a processing unit that converts the set of inputs to outputs, in a predictable way\n", + "\n", + "- ***workflow***: a workflow contains a set of modules which are connected in a specific way. A workflow is conceptually also a module, because it also contains a set of inputs/outputs as well as a processing unit, and it can be used in other, 'parent' workflows in the same way a normal module can." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['corpus_processing',\n", + " 'corpus_processing_simple',\n", + " 'dharpa_workflow',\n", + " 'file_reader',\n", + " 'input_files_processing',\n", + " 'lowercase_corpus',\n", + " 'remove_stopwords_from_corpus',\n", + " 'tokenize_corpus']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can list all available modules (and workflows)\n", + "list_available_module_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'tokenize_corpus': {'inputs': {'text_map': 'Dict'}, 'outputs': {'tokenized_text': 'Dict'}}}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can investigate each modules inputs and output specs\n", + "print_module_desc('tokenize_corpus')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "```yaml\n", + "---\n", + "modules:\n", + "\n", + "- type: tokenize_corpus\n", + "\n", + "- type: lowercase_corpus\n", + " input_map:\n", + " tokenized_text: tokenize_corpus.tokenized_text\n", + "\n", + "- type: remove_stopwords_from_corpus\n", + " input_map:\n", + " tokenized_text: lowercase_corpus.tokenized_text\n", + " workflow_outputs:\n", + " tokenized_text: processed_text_corpus\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# a workflow configuration is basically just a list of modules, incl. their input/output connections\n", + "workflow_config = f'{base_path}/workflows/corpus_processing_simple.yaml'\n", + "print_file_content(workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'lowercase_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__stopwords_list',\n",
+       "    'tokenize_corpus__text_map'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we create a 'workflow' object using the configuration data\n", + "workflow: DharpaWorkflow = DharpaWorkflow.from_file(workflow_config)\n", + "# we can investigate each workflows available input and output names\n", + "print(workflow.input_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
['processed_text_corpus']\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(workflow.output_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can display the execution and data-flow structures of a workflow graphically\n", + "graph_to_image(workflow.execution_graph)\n", + "# graph_to_image(workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'dharpa_workflow': {\n",
+       "        'inputs': {\n",
+       "            'lowercase_corpus__enabled': 'Bool',\n",
+       "            'remove_stopwords_from_corpus__enabled': 'Bool',\n",
+       "            'remove_stopwords_from_corpus__stopwords_list': 'List',\n",
+       "            'tokenize_corpus__text_map': 'Dict'\n",
+       "        },\n",
+       "        'outputs': {'processed_text_corpus': 'Dict'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# print the workflow input/output spec\n", + "print_module_desc(workflow)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'1': ['world'], '2': ['dharpa']}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# using the spec, we can set a workflows inputs and outputs manually\n", + "text_map = {\n", + " \"1\": \"Hello World!\",\n", + " \"2\": \"Hello DHARPA!\"\n", + "}\n", + "stopwords = [\n", + " \"hello\",\n", + " \"!\"\n", + "]\n", + "workflow.set_input(\"tokenize_corpus__text_map\", text_map)\n", + "workflow.set_input(\"lowercase_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__stopwords_list\", stopwords)\n", + "\n", + "# the workflow state is processed automatically, so we can always query the current output\n", + "output1 = workflow.get_output(\"processed_text_corpus\")\n", + "print(output1)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'corpus_processing',\n",
+       "    'corpus_processing_simple',\n",
+       "    'dharpa_workflow',\n",
+       "    'file_reader',\n",
+       "    'input_files_processing',\n",
+       "    'lowercase_corpus',\n",
+       "    'remove_stopwords_from_corpus',\n",
+       "    'tokenize_corpus'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can load workflows from json/yaml files on the file-system, and convert them to Python classes\n", + "load_workflows(f\"{base_path}/workflows\")\n", + "print(list_available_module_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'input_files_processing': {\n",
+       "        'inputs': {\n",
+       "            'files': 'Any',\n",
+       "            'make_lowercase': 'Bool',\n",
+       "            'remove_stopwords': 'Bool',\n",
+       "            'stopwords': 'List'\n",
+       "        },\n",
+       "        'outputs': {'processed_text_corpus': 'Dict'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# display the module spec for the 'input_files_processing' workflow\n", + "print_module_desc(\"input_files_processing\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'modules': [\n",
+       "        {\n",
+       "            'type': 'file_reader',\n",
+       "            'input_map': {'files': '__workflow_input__.files'},\n",
+       "            'id': 'file_reader'\n",
+       "        },\n",
+       "        {\n",
+       "            'type': 'corpus_processing',\n",
+       "            'input_map': {\n",
+       "                'text_map': 'file_reader.content_map',\n",
+       "                'make_lowercase': '__workflow_input__.make_lowercase',\n",
+       "                'remove_stopwords': '__workflow_input__.remove_stopwords',\n",
+       "                'stopwords': '__workflow_input__.stopwords'\n",
+       "            },\n",
+       "            'workflow_outputs': {'processed_text_corpus': 'processed_text_corpus'},\n",
+       "            'id': 'corpus_processing'\n",
+       "        }\n",
+       "    ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# create the workflow object\n", + "ifp_workflow = create_module('input_files_processing')\n", + "# display the internal structure of the workflow\n", + "print(ifp_workflow._workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAARkAAAD7CAYAAABe6+AqAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3de1RU5f4G8GdABBHk4oUOl3UADdCFq3NqpXZMM1ZiJ4uzNJI0MJd3zUuFEaKWkffUNDUzc3k0zYRlmsKSk50oU8JlZSQpYgdvZEpyEwflNt/fHzb758DA7IHZDOjzWWv+mHf25fu+e/PMzDvDbJ2ICIiINOJg7wKI6O7GkCEiTTFkiEhTHeo3FBYWIisryx61EFE7FxAQgEceecS0UerZvXu3AOCNN954s/oWHR1dP1KkwSsZI37oRETWeO6558y2c06GiDTFkCEiTTFkiEhTDBki0hRDhog0xZAhIk0xZIhIUwwZItIUQ4aINMWQISJNMWSISFMMGSLSFEOGiDTFkCEiTbXLkKmpqcHRo0ebtW5RURFSU1OxZMkSG1dFavEY3FvaVciUlpYiKSkJXl5eePTRR1Wts27dOiQmJiIiIgI9evRAbGwsRo0ahY8//ljjatXR6/UIDAxERkaG0jZgwAAkJCTYsarmu3O8Bw8ejPz8fJP+5OXlITk5uU0dg/ZOi/PF3HnZXO0qZLy8vLBkyRJ07txZ1fLvvfcekpKSsHjxYuzduxeDBg3CG2+8oXGV1nFyckJhYSEqKyuVtqCgILi4uNitpsLCwmatV3+8u3fvjvLycpP+hIWFYdWqVbYs956nxfli7rxsrkZ/Ga8t8/b2RlFRkcXlNm7cCD8/Pzg6OsLDwwN79uxpheqs07FjRwQHByM0NFRp27Vrl93qOX/+PMaOHYvDhw9bvW5j412/P87OzjaplW7T4nwxd142V7t6JWOtS5cuQafT2bsMi/r06YNevXrZuwz89ttvePrpp/HHH380a/32Mt6kjq3OyxaHTGZmJpydneHu7o5vv/0W5eXliIuLg06nw+OPP45ffvkFAHDixAn4+vriww8/BABcv34dr7/+OubOnYv4+HgMGzYM8fHxKCsrg8FgwDfffINXXnkFQUFBuHz5MoYMGYK//vWvKCsra1DDqlWr4OLigjlz5uDo0aNIT0/HtGnToNfrceXKFUybNk25b05TtbSkj5mZmQgICLD4quCll16Cs7MzDAYDUlNTMW7cODz22GMAgP3792PKlCkICAhAWVkZxo0bh27duqFv37744YcfAADZ2dmYM2cOgoKCcPXqVURHR6Nr167o27cvPvvsMwDA5s2b4eDgoIRARUUFVq9ebdL273//G7/88osyZmo1Nt4VFRUN+tOYW7duYcWKFZg4cSIefvhhDB06FLm5ucrj33//PQYMGIAZM2bgjTfegJOTU6PHsz5L42PpfLN0fgC35zAWLVqEuLg4zJ49G0OGDMHatWtt0r/GHmvu+WK0fv16xMXFYfr06XBxcYFOp1NuwP+fly3W2NUKrDF9+nRxcXGR8vJyERG5efOm+Pj4SGxsrLJMbW2tDB48WEREKioqJCQkRBYuXKg8XlRUJCEhI
RIcHCxFRUWSlZUlrq6uAkCWLl0qX375pUycOFFu3LghYWFhSo0lJSUSFxcnP//8c4O6AEhYWFiT7ZZqKSsra1YfRUQ+//xzcXV1lQMHDqgey4sXL5rUV1hYKG5ubgJAFi9eLBcuXJAdO3YIAOnfv7/U1dVJWlqadOrUSQDIzJkz5fDhw/LJJ5+Iu7u7AJCjR4+KiEjPnj0bHNv6bY2NmRrm1q3fn8aWnTRpkuTl5Sn3IyMjxcfHR65fvy4iIiEhIeLt7a08HhMTI0VFRRZrUjM+mZmZjZ5vV65csXh+1NTUyJAhQyQuLk4MBoOIiGzdulUAKMe+Jf1r6jFrzxejdevWiaOjoxQXF4uIyNKlSwWAxMfHWxzTxkRHR5u9WoFNQubUqVMCQN5//32lLSoqStzc3KSiokJERPbv3y+bNm0SEZF58+YJAPn9999NtrN9+3YBIAkJCSIiEhoaKgCkpKTEZDljyBQUFMiECRPkjz/+MFuXmpBRW4u1fTSqra01W1tT6tdtHIc7+fj4iLOzs3I/JCREAIher1fa1qxZIwDk+eefFxExCWej+m22DpnG2u9sO3bsWKOX2EhLSxMRke7duwsAWbt2rRgMBsnNzVX+QNVQMz7mzjc158fq1asFgJw5c0Z5vLa2VrZu3SqlpaUt7p+lvjfnfImKihIHBweprq4WEZHc3FwBIAMGDFA9pvU1FjI2mZPp3bs3IiIisGnTJgDAhQsXUFdXh+rqamVSavv27YiNjQUA5Tsu7u7uJtsZPHgwACgXlzO+bPPy8jK73+HDh0Ov16Nbt27Nrl1tLdb20cjR0bHZtRmZm+fw8vJCVVWVct/B4fahdHV1VdqioqIAAGfPnm1xDVo6fvw4wsPDIbef9Exuw4cPB3B7Utnd3R2zZ89Gv379cOPGjQbHrClqxsfc+abm/Pj6668BAP7+/srjjo6OGDduHDw9PVvcP2v7ruZ8GTp0KAwGA9LT0wFA+XQqIiKi0e02l80mfmfMmIGcnBwcP34cy5cvx4oVKzBy5Ehs3rwZp06dQmBgoHKAjQf8/PnzJtvw8fEBAHh4eKja58qVK7F7924sX7682XVbU4s1fWwLfH19Ady+ql9bVlxcjIKCArMflxoMBgDAs88+i59++gnDhg3D999/j0GDBmHbtm0t2q+a8VFzfly9ehVA42He0v5p0fcZM2bgo48+woQJE/Daa68hPj4eycnJSE5ObtF2zbFZyERFRSEgIAALFy6EXq9Hnz59MHXqVBw/fhzTp083mUg0PgsYU9To0qVLAIAnnnhC1T6feuopJCUlISkpCQcPHmxW3dbUYk0fjerq6ppVly0UFxcD+P8+GJ/hqqurAdy+gF95ebnJOjqdDrW1ta1Y5e3vzlRWVjZ4sjh9+jTWr18PAHjzzTcRHByMjIwM7Nq1CzU1NZg/f36L9lt/fMxRc3488MADAIDFixebXBTxwoULOHjwYIv7p0Xf6+rqkJubi+zsbLzzzjvYt28fFixYYJNX3g3Uf//UnDkZo0WLFolOp5Pc3FylLSwsTJ555hmT5SorKyU8PFz8/f1N3uvOnj1bBg4cKDU1NSIiEhgYKADkxo0bJusHBQUJADEYDFJbWysRERHi6ekpJ06cUJYpKSkRABIcHNxg3wAkMDDQqlqs7aOISFpamri5ucnBgwebHrg7VFRUCADx9fVV2ozjcCc/Pz8BoNRnnFu5cw5o27Zt8tBDDynLjBgxQgDIggUL5OzZs/Luu++Kt7e3AJCMjAypq6uTXr16SefOneXixYuqaxZpfLzN9af+Mbh165YEBwcLABk/frzs3LlT5s+fL5GRkcrcg6urq5SWloqISE1NjXh4eJhMZFqiZnzMnW9qzo+CggLp3LmzAJCIiAjZsGGDLFiwQKZMmSIGg6HF/WvqseaeL8nJydKzZ0/ZsmWLZGRkSFZWluTn5zdrDtFI04lfo2vXrsmrr75q0rZ161bJzs5usGxFRYUkJCRIZ
GSkxMfHS0JCgiQnJ0tVVZXo9XpJTk5WJscmT54sJ06ckJKSEnn77bdFp9MJAFmyZIn89ttvyiRcly5dZOnSpXLkyBGZOnWqABAHBwd56623JCcnRwoKCmTWrFnKdtesWSOlpaVN1tKSPh46dEh8fX3lq6++UjV+er1e5s6dq9S3evVqWbZsmXJ/0aJFUl5erkxYApDExES5efOm8ke0cuVKuXbtmhQVFcmyZctM/mDy8/Olf//+0rlzZ4mMjJT8/HwZNGiQxMXFyaeffipVVVUyd+5c+ctf/iJ79uxRVbOIyMmTJ82Ot7n+5OTkmD0G58+fl6ioKPH29pb77rtPJk+ebDKhD0AefPBBWbZsmbzwwgvy9NNPy7lz51TX2NT4NHa+Gak5P06ePCnDhg0TLy8v8fPzk5dffln5JFJEWtS/xh5ryfly6NAh8fHxaTAR3b17d6uO/Z0aCxndn51QpKSkICYmhtfCbmd69+6NvLw8HrdGcHxMbd26FdeuXcNrr70G4Pbc0OXLl5GZmYk5c+Yo80zWMF4LOzU11aS9Xf5bAbUeNd/gzcvLs8nXz5tDbX30/5YvX47ExERlTgq4PcHt7++PRx99FH5+fjbd3139bwX3EuO3Q9V+C1YtMfOxa/2bvQLGmvq0Gp/26MiRIwCADz74wCRofvzxRyQmJmLHjh023R9Dpp3T6/WYN2+e8mnHrFmzkJ2dbeeq2g6OT0Pbtm3DzJkzsWXLFvj7+2PgwIEYNWoUfvzxR+zYsQN9+vSx6f44J0NENtHYnAxfyRCRphgyRKQphgwRaYohQ0SaYsgQkaYYMkSkKYYMEWmKIUNEmmLIEJGmGDJEpCmGDBFpiiFDRJpiyBCRphr90aqUlJTWrIOI2rnCwkKTy8IYNRoyMTExmhZERHef6OjoBm0Nfk+GqCk6nQ67d+/GqFGj7F0KtROckyEiTTFkiEhTDBki0hRDhog0xZAhIk0xZIhIUwwZItIUQ4aINMWQISJNMWSISFMMGSLSFEOGiDTFkCEiTTFkiEhTDBki0hRDhog0xZAhIk0xZIhIUwwZItIUQ4aINMWQISJNMWSISFMMGSLSFEOGiDTFkCEiTTFkiEhTDBki0hRDhog0xZAhIk0xZIhIUwwZItJUB3sXQG3Xzz//jJqamgbtBQUF+OGHH0zaQkND4ebm1lqlUTuiExGxdxHUNo0YMQL79u2zuJyLiwuuXr2KLl26tEJV1N7w7RI1avTo0RaXcXR0xFNPPcWAoUYxZKhRUVFRcHV1bXIZg8GA2NjYVqqI2iOGDDXKxcUFI0eOhJOTU6PLdOrUCf/85z9bsSpqbxgy1KQxY8aYnfwFACcnJ8TExMDFxaWVq6L2hCFDTRo6dCi8vLzMPlZTU4MxY8a0ckXU3jBkqEkdOnTA888/j44dOzZ4zMvLC0OGDGn9oqhdYciQRaNHj0Z1dbVJW8eOHTF27Fh06MCvWlHT+D0ZskhE4O/vj8uXL5u0Z2dno3///naqitoLvpIhi3Q6HWJjY00+ZfL390e/fv3sWBW1FwwZUmX06NHKp0wdO3bEiy++CJ1OZ+eqqD3g2yVS7f7778evv/4KADh58iTCw8PtXBG1B3wlQ6qNHTsWABAWFsaAIdXu2o8GCgsLkZWVZe8y7iqenp7Q6XT4+9//jpSUFHuXc1cJCAjAI488Yu8yNHHXvl1KSUlBTEyMvcsgUiU6Ohqpqan2LkMTd+0rGaO7NEPt5rvvvrtrn3Ht5bnnnrN3CZrinAxZhQFD1mLIEJGmGDJEpCmGDBFpiiFDRJpiyBCRphgyRKQphgwRaYohQ0SaYsgQkaYYMkSkKYYMEWmKIUNEmmLIEJGmGDJtQHl5uVXLFxUVITU1FUuWLNGoIuu1xZqaw9pjQZYxZOykqqoKS5YswT/+8Q907dpV9Xp5eXlITk7GqFGj8PHHH9u0Jr1ej8DAQGRkZJi0r1u3DomJiYiIiMDgwYORn5+PAQMGICEhQfOaW
kNzj4U5jY3hvYwhYyfOzs549dVXcebMGdTV1aleLywsDKtWrdKkJicnJxQWFqKyslJpe++995CUlITFixdj79696N69O8rLyxEUFKRcA1vLmlpDc4+FOebG8F531/8yXlvm4uKCHj16oKSkxKr1nJ2dNamnY8eOCA4ORmhoqNK2ceNG+Pn5wdHRER4eHtizZw8AYNeuXa1SU2tp7rGoz9wY3uv4SoZM9OnTB7169VLuX7p0iddXslL9MbzX8ZXMnyorK7F3716kp6fjwoULWLVqFaZPn46SkhLs3LkT3bt3x+uvv44jR46gW7du2LFjBx566CFl/evXr2Px4sVwcHBAdXU1cnNzER4ejgULFsDT0xMAcPPmTcyfPx83btxAjx49UFdXB71er2xj8+bNmDJlCkQEIoKKigps3rwZc+bMUdoac+vWLbz33nvIz89HTk4OPD098e677yqXLsnMzMTYsWOxc+dODB48uNHtvPTSS3B2dkZ6ejrS0tKg1+tx5coVTJs2DQCwYsUKZGRkID09HefOncM333zT7JqaYjAY8O2332Lfvn3Yt28fjh49ijFjxuDcuXPIycmBi4tLk9s+e/YskpKS0LNnT1y+fBnnz5/Hhg0b0LdvX1XHoqn6+/Tp02RtxjGkP8ldavfu3WJN9wwGg/z6668CQDw8PCQ9PV1OnTolACQwMFDeeecdKS8vlxMnTggAGTJkiLJuRUWFhISEyMKFC5W2oqIiCQkJkeDgYCkrK5Pa2lrp37+/TJo0SVnmf//7n3To0MGkzp49ezao21wbAAkLC1PuT5o0SfLy8pT7kZGR4uPjI9evXxcRkc8//1xcXV3lwIEDqsfE3H5ERC5evGi23dqamlJVVSVZWVni6uoqAGTp0qXy5ZdfysSJE+XGjRsWt33//fdLz549RUSkpqZGPD09JTw8XERE9bFobB/Xrl1rsjZrRUdHS3R0tNXrtRcMmXrq/6H4+fk12E6PHj3E09NTuT9v3jwBIL///rvJctu3bxcAkpCQIOvXrxcAcvr0aZNlQkJCTLYfFhbWYH/m2u6s89ixYwLA7C0tLU1Zp7a21pqhaLAfS+3NqcmS0NBQASAlJSVKm5ptr169Wnbt2iUit59AevbsKU5OTiIiqo6Fmn2Yq6057vaQ4dslC9zd3Ru0eXt7Iy8vT7l/9OhRs8sa35ZkZWUpywcGBpos4+DQ8mmx48ePIzw8HCdPnmxyOUdHxxbvSy21NVlinA/y8vKyatuvvPIK9Ho93n//fZSUlKCqqkq5lvcXX3wBoOljoWYf5mqjhjjxawPGk/P8+fMm7T4+PgAADw8P/PbbbwCA4uJim++/uLgYBQUFZj82NRgMNt+fGlrWpGbbx48fR9++fREcHIz58+fDzc1NWUbNsWiLY9peMWRswPiKJT093aT90qVLAIAnnngCYWFhZpepz/jsWF1dDeD2xeksfQs1LCwMlZWVWL58uUn76dOnsX79euV+S78DYg21NWm17bFjx6KmpgZPPvkkANNgUHMstKz/nmPv92taac6czM2bNwWAhIaGKm3GSdeKigqlLTAwUABIXV2diIhUVlZKeHi4+Pv7m8zLzJ49WwYOHCg1NTXy008/SYcOHaRr166SkZEhlZWV8tVXX0mXLl0EgJw7d05EREaMGCEAZMGCBXL27Fl59913xdvbWwBIRkaG1NXVSWVlpTIhLSJy69YtCQ4OFgAyfvx42blzp8yfP18iIyOVidC0tDRxc3OTgwcPqh6PkpISASDBwcEm7RUVFQJAfH19lbbm1KSGcazvnFBVs20PDw/R6XTyxRdfyM6dO6VHjx4CQI4dOyZpaWkWj4WafZirrTnu9jkZhsyfrl69Kq+++qoAEGdnZ/nyyy/lP//5j/KJw6xZs6S4uFjWrVsnOp1OAMiKFSvk2rVrInL7Dy8hIUEiIyMlPj5eEhISJDk5WaqqqpR9HD58WAYOHCju7u4SHBwsy5Ytk8GDB8vUq
VPlv//9r9TV1Ul+fr70799fOnfuLJGRkZKfny+DBg2SuLg4+fTTTyUvL09mzZqlTEKuWbNGSktL5fz58xIVFSXe3t5y3333yeTJk+WPP/5Q9n3o0CHx9fWVr776StV4nDx5UqZOnSoAxMHBQd566y3JyckRvV4vc+fOVfa/evVqycnJaVZNTdHr9ZKcnKxsc/LkyXLixAnlcUvb3rBhg3h4eEi/fv0kOztb1q5dK15eXvKvf/1LiouLVR2LxvZhqTZr3e0hoxO5Oy8WnZKSgpiYGF4Lm9o847WwU1NT7VyJNjgnQ3ah0+ks3s6cOWPvMskG+BE22QVfYd47+EqGiDTFkCEiTTFkiEhTDBki0hRDhog0xZAhIk0xZIhIUwwZItIUQ4aINMWQISJNMWSISFMMGSLSFEOGiDTFkCEiTTFkiEhTd/3vyaSkpNi7BKImFRYWwt/f395laOauD5mYmBh7l0BkUXR0tL1L0Mxd+xu/pA2dTofdu3dj1KhR9i6F2gnOyRCRphgyRKQphgwRaYohQ0SaYsgQkaYYMkSkKYYMEWmKIUNEmmLIEJGmGDJEpCmGDBFpiiFDRJpiyBCRphgyRKQphgwRaYohQ0SaYsgQkaYYMkSkKYYMEWmKIUNEmmLIEJGmGDJEpCmGDBFpiiFDRJpiyBCRphgyRKQphgwRaYohQ0SaYsgQkaYYMkSkKYYMEWmqg70LoLbr559/Rk1NTYP2goIC/PDDDyZtoaGhcHNza63SqB3RiYjYuwhqm0aMGIF9+/ZZXM7FxQVXr15Fly5dWqEqam/4dokaNXr0aIvLODo6Yvjw4QwYahRDhhoVFRUFV1fXJpcxGAyIjY1tpYqoPWLIUKNcXFwwcuRIODk5NbpMp06d8OSTT7ZiVdTeMGSoSWPGjDE7+QsATk5OiImJgYuLSytXRe0JQ4aaNHToUHh5eZl9rKamBmPGjGnliqi9YchQkzp06IDRo0ebfcvUtWtXPP7443aoitoThgxZNHr06AZvmTp27IjY2Fg4OjraqSpqL/g9GbJIRODv74/Lly+btGdnZ6N///52qoraC76SIYt0Oh3i4uJM3jL5+/ujX79+dqyK2guGDKly51umjh074sUXX4ROp7NzVdQe8O0SqXb//ffj119/BQCcPHkS4eHhdq6I2gO+kiHVXnzxRQBA7969GTCkms3+C7uwsBBZWVm22hy1QR4eHtDpdPjb3/6GlJQUe5dDGgoICMAjjzxim42JjezevVsA8MYbb3fBLTo62lbRIDb/PRnhFM9d7bvvvrPdMxy1Sc8995xNt8c5GbIKA4asxZAhIk0xZIhIUwwZItIUQ4aINMWQISJNMWSISFMMGSLSFEOGiDTFkCEiTTFkiEhTDBki0hRDhog0xZAhIk3dUyFTU1ODo0ePNmvdoqIipKamYsmSJTauiujudk+ETGlpKZKSkuDl5YVHH31U1Trr1q1DYmIiIiIi0KNHD8TGxmLUqFH4+OOPNa5WHb1ej8DAQGRkZNi7FLKBAQMGICEhwabbbCvnyD0RMl5eXliyZAk6d+6savn33nsPSUlJWLx4Mfbu3YtBgwbhjTfe0LhK6zg5OaGwsBCVlZX2LoVsICgoyObXFG8r54jNfxmvLfP29kZRUZHF5TZu3Ag/Pz84OjrCw8MDe/bsaYXqrNOxY0cEBwcjNDTU3qWQDezatcvm22wr58g98UrGWpcuXWoX1xTq06cPevXqZe8yqA1rC+eI3UImMzMTzs7OcHd3x7fffovy8nLExcVBp9Ph8ccfxy+//AIAOHHiBHx9ffHhhx8CAK5fv47XX38dc+fORXx8PIYNG4b4+HiUlZXBYDDgm2++wSuvvIKgoCBcvnwZQ4YMwV//+leUlZU1qGHVqlVwcXHBnDlzcPToUaSnp2PatGnQ6/W4cuUKpk2bptw3p6laWtLHzMxMBAQE4PDhw02O4UsvvQRnZ2cAt99/L1q0CHFxcZg9ezaGDBmCtWvXq
qq1qXHLyMjAnDlzEBQUhKtXryI6Ohpdu3ZF37598dlnnwEANm/eDAcHByWYKyoqsHr1apM2APj+++8xYMAAzJgxA2+88QacnJwaHdv6srOzm6zD0rG3dKzUjOGtW7ewYsUKTJw4EQ8//DCGDh2K3NxcVf1r7DGDwYDU1FSMGzcOjz32GABg//79mDJlCgICAlBWVoZx48ahW7du6Nu3L3744QeTcVm/fj3i4uIwffp0uLi4QKfTKbf654jd2OoXyY1XK7DG9OnTxcXFRcrLy0VE5ObNm+Lj4yOxsbHKMrW1tTJ48GAREamoqJCQkBBZuHCh8nhRUZGEhIRIcHCwFBUVSVZWlri6ugoAWbp0qXz55ZcyceJEuXHjhoSFhSk1lpSUSFxcnPz8888N6gIgYWFhTbZbqqWsrKxZfRQR+fzzz8XV1VUOHDigahxrampkyJAhEhcXJwaDQUREtm7dKgDkwIEDzR638ePHy+7du6VTp04CQGbOnCmHDx+WTz75RNzd3QWAHD16VEREevbs2eD4128LCQkRb29v5X5MTIwUFRVZ7F9dXZ2kpaU1WUdmZmajx/7KlSsWj5WlMRQRmTRpkuTl5SnbiIyMFB8fH7l+/brF/jX12MWLF03OrcLCQnFzcxMAsnjxYrlw4YLs2LFDAEj//v2Vbaxbt04cHR2luLhYRESWLl0qACQ+Pt7imDYlOjraplcrsGvInDp1SgDI+++/r7RFRUWJm5ubVFRUiIjI/v37ZdOmTSIiMm/ePAEgv//+u8l2tm/fLgAkISFBRERCQ0MFgJSUlJgsZwyZgoICmTBhgvzxxx9m61ITMmprsbaPRrW1tWZrM2f16tUCQM6cOWOy/tatW6W0tLTF4xYSEiIARK/XK21r1qwRAPL888+LiJgEuFH9tu7duwsAWbt2rRgMBsnNzVX+QNVQU4e5Pqjpv6UxPHbsWKOXD0lLS7PYP0t9r3/OGftxJx8fH3F2dlbuR0VFiYODg1RXV4uISG5urgCQAQMGqB5Tc2wdMnadk+nduzciIiKwadMmAMCFCxdQV1eH6upqZSJs+/btiI2NBQDlOy7u7u4m2xk8eDAAKBeXM75U9PLyMrvf4cOHQ6/Xo1u3bs2uXW0t1vbRyNHRUXUtX3/9NQDA39/fZP1x48bB09OzxePm4HD7NHF1dVXaoqKiAABnz55VXefGjRvh7u6O2bNno1+/frhx40aDmpqipg5zfVDTf0tjePz4cYSHh0NuPzGb3IYPH26xf9b23dycoJeXF6qqqpT7Q4cOhcFgQHp6OgAon05FREQ0ul17sPvE74wZM5CTk4Pjx49j+fLlWLFiBUaOHInNmzfj1KlTCAwMVE4q40l2/vx5k234+PgAuH2FQzVWrlyJ3bt3Y/ny5c2u25parOljc1y9ehVA43/wthq3O/n6+gK4faVBtZ599ln89NNPGDZsGL7//nsMGjQI27Zts3rf1tahpv+WxiCLyH4AAAUNSURBVLC4uBgFBQVmPw42GAwAmu6fFn2fMWMGPvroI0yYMAGvvfYa4uPjkZycjOTk5BZt19bsHjJRUVEICAjAwoULodfr0adPH0ydOhXHjx/H9OnTMW3aNGVZ4zOPMbmNLl26BAB44oknVO3zqaeeQlJSEpKSknDw4MFm1W1NLdb00aiurk51LQ888AAAYPHixSYX17tw4QIOHjxos3G7U3Fxscm6xmfe6upqALcv8ldeXm6yzptvvong4GBkZGRg165dqKmpwfz5863ed1N1mKOm/5bGMCwsDJWVlQ2emE6fPo3169db7J8Wfa+rq0Nubi6ys7PxzjvvYN++fViwYIFVr4Jbha3edzVnTsZo0aJFotPpJDc3V2kLCwuTZ555xmS5yspKCQ8PF39/f5P317Nnz5aBAwdKTU2NiIgEBgYKALlx44bJ+kFBQQJADAaD1NbWSkREhHh6esqJEyeUZUpKSgSABAcHN9g3AAkMDLSqFmv7KCKSlpYmb
m5ucvDgwaYH7k8FBQXSuXNnASARERGyYcMGWbBggUyZMkUMBkOLx804t3LnPNG2bdvkoYceUtYdMWKEAJAFCxbI2bNn5d133xVvb28BIBkZGVJXVyeurq5SWloqIrcnqz08PEwmMi1RU4e5Pqjpv6UxvHXrlgQHBwsAGT9+vOzcuVPmz58vkZGRytxKU/1r6rGKigoBIL6+vkptxn7cyc/PTwAofU1OTpaePXvKli1bJCMjQ7KysiQ/P9+q+Txz7qqJX6Nr167Jq6++atK2detWyc7ObrBsRUWFJCQkSGRkpMTHx0tCQoIkJydLVVWV6PV6SU5OVibkJk+eLCdOnJCSkhJ5++23RafTCQBZsmSJ/Pbbb8rEX5cuXWTp0qVy5MgRmTp1qgAQBwcHeeuttyQnJ0cKCgpk1qxZynbXrFkjpaWlTdbSkj4eOnRIfH195auvvlI9hidPnpRhw4aJl5eX+Pn5ycsvv6x8otXccTMy/nGvXLlSrl27JkVFRbJs2TKTP+T8/Hzp37+/dO7cWSIjIyU/P18GDRokcXFx8umnn0pVVZUAkAcffFCWLVsmL7zwgjz99NNy7tw51X1sqg5LfVBzrCyN4fnz5yUqKkq8vb3lvvvuk8mTJ5t8eNBU/xp7TK/Xy9y5c5W6V69eLcuWLVPuL1q0SMrLy5UJbgCSmJgoN2/elEOHDomPj0+Dieju3bvLnj17VI9rfbYOGd2fA9BiKSkpiImJ4bWw70K9e/dGXl6e3Y9tW6mjrdi6dSuuXbuG1157DcDtuaHLly8jMzMTc+bMUeaZrGW8FnZqaqpN6ryn/q2A2iY1367Oy8trhUraj+XLlyMxMVGZkwJuT3D7+/vj0UcfhZ+fnx2rM2X3iV9q+4zfWlX77VxriZmPhevfQkNDNa+jPTly5AgA4IMPPjAJmh9//BGJiYnYsWOHvUprgCFDjdLr9Zg3b57yKcysWbOQnZ19z9bRlmzbtg0zZ87Eli1b4O/vj4EDB2LUqFH48ccfsWPHDvTp08feJSo4J0NEJmw9J8NXMkSkKYYMEWmKIUNEmmLIEJGmGDJEpCmGDBFpiiFDRJpiyBCRphgyRKQphgwRaYohQ0SaYsgQkaYYMkSkKZv/aFVKSoqtN0lEraiwsNDk0jAtZfOQiYmJsfUmiaiVRUdH22xbNvs9GSIiczgnQ0SaYsgQkaYYMkSkqQ4AbPNDnkREZvwfhOKRqPItXy4AAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# display the execution graph for the workflow\n", + "graph_to_image(ifp_workflow.execution_graph)\n", + "# graph_to_image(ifp_workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "797d10365048493281e703a19d548681", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GridspecLayout(children=(Label(value='files', layout=Layout(grid_area='widget001')), FileUpload(value={}, desc…" + ] + }, + 
"metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "794daa53740a42b5848d45e1ec0cac2b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# auto-render input and output-widgets for the workflow (only works when executed in Jupyter)\n", + "renderer = PlainJupyterWorkflowRenderer(workflow=ifp_workflow)\n", + "renderer.render()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Advantages\n", + "\n", + "- each interactive workflow is automatically a 'batch-style' one\n", + "- ability to work on different parts of the application almost 100% independently (workflow engine, workflow renderers (UIs), plugins (submitting long running jobs to a cluster, metadata augmentation/data lineage, metric gathering, ...))\n", + "- comparatively few dependencies for the base system / lightweight, but very extensible\n", + "- no dependency on any one UI/frontend technology (can use Jupyter/React/QT/...), can run without any UI at all\n", + "- complex workflows can be broken up into separate pieces, and developed and tested individually\n", + "- fairly high re-usability\n", + "- easy to use in an agile, iterative development process\n", + "\n", + "\n", + "### Disadvantages\n", + "\n", + "- potentially lower performance due to having to create copies of inputs/outputs to make sure they (or items contained within them) are not changed by subsequent steps\n", + "- harder to create very customized UIs\n", + "- there's a limit to the complexity of workflows that can be supported realistically (e.g. 
hard to implement control structures like if-then-else or loops on a workflow level/outside of modules -- but in a lot of cases that is probably not necessary)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
True\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from asyncclick import MultiCommand\n", + "\n", + "class B(MultiCommand):\n", + " attr_2 = 0\n", + "\n", + "class C(B):\n", + " attr_3 = 0\n", + "\n", + "c = C()\n", + "t = isinstance(c, B)\n", + "print(t)" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + }, + "rise": {} + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/workflows/modularity/.ipynb_checkpoints/modularity_2-checkpoint.ipynb b/docs/architecture/workflows/modularity/.ipynb_checkpoints/modularity_2-checkpoint.ipynb new file mode 100644 index 000000000..1746bb09c --- /dev/null +++ b/docs/architecture/workflows/modularity/.ipynb_checkpoints/modularity_2-checkpoint.ipynb @@ -0,0 +1,723 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "IPython.notebook.set_autosave_interval(0)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Autosave disabled\n" + ] + } + ], + "source": [ + "%autosave 0\n", + "\n", + "import os\n", + "from rich.jupyter import print\n", + "from dharpa_toolbox.modules.utils import list_available_module_names, describe_module, print_module_desc, load_workflows, create_module, create_workflow\n", + "from dharpa_toolbox.utils import print_file_content, graph_to_image\n", + "from dharpa_toolbox.modules.workflows import DharpaWorkflow\n", + "from 
dharpa_toolbox.rendering.jupyter.renderer import PlainJupyterWorkflowRenderer, ModuleJupyterWorkflowRenderer\n", + "\n", + "base_path = os.path.abspath(\".\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## What's a workflow?\n", + "\n", + "- a workflow is a tool to transform data into more structured data\n", + " - 'more data' -- we'll create what can be considered 'new' data out of the existing set\n", + " - 'better structured' -- improve (and replace) the current structure (fix errors, etc.)\n", + " - 'more structure' -- augment existing data with additional structure\n", + " - secondary outcomes (insight, new research questions, etc...)\n", + "- workflows are 'just' simple scripts\n", + " - no or only very minimal control structures, except from input/output connections\n", + " - low computational complexity for the workflow itself\n", + " - high(er) computational complexity within modules (but hidden from workflow creator)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Workflows in our context\n", + "\n", + "- Jupyter is a very good tool to create non-trivial exploratory workflows\n", + "- there's a difference between 'dynamic' workflows, and 'static' ones\n", + "- Jupyter is usually used to create workflows in a 'dynamic' way\n", + "- more 'data engineering' than 'data science'\n", + "- interactivity:\n", + " - not an issue in 'data engineering'\n", + " - not very well supported in Jupyter (cell-based approach not useful for us)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Two options: monolithic & modular\n", + "\n", + "### Monolithic\n", + "\n", + " - well integrated, no restrictions 
UI-wise\n", + " - complexity spread out across the workflow\n", + "\n", + "### Modular\n", + "\n", + " - complexity concentrated in the framework, but simple modules\n", + " - some restrictions on the UI\n", + "\n", + "## Suggested approach\n", + "\n", + " - 100% modular backend\n", + " - modular frontend, incl. optional monolithic frontend approach for high-value workflows" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "notes" + } + }, + "source": [ + "Notes:\n", + "\n", + " - only important for the workflow part, there will be features that won't be affected by this at all (metadata, data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Definitions\n", + "\n", + "- ***module***:\n", + " - a module is an atomic entity that contains a fixed set of defined inputs and outputs,\n", + " - as well as a processing unit that converts the set of inputs to outputs, in a predictable way\n", + "\n", + "- ***workflow***:\n", + " - a workflow contains a set of modules which are connected in a specific way\n", + " - a workflow is conceptually also a module, because it also contains a set of inputs/outputs as well as a processing unit\n", + " - it can be used in other, 'parent' workflows in the same way a normal module can.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Modelling workflows\n", + "\n", + "- research data is more useful when it's structured, so why would workflow definitions be different?\n", + "- so: can we model a workflow as data?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "notes" + } + }, + "source": [ + "Notes:\n", + "\n", + "- monolithic workflows are not structured (comparison:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "notes" + } + }, + "source": [ + "## Advantages\n", + "\n", + " - scalability\n", + " - disposible middleware and frontends (only important to be able to use the created workflows)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Examples" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['dharpa_workflow',\n", + " 'file_reader',\n", + " 'lowercase_corpus',\n", + " 'remove_stopwords_from_corpus',\n", + " 'tokenize_corpus']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can list all available modules (and workflows)\n", + "list_available_module_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'tokenize_corpus': {'inputs': {'text_map': 'Dict'}, 'outputs': {'tokenized_text': 'Dict'}}}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can investigate each modules inputs and output specs\n", + "print_module_desc('tokenize_corpus')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "```yaml\n", + "---\n", + "modules:\n", + "\n", + "- type: tokenize_corpus\n", + "\n", + "- type: lowercase_corpus\n", + " input_map:\n", + " tokenized_text: tokenize_corpus.tokenized_text\n", + "\n", + "- type: remove_stopwords_from_corpus\n", + " input_map:\n", + " tokenized_text: lowercase_corpus.tokenized_text\n", + " workflow_outputs:\n", + " tokenized_text: processed_text_corpus\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# a workflow configuration is basically just a list of modules, incl. their input/output connections\n", + "workflow_config = f'{base_path}/workflows/corpus_processing_simple.yaml'\n", + "print_file_content(workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'lowercase_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__stopwords_list',\n",
+       "    'tokenize_corpus__text_map'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we create a 'workflow' object using the configuration data\n", + "workflow = DharpaWorkflow.from_file(workflow_config)\n", + "# we can investigate each workflows available input and output names\n", + "print(workflow.input_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
['processed_text_corpus']\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(workflow.output_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can display the execution and data-flow structures of a workflow graphically\n", + "graph_to_image(workflow.execution_graph)\n", + "# graph_to_image(workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'1': ['world'], '2': ['dharpa']}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "text_map = {\n", + " \"1\": \"Hello World!\",\n", + " \"2\": \"Hello DHARPA!\"\n", + "}\n", + "stopwords = [\n", + " \"hello\",\n", + " \"!\"\n", + "]\n", + "\n", + "workflow = DharpaWorkflow.from_file(workflow_config)\n", + "\n", + "workflow.set_input(\"tokenize_corpus__text_map\", text_map)\n", + "workflow.set_input(\"lowercase_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__stopwords_list\", stopwords)\n", + "\n", + "# the workflow state is processed automatically, so we can always query the current output\n", + "output1 = workflow.get_output(\"processed_text_corpus\")\n", + "print(output1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eb898d84b9014e3ba5d0791317772f14", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GridspecLayout(children=(Label(value='tokenize_corpus__text_map', layout=Layout(grid_area='widget001')), Texta…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f8aa7471161243fb981843cd3d0b743a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# text_map: {\"one\": \"Hello World!\", \"two\": \"Hello DHARPA!\"}\n", + "# stopword_list: hello\n", + "\n", + "workflow = DharpaWorkflow.from_file(workflow_config)\n", + "renderer = PlainJupyterWorkflowRenderer(workflow=workflow)\n", + "renderer.render()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + 
"outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'corpus_processing',\n",
+       "    'corpus_processing_simple',\n",
+       "    'dharpa_workflow',\n",
+       "    'file_reader',\n",
+       "    'input_files_processing',\n",
+       "    'lowercase_corpus',\n",
+       "    'remove_stopwords_from_corpus',\n",
+       "    'tokenize_corpus'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can load workflows from json/yaml files on the file-system, and convert them to Python classes\n", + "load_workflows(f\"{base_path}/workflows\")\n", + "print(list_available_module_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'input_files_processing': {\n",
+       "        'inputs': {\n",
+       "            'files': 'Any',\n",
+       "            'make_lowercase': 'Bool',\n",
+       "            'remove_stopwords': 'Bool',\n",
+       "            'stopwords': 'List'\n",
+       "        },\n",
+       "        'outputs': {'processed_text_corpus': 'Dict'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# display the module spec for the 'input_files_processing' workflow\n", + "print_module_desc(\"input_files_processing\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'modules': [\n",
+       "        {\n",
+       "            'type': 'file_reader',\n",
+       "            'input_map': {'files': '__workflow_input__.files'},\n",
+       "            'id': 'file_reader'\n",
+       "        },\n",
+       "        {\n",
+       "            'type': 'corpus_processing',\n",
+       "            'input_map': {\n",
+       "                'text_map': 'file_reader.content_map',\n",
+       "                'make_lowercase': '__workflow_input__.make_lowercase',\n",
+       "                'remove_stopwords': '__workflow_input__.remove_stopwords',\n",
+       "                'stopwords': '__workflow_input__.stopwords'\n",
+       "            },\n",
+       "            'workflow_outputs': {'processed_text_corpus': 'processed_text_corpus'},\n",
+       "            'id': 'corpus_processing'\n",
+       "        }\n",
+       "    ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# create the workflow object\n", + "ifp_workflow = create_workflow('input_files_processing')\n", + "# display the internal structure of the workflow\n", + "print(ifp_workflow._workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# display the execution graph for the workflow\n", + "graph_to_image(ifp_workflow.execution_graph)\n", + "# graph_to_image(ifp_workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "66541c9631444836b5a12048e1edc285", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GridspecLayout(children=(Label(value='files', layout=Layout(grid_area='widget001')), FileUpload(value={}, desc…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "87cecf1e619c4293a85e2b6dbc243672", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# auto-render input and output-widgets for the workflow (only works when executed in Jupyter)\n", + "renderer = PlainJupyterWorkflowRenderer(workflow=ifp_workflow)\n", + "renderer.render()" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/workflows/modularity/modularity.ipynb b/docs/architecture/workflows/modularity/modularity.ipynb new file mode 100644 index 000000000..0831458d3 --- /dev/null +++ b/docs/architecture/workflows/modularity/modularity.ipynb @@ -0,0 +1,768 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "slideshow": { + "slide_type": "skip" + }, + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "IPython.notebook.set_autosave_interval(0)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Autosave disabled\n" + ] + } + ], + "source": [ + "%autosave 0\n", + "\n", + "import os\n", + "from rich.jupyter import print\n", + "from dharpa_toolbox.modules.utils import list_available_module_names, describe_module, print_module_desc, load_workflows, create_module\n", + "from dharpa_toolbox.utils import print_file_content, graph_to_image\n", + "from dharpa_toolbox.modules.workflows import DharpaWorkflow\n", + "from dharpa_toolbox.rendering.jupyter.renderer import PlainJupyterWorkflowRenderer, ModuleJupyterWorkflowRenderer\n", + "\n", + "base_path = os.path.abspath(\".\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## What's a workflow, really?\n", + "\n", + "- Jupyter is a very good tool to create non-trivial exploratory workflows\n", + "- there's a difference between 'dynamic' workflows, and 'static' ones\n", + "- Jupyter is usually used to create workflows in a 'dynamic' way\n", + "- also important (for us): interactivity\n" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Currently, Jupyter is one of the most used technologies in digital research to create workflows. Although there are exceptions,\n", + "in most cases it is used to explore a very specific research question. Jupyter is exceptionally good at that, which is the\n", + "reason it is so successful.\n", + "\n", + "From a computer-engineering perspective, Jupyter notebooks are 'just' simple scripts, and often they include anti-patterns like\n", + "global- as well as hard-coded variables, little to no encapsulation of functionality, etc. Which means that typically,\n", + "Jupyter notebooks have (relatively) little value to other researchers, and re-usability is low. This is an acceptable\n", + "trade-off though, because the problems they are solving are (usually) very niche and specific, so there is little downside\n", + "to tailor the code to the exact problem one is having. In addition, Jupyter notebooks are very good to document the workflow\n", + "itself, and communicate what is happening to the data (which is important for publication).\n", + "\n", + "If we want to create a tool that lets users run pre-created workflows, that equation changes though. Because, now the assumption\n", + "is that the (comparatively few) workflows we create will be useful in not just a very specific way. The goal is to identify\n", + "areas where people have (roughly) the same problem, and to solve that problem in a generic way that is useful to a\n", + "larger group of people. 
The workflow will typically be less important in relation to the overall research project a\n", + "researcher is working on (compared to a tailored, specific one), but from the perspective of a researcher it will also be\n", + "much less of a hassle and less expensive to use, since they don't have to create the workflow themselves, and someone else already\n", + "has thought about all the options and parameters that make sense, has done the validation and testing, etc. Also, they\n", + "don't have to learn programming if they don't already know it...\n", + "\n", + "This means that we are dealing now with a very 'static' workflow, compared to the 'dynamic' ones researchers with programming\n", + "skills can create and change themselves very easily. Everything that can happen in a workflow is known in advance, and\n", + "even though there can be 'forks' in the flow of data, those have to be defined, implemented and documented in advance.\n", + "And that difference is why we should not assume that Jupyter notebooks are as good a vessel to implement such a workflow\n", + "as they are in the other case, where all that can happen 'on the go'. It's still possible notebooks are a good fit here too,\n", + "but we can't use our normal experience with -- and intuition about -- Jupyter to make that case.\n", + "\n", + "One other point that is important to note is user interactivity. Usually, when developing a Jupyter notebook, inputs (data as well\n", + "as parameters) are either hardcoded, or factored out into a variable that is changed on top of the notebook (or in some cells\n", + "further down). And by running or re-running certain cells, those variables are re-set or changed. This works fine for\n", + "dynamically creating a workflow (although, it's sometimes confusing, and one of the main criticisms against the Jupyter notebook\n", + "approach). 
But, in a 'static' workflow, we need to make sure that a user can set or change all those inputs at any time, while\n", + "making sure that the 'internal' state of our workflow is known to our engine. At a minimum, we need to know that our state\n", + "is currently inconsistent after a user-input, and have a way of communicating that to the user so they can kick off\n", + "some re-processing manually, to make it consistent again. Jupyter supports interactivity via widgets, but the 'cell-based'\n", + "approach in notebooks is not a very good fit for that, because it forces a very simple one-after-the-other processing model,\n", + "that would make it hard to implement the efficient execution of even remotely non-trivial workflows (for example having\n", + "parallel execution of some cells, or skipping the execution of parts that don't need to be executed currently, etc.)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Prior art\n", + "\n", + "- workflow/pipeline modelling and execution is a solved problem in programming:\n", + " - [flow-based programming (FBP)](https://en.wikipedia.org/wiki/Flow-based_programming)\n", + " - requires well defined, modular entities (with 'ports': input and output values)\n", + "- lots of (partial) implementations in data engineering:\n", + " - [airflow](https://airflow.apache.org/)\n", + " - [luigi](https://github.com/spotify/luigi)\n", + " - [dagster](https://github.com/dagster-io/dagster)\n", + " - [prefect](https://www.prefect.io/)\n", + " - many others: Node-RED, Apache NiFi, IFTTT, Zapier, Huginn, ...\n", + "- one subtle (although important) difference with our project, again: interactivity" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "There is a form of programming that fits our problem space 
fairly well: [flow-based programming (FBP)](https://en.wikipedia.org/wiki/Flow-based_programming).\n", + "Like functional programming, it's probably older than all of us, and it is gaining some notable traction again in recent years\n", + "(although with much less hype around it, and without being explicitly mentioned by name). A lot of the data orchestration\n", + "tools and frameworks that cropped up in recent years use some form or aspects of FBP, for example:\n", + "\n", + " - [airflow](https://airflow.apache.org/)\n", + " - [luigi](https://github.com/spotify/luigi)\n", + " - [dagster](https://github.com/dagster-io/dagster)\n", + " - [prefect](https://www.prefect.io/)\n", + "\n", + "One thing that FBP requires are well defined entities ('modules', 'nodes'), that have 'ports' (meaning: known inputs, and outputs).\n", + "A Jupyter notebook for example does not typically have that, which makes it hard to 'combine' notebooks in an FBP-like\n", + "manner. There are attempts to 'formalize' Jupyter notebooks in a way that would make them better fits in such scenarios\n", + "([papermill](https://papermill.readthedocs.io/en/latest/), [orchest](https://www.orchest.io/)), but in my opinion, although\n", + " they kind of work, those attempts are a bit clunky, and not very user-friendly (because they try to bend Jupyter into\n", + " something it was not designed to do). Also, they typically only deal with inputs; outputs are not very well defined at all.\n", + " Compare that for example with how a 'proper' data-orchestration tool like dagster handles [inputs and outputs](https://docs.dagster.io/tutorial/basics_solids),\n", + "which should make clear how many more options someone who implements a workflow execution and rendering framework (which\n", + "is basically what we are building) has when that sort of metadata is available.\n", + "\n", + "\n", + "As was the case in the section above, one difference in our case is interactivity. 
Most tools in that space assume they'll\n", + "get the input values for a workflow execution at the start, and then they can proceed to go through the workflow, batch-processing\n", + "style (meaning, no further user input half-way through). This is different for us, since we want users to be able to\n", + "interactively explore their data (within the limits of a 'static' workflow). This means we will have to consider how\n", + "to deal with long-running computations whose results will be available after minutes, hours, or weeks. The good thing is though,\n", + "whatever we come up with, we'll get a 'traditional workflow execution engine' for free, because every workflow that can\n", + "be executed interactively, will also be able to do 'batch-style'. This will let us re-use and 'move' our workflows to\n", + "other execution environments (HPC clusters, 'the cloud', ...) and do other interesting things with them if the need arises\n", + "(monte-carlo style experiments, automated-testing of workflows and modules, ...)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Modelling workflows\n", + "\n", + "- research data is more useful when it's structured, so why would workflow definitions be different?\n", + "- so: can we model a workflow as code, or even better: as data?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "So, assuming everyone agrees this is a reasonable avenue to explore, we have to think about how we want to model our\n", + "workflows. 
We should definitely look at how other, similar frameworks do this, but I think one approach is very tempting:\n", + "\n", + "> ***Describe workflows as structured data!***\n", + "\n", + "There are several reasons for why I think this would be a good idea:\n", + "\n", + "- structured data can be processed by every programming language in existence\n", + " - we would have one 'main' library that does the actual workflow execution/data processing (probably in Python)\n", + " - we could use other languages to do different other things in our 'ecosystem': e.g. JavaScript for dynamically rendering a frontend\n", + "- we can (largely) work independent from each other, the only thing to consult about is the schema of the workflow data\n", + "- such structured data can be displayed as a network graph, which is much easier to grasp than code\n", + "- automated testing of every workflow and model is easy, can be done in CI/CD\n", + "- Jupyter notebooks are, as I've explained above, pretty good at creating and manipulating structured data\n", + "- there are a lot of researchers out there who know how to use Jupyter: those could all be potential \"DHARPA-workflow\" creators\n", + "- in addition to that, we can decide to create a visual 'workflow editor/creator', that is independent from the 'workflow executor' part, and 100% optional\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Code!\n", + "\n", + "The following is using prototype-quality code to illustrate how a 'workflow-as-data' model could look like in practice. 
Only a few modules are implemented, the goal is to recreate the first part of the 'Topic-modelling' workflow: load some text files, tokenize them, then do some processing (lowercasing, removal of stopwords).\n", + "\n", + "### Definitions\n", + "\n", + "- ***module***: a module is an atomic entity that contains a fixed set of defined inputs and outputs, as well as a processing unit that converts the set of inputs to outputs, in a predictable way\n", + "\n", + "- ***workflow***: a workflow contains a set of modules which are connected in a specific way. A workflow is conceptually also a module, because it also contains a set of inputs/outputs as well as a processing unit, and it can be used in other, 'parent' workflows in the same ways a normal module can." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['corpus_processing',\n", + " 'corpus_processing_simple',\n", + " 'dharpa_workflow',\n", + " 'file_reader',\n", + " 'input_files_processing',\n", + " 'lowercase_corpus',\n", + " 'remove_stopwords_from_corpus',\n", + " 'tokenize_corpus']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can list all available modules (and workflows)\n", + "list_available_module_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'tokenize_corpus': {'inputs': {'text_map': 'Dict'}, 'outputs': {'tokenized_text': 'Dict'}}}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can investigate each modules inputs and output specs\n", + "print_module_desc('tokenize_corpus')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "```yaml\n", + "---\n", + "modules:\n", + "\n", + "- type: tokenize_corpus\n", + "\n", + "- type: lowercase_corpus\n", + " input_map:\n", + " tokenized_text: tokenize_corpus.tokenized_text\n", + "\n", + "- type: remove_stopwords_from_corpus\n", + " input_map:\n", + " tokenized_text: lowercase_corpus.tokenized_text\n", + " workflow_outputs:\n", + " tokenized_text: processed_text_corpus\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# a workflow configuration is basically just a list of modules, incl. their input/output connections\n", + "workflow_config = f'{base_path}/workflows/corpus_processing_simple.yaml'\n", + "print_file_content(workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'lowercase_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__stopwords_list',\n",
+       "    'tokenize_corpus__text_map'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we create a 'workflow' object using the configuration data\n", + "workflow: DharpaWorkflow = DharpaWorkflow.from_file(workflow_config)\n", + "# we can investigate each workflows available input and output names\n", + "print(workflow.input_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
['processed_text_corpus']\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(workflow.output_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can display the execution and data-flow structures of a workflow graphically\n", + "graph_to_image(workflow.execution_graph)\n", + "# graph_to_image(workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'dharpa_workflow': {\n",
+       "        'inputs': {\n",
+       "            'lowercase_corpus__enabled': 'Bool',\n",
+       "            'remove_stopwords_from_corpus__enabled': 'Bool',\n",
+       "            'remove_stopwords_from_corpus__stopwords_list': 'List',\n",
+       "            'tokenize_corpus__text_map': 'Dict'\n",
+       "        },\n",
+       "        'outputs': {'processed_text_corpus': 'Dict'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# print the workflow input/output spec\n", + "print_module_desc(workflow)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'1': ['world'], '2': ['dharpa']}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# using the spec, we can set a workflows inputs and outputs manually\n", + "text_map = {\n", + " \"1\": \"Hello World!\",\n", + " \"2\": \"Hello DHARPA!\"\n", + "}\n", + "stopwords = [\n", + " \"hello\",\n", + " \"!\"\n", + "]\n", + "workflow.set_input(\"tokenize_corpus__text_map\", text_map)\n", + "workflow.set_input(\"lowercase_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__stopwords_list\", stopwords)\n", + "\n", + "# the workflow state is processed automatically, so we can always query the current output\n", + "output1 = workflow.get_output(\"processed_text_corpus\")\n", + "print(output1)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'corpus_processing',\n",
+       "    'corpus_processing_simple',\n",
+       "    'dharpa_workflow',\n",
+       "    'file_reader',\n",
+       "    'input_files_processing',\n",
+       "    'lowercase_corpus',\n",
+       "    'remove_stopwords_from_corpus',\n",
+       "    'tokenize_corpus'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can load workflows from json/yaml files on the file-system, and convert them to Python classes\n", + "load_workflows(f\"{base_path}/workflows\")\n", + "print(list_available_module_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'input_files_processing': {\n",
+       "        'inputs': {\n",
+       "            'files': 'Any',\n",
+       "            'make_lowercase': 'Bool',\n",
+       "            'remove_stopwords': 'Bool',\n",
+       "            'stopwords': 'List'\n",
+       "        },\n",
+       "        'outputs': {'processed_text_corpus': 'Dict'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# display the module spec for the 'input_files_processing' workflow\n", + "print_module_desc(\"input_files_processing\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'modules': [\n",
+       "        {\n",
+       "            'type': 'file_reader',\n",
+       "            'input_map': {'files': '__workflow_input__.files'},\n",
+       "            'id': 'file_reader'\n",
+       "        },\n",
+       "        {\n",
+       "            'type': 'corpus_processing',\n",
+       "            'input_map': {\n",
+       "                'text_map': 'file_reader.content_map',\n",
+       "                'make_lowercase': '__workflow_input__.make_lowercase',\n",
+       "                'remove_stopwords': '__workflow_input__.remove_stopwords',\n",
+       "                'stopwords': '__workflow_input__.stopwords'\n",
+       "            },\n",
+       "            'workflow_outputs': {'processed_text_corpus': 'processed_text_corpus'},\n",
+       "            'id': 'corpus_processing'\n",
+       "        }\n",
+       "    ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# create the workflow object\n", + "ifp_workflow = create_module('input_files_processing')\n", + "# display the internal structure of the workflow\n", + "print(ifp_workflow._workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# display the execution graph for the workflow\n", + "graph_to_image(ifp_workflow.execution_graph)\n", + "# graph_to_image(ifp_workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "797d10365048493281e703a19d548681", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GridspecLayout(children=(Label(value='files', layout=Layout(grid_area='widget001')), FileUpload(value={}, desc…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "794daa53740a42b5848d45e1ec0cac2b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# auto-render input and output-widgets for the workflow (only works when executed in Jupyter)\n", + "renderer = PlainJupyterWorkflowRenderer(workflow=ifp_workflow)\n", + "renderer.render()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Advantages\n", + "\n", + "- each interactive workflow is automatically a 'batch-style' one\n", + "- ability to work on different 
parts of the application almost 100% independently (workflow engine, workflow renderers (UIs), plugins (submitting long running jobs to a cluster, metadata augmentation/data lineage, metric gathering, ...))\n", + "- comparatively few dependencies for the base system / lightweight, but very extensible\n", + "- no dependency on any one UI/frontend technology (can use Jupyter/React/QT/...), can run without any UI at all\n", + "- complex workflows can be broken up into separate pieces, and developed and tested individually\n", + "- fairly high re-usability\n", + "- easy to use in an agile, iterative development process\n", + "\n", + "\n", + "### Disadvantages\n", + "\n", + "- potentially lower performance due to having to create copies of inputs/outputs to make sure they (or items contained within them) are not changed by subsequent steps\n", + "- harder to create very customized UIs\n", + "- there's a limit to the complexity of workflows that can be supported realistically (e.g. hard to implement control structures like if-then-else or loops on a workflow level/outside of modules -- but in a lot of cases that is probably not necessary)\n", + "\n" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + }, + "rise": {} + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/workflows/modularity/modularity_2.ipynb b/docs/architecture/workflows/modularity/modularity_2.ipynb new file mode 100644 index 000000000..7ad23358d --- /dev/null +++ b/docs/architecture/workflows/modularity/modularity_2.ipynb @@ -0,0 +1,735 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + 
"slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "application/javascript": [ + "IPython.notebook.set_autosave_interval(0)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Autosave disabled\n" + ] + } + ], + "source": [ + "%autosave 0\n", + "\n", + "import os\n", + "from rich.jupyter import print\n", + "from dharpa_toolbox.modules.utils import list_available_module_names, describe_module, print_module_desc, load_workflows, create_module, create_workflow\n", + "from dharpa_toolbox.utils import print_file_content, graph_to_image\n", + "from dharpa_toolbox.modules.workflows import DharpaWorkflow\n", + "from dharpa_toolbox.rendering.jupyter.renderer import PlainJupyterWorkflowRenderer, ModuleJupyterWorkflowRenderer\n", + "\n", + "base_path = os.path.abspath(\".\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## What's a workflow?\n", + "\n", + "- a workflow is a tool to transform data into more structured data\n", + " - 'more data' -- we'll create what can be considered 'new' data out of the existing set\n", + " - 'better structured' -- improve (and replace) the current structure (fix errors, etc.)\n", + " - 'more structure' -- augment existing data with additional structure\n", + " - secondary outcomes (insight, new research questions, etc...)\n", + "- workflows are 'just' simple scripts\n", + " - no or only very minimal control structures, except from input/output connections\n", + " - low computational complexity for the workflow itself\n", + " - high(er) computational complexity within modules (but hidden from workflow creator)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Workflows in our context\n", + "\n", + "- 
workflow creators and workflow users are different, distinct roles\n", + "- Jupyter is a very good tool to create non-trivial exploratory workflows\n", + "- there's a difference between 'dynamic' workflows, and 'static' ones\n", + "- Jupyter is usually used to create workflows in a 'dynamic' way\n", + "- more 'data engineering' than 'data science'\n", + "- interactivity:\n", + " - not an issue in 'data engineering'\n", + " - not very well supported in Jupyter (cell-based approach not useful for us)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Two options: monolithic & modular\n", + "\n", + "### Monolithic\n", + "\n", + " - complexity spread out across the workflow\n", + " - well integrated, no restrictions UI-wise\n", + "\n", + "### Modular\n", + "\n", + " - complexity concentrated in the framework, but simple modules\n", + " - some restrictions on the UI" + ] + }, + { + "cell_type": "markdown", + "source": [ + "- no difference in how to handle data, metadata\n", + "- backend modules don't necessarily map onto frontend modules\n", + "- suggested approach:\n", + " - 100% modular backend\n", + " - modular frontend, incl. 
optional monolithic frontend approach for high-value workflows" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "notes" + } + }, + "source": [ + "Notes:\n", + "\n", + " - only important for the workflow part, there will be features that won't be affected by this at all (metadata, data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Definitions\n", + "\n", + "- ***module***:\n", + " - a module is an atomic entity that contains a fixed set of defined inputs and outputs,\n", + " - as well as a processing unit that converts the set of inputs to outputs, in a predictable way\n", + "\n", + "- ***workflow***:\n", + " - a workflow contains a set of modules which are connected in a specific way\n", + " - a workflow is conceptually also a module, because it also contains a set of inputs/outputs as well as a processing unit\n", + " - it can be used in other, 'parent' workflows in the same way a normal module can.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Modelling workflows\n", + "\n", + "- research data is more useful when it's structured, so why would workflow definitions be different?\n", + "- so: can we model a workflow as data?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "notes" + } + }, + "source": [ + "Notes:\n", + "\n", + "- monolithic workflows are not structured (comparison:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "notes" + } + }, + "source": [ + "## Advantages\n", + "\n", + " - scalability\n", + " - disposable middleware and frontends (only important to be able to use the created workflows)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Examples" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['dharpa_workflow',\n", + " 'file_reader',\n", + " 'lowercase_corpus',\n", + " 'remove_stopwords_from_corpus',\n", + " 'tokenize_corpus']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can list all available modules (and workflows)\n", + "list_available_module_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'tokenize_corpus': {'inputs': {'text_map': 'Dict'}, 'outputs': {'tokenized_text': 'Dict'}}}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can investigate each modules inputs and output specs\n", + "print_module_desc('tokenize_corpus')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "```yaml\n", + "---\n", + "modules:\n", + "\n", + "- type: tokenize_corpus\n", + "\n", + "- type: lowercase_corpus\n", + " input_map:\n", + " tokenized_text: tokenize_corpus.tokenized_text\n", + "\n", + "- type: remove_stopwords_from_corpus\n", + " input_map:\n", + " tokenized_text: lowercase_corpus.tokenized_text\n", + " workflow_outputs:\n", + " tokenized_text: processed_text_corpus\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# a workflow configuration is basically just a list of modules, incl. their input/output connections\n", + "workflow_config = f'{base_path}/workflows/corpus_processing_simple.yaml'\n", + "print_file_content(workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'lowercase_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__enabled',\n",
+       "    'remove_stopwords_from_corpus__stopwords_list',\n",
+       "    'tokenize_corpus__text_map'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we create a 'workflow' object using the configuration data\n", + "workflow = DharpaWorkflow.from_file(workflow_config)\n", + "# we can investigate each workflows available input and output names\n", + "print(workflow.input_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
['processed_text_corpus']\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(workflow.output_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we can display the execution and data-flow structures of a workflow graphically\n", + "graph_to_image(workflow.execution_graph)\n", + "# graph_to_image(workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{'1': ['world'], '2': ['dharpa']}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "text_map = {\n", + " \"1\": \"Hello World!\",\n", + " \"2\": \"Hello DHARPA!\"\n", + "}\n", + "stopwords = [\n", + " \"hello\",\n", + " \"!\"\n", + "]\n", + "\n", + "workflow = DharpaWorkflow.from_file(workflow_config)\n", + "\n", + "workflow.set_input(\"tokenize_corpus__text_map\", text_map)\n", + "workflow.set_input(\"lowercase_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__enabled\", True)\n", + "workflow.set_input(\"remove_stopwords_from_corpus__stopwords_list\", stopwords)\n", + "\n", + "# the workflow state is processed automatically, so we can always query the current output\n", + "output1 = workflow.get_output(\"processed_text_corpus\")\n", + "print(output1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eb898d84b9014e3ba5d0791317772f14", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GridspecLayout(children=(Label(value='tokenize_corpus__text_map', layout=Layout(grid_area='widget001')), Texta…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f8aa7471161243fb981843cd3d0b743a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# text_map: {\"one\": \"Hello World!\", \"two\": \"Hello DHARPA!\"}\n", + "# stopword_list: hello\n", + "\n", + "workflow = DharpaWorkflow.from_file(workflow_config)\n", + "renderer = PlainJupyterWorkflowRenderer(workflow=workflow)\n", + "renderer.render()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + 
"outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "    'corpus_processing',\n",
+       "    'corpus_processing_simple',\n",
+       "    'dharpa_workflow',\n",
+       "    'file_reader',\n",
+       "    'input_files_processing',\n",
+       "    'lowercase_corpus',\n",
+       "    'remove_stopwords_from_corpus',\n",
+       "    'tokenize_corpus'\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# we can load workflows from json/yaml files on the file-system, and convert them to Python classes\n", + "load_workflows(f\"{base_path}/workflows\")\n", + "print(list_available_module_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'input_files_processing': {\n",
+       "        'inputs': {\n",
+       "            'files': 'Any',\n",
+       "            'make_lowercase': 'Bool',\n",
+       "            'remove_stopwords': 'Bool',\n",
+       "            'stopwords': 'List'\n",
+       "        },\n",
+       "        'outputs': {'processed_text_corpus': 'Dict'}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# display the module spec for the 'input_files_processing' workflow\n", + "print_module_desc(\"input_files_processing\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'modules': [\n",
+       "        {\n",
+       "            'type': 'file_reader',\n",
+       "            'input_map': {'files': '__workflow_input__.files'},\n",
+       "            'id': 'file_reader'\n",
+       "        },\n",
+       "        {\n",
+       "            'type': 'corpus_processing',\n",
+       "            'input_map': {\n",
+       "                'text_map': 'file_reader.content_map',\n",
+       "                'make_lowercase': '__workflow_input__.make_lowercase',\n",
+       "                'remove_stopwords': '__workflow_input__.remove_stopwords',\n",
+       "                'stopwords': '__workflow_input__.stopwords'\n",
+       "            },\n",
+       "            'workflow_outputs': {'processed_text_corpus': 'processed_text_corpus'},\n",
+       "            'id': 'corpus_processing'\n",
+       "        }\n",
+       "    ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# create the workflow object\n", + "ifp_workflow = create_workflow('input_files_processing')\n", + "# display the internal structure of the workflow\n", + "print(ifp_workflow._workflow_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# display the execution graph for the workflow\n", + "graph_to_image(ifp_workflow.execution_graph)\n", + "# graph_to_image(ifp_workflow.data_flow_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "66541c9631444836b5a12048e1edc285", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "GridspecLayout(children=(Label(value='files', layout=Layout(grid_area='widget001')), FileUpload(value={}, desc…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "87cecf1e619c4293a85e2b6dbc243672", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# auto-render input and output-widgets for the workflow (only works when executed in Jupyter)\n", + "renderer = PlainJupyterWorkflowRenderer(workflow=ifp_workflow)\n", + "renderer.render()" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs/architecture/workflows/modularity/workflows/corpus_processing.yaml b/docs/architecture/workflows/modularity/workflows/corpus_processing.yaml new file mode 100644 index 000000000..2fef6b208 --- /dev/null +++ b/docs/architecture/workflows/modularity/workflows/corpus_processing.yaml @@ -0,0 +1,24 @@ +--- +_doc: | + Receives a dict with the id of a text as key, and the text as value, tokenizes the text(s) and then processes the tokenized values according to the provided settings. + + Currently, lowercasing and the removal of stopwords is supported. + +modules: + +- type: tokenize_corpus + input_map: + text_map: __workflow_input__.text_map + +- type: lowercase_corpus + input_map: + tokenized_text: tokenize_corpus.tokenized_text + enabled: __workflow_input__.make_lowercase + +- type: remove_stopwords_from_corpus + input_map: + tokenized_text: lowercase_corpus.tokenized_text + enabled: __workflow_input__.remove_stopwords + stopwords_list: __workflow_input__.stopwords + workflow_outputs: + tokenized_text: processed_text_corpus diff --git a/docs/architecture/workflows/modularity/workflows/corpus_processing_simple.yaml b/docs/architecture/workflows/modularity/workflows/corpus_processing_simple.yaml new file mode 100644 index 000000000..0a0913bca --- /dev/null +++ b/docs/architecture/workflows/modularity/workflows/corpus_processing_simple.yaml @@ -0,0 +1,14 @@ +--- +modules: + +- type: tokenize_corpus + +- type: lowercase_corpus + input_map: + tokenized_text: tokenize_corpus.tokenized_text + +- type: remove_stopwords_from_corpus + input_map: + tokenized_text: lowercase_corpus.tokenized_text + workflow_outputs: + tokenized_text: processed_text_corpus diff --git a/docs/architecture/workflows/modularity/workflows/input_files_processing.yaml 
b/docs/architecture/workflows/modularity/workflows/input_files_processing.yaml new file mode 100644 index 000000000..2dceb5b23 --- /dev/null +++ b/docs/architecture/workflows/modularity/workflows/input_files_processing.yaml @@ -0,0 +1,20 @@ +--- +_doc: | + Reads one or several (text)-files, tokenizes the content, then processes the tokenized content according to the provided settings. + + Currently, lowercasing and the removal of stopwords is supported. + +modules: + +- type: file_reader + input_map: + files: __workflow_input__.files + +- type: corpus_processing + input_map: + text_map: file_reader.content_map + make_lowercase: __workflow_input__.make_lowercase + remove_stopwords: __workflow_input__.remove_stopwords + stopwords: __workflow_input__.stopwords + workflow_outputs: + processed_text_corpus: processed_text_corpus diff --git a/mkdocs.yml b/mkdocs.yml index bca67994a..e0b64bddd 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -30,6 +30,17 @@ markdown_extensions: nav: - Home: index.md - Install: install.md +- Architecture: + - Overview: architecture/index.md + - Data: + - Overview: architecture/data/index.md + - Formats: architecture/data/data_formats.ipynb + - Workflows: + - Workflows: architecture/workflows/index.md + - Modularity: architecture/workflows/modularity/modularity.ipynb + - Metadata: architecture/metadata.md + - Decisions: architecture/decisions.md + - Assumptions: architecture/assumptions.md plugins: - search diff --git a/setup.cfg b/setup.cfg index 4a2efffc6..ed3f3a209 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,7 +5,7 @@ long_description = file: README.md long_description_content_type = text/markdown url = https://github.com/DHARPA-Project/kiara author = Markus Binsteiner -author_email = markus.binsteiner@uni.lu +author_email = markus@frkl.io license = MPL-2.0 license_file = LICENSE platforms = any diff --git a/src/kiara/__init__.py b/src/kiara/__init__.py index 371472eb5..6a480e484 100644 --- a/src/kiara/__init__.py +++ b/src/kiara/__init__.py 
@@ -36,7 +36,7 @@ __author__ = """Markus Binsteiner""" """The author of this package.""" -__email__ = "markus.binsteiner@uni.lu" +__email__ = "markus@frkl.io" """Email address of the author.""" diff --git a/src/kiara/_frkl/_frkl.json b/src/kiara/_frkl/_frkl.json index 43ab866f7..0c18bf39d 100644 --- a/src/kiara/_frkl/_frkl.json +++ b/src/kiara/_frkl/_frkl.json @@ -1,7 +1,7 @@ { "project": { "full_name": "Markus Binsteiner", - "email": "markus.binsteiner@uni.lu", + "email": "markus@frkl.io", "project_name": "kiara", "exe_name": "kiara", "project_slug": "kiara",