Merge pull request #4 from dominikzorgnotti/handling-kb2143838

Advanced data standardization Closing #3
dominikzorgnotti · Mar 12, 2021 · ef8bc83 · ef8bc83
2 parents aadd2b5 + cb58e9b
commit ef8bc83
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -5,6 +5,9 @@ and transforms them into a machine-readable format.
 A combination of parsing with [beautiful soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and data handling with [Pandas](https://pandas.pydata.org/) is used to achieve the goal.
 A process is scheduled with GitHub actions to run daily, the results will be pushed to the repo [Machine-readable VMware release data](https://github.com/dominikzorgnotti/vmware_product_releases_machine-readable).
 
+## Data standardization
+Since v0.1.0 the raw data from the published KB is transformed to establish a standard in terms of column headers, formatting, etc.
+
 ## Output format and folder structures
 The way the output is currently structured is:   
 - Directory: based on Pandas options to handle [json data orientation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html)

diff --git a/data_handling.py b/data_handling.py
@@ -22,7 +22,7 @@
 __contact__ = "[email protected]"
 __license__ = "GPLv3"
 __status__ = "beta"
-__version__ = "0.0.3"
+__version__ = "0.1.0"
 
 
 import os
@@ -37,6 +37,16 @@ def create_json_output(kb_dataobject, output_base_dir: str, record_type: str):
     table_id = 0
     for dataframe in kb_dataobject.list_of_dframes:
         filename = f"kb{kb_dataobject.id}_{kb_dataobject.fmt_product}_table{table_id}_release_as-{record_type}.json"
+        # vRA KB
+        if kb_dataobject.id == 2143850:
+            dataframe = transform_kb2143850(dataframe)
+        # General data optimization
+        if ("BuildNumber" in dataframe.columns):
+            dataframe.rename(columns={"BuildNumber": "Build Number"}, inplace=True)
+        if ("Build number" in dataframe.columns):
+            dataframe.rename(columns={"Build number": "Build Number"}, inplace=True)
+        if "ReleaseDate" in dataframe.columns:
+            dataframe.rename(columns={"ReleaseDate": "Release Date"}, inplace=True)
         if "Build Number" in dataframe.columns and record_type == "index":
             dataframe = transform_index(dataframe)
         dataframe.to_json(
@@ -53,3 +63,10 @@ def transform_index(dataframe):
     dataframe.reset_index(drop=True)
     dataframe.set_index("Build Number", inplace=True)
     return dataframe
+
+
+def transform_kb2143850(dataframe):
+    """Special handling of KB2143850 (vRA).
+
+    Splits the combined "Build Number - Version" column into separate
+    "Build Number" and "Version" columns. The combined column is kept.
+
+    Args:
+        dataframe: Table parsed from the KB article. Modified in place.
+
+    Returns:
+        The same dataframe; untouched when the combined column is absent.
+    """
+    combined_column = "Build Number - Version"
+    if combined_column not in dataframe.columns:
+        return dataframe
+    # n=1 splits on the first separator only, so a version text that itself
+    # contains " - " cannot produce a third column and break the assignment.
+    parts = dataframe[combined_column].str.split(" - ", n=1, expand=True)
+    # Always provide exactly two columns, even if no row had a separator.
+    parts = parts.reindex(columns=[0, 1])
+    dataframe["Build Number"] = parts[0]
+    dataframe["Version"] = parts[1]
+    return dataframe
diff --git a/kb_data.py b/kb_data.py
@@ -35,6 +35,7 @@ def __init__(self, kb_id):
         self.fmt_product = self.format_product_name()
         self.raw_html_resolution = self.get_resolution_section()
         self.list_of_dframes = self.parse_releasedata()
+        self.list_of_merged_frames = None
 
     def get_resolution_section(self):
         """Extracts the resolution section from the KB article content section"""
@@ -66,21 +67,63 @@ def parse_releasedata(self):
         df = pd.read_html(self.raw_html_resolution, flavor="bs4")
         # Contains a list of all tables converted to dataframes in the resolution section
         list_of_release_df = []
-        # Since the HTML table has no header, we need to reassign the first row as heading
+        dict_of_releases = {}
         for table_id in range(len(df)):
-            df_header = df[table_id][:1]
-            current_df = df[table_id][1:]
-            current_df.columns = df_header.values.tolist()[0]
-            # Get the data types right, especially the date format='%m/%d/%Y'
-            releaseinfo_dataframe = current_df
-            if "Release Date" in current_df.columns:
-                releaseinfo_dataframe["Release Date"] = pd.to_datetime(current_df["Release Date"],
-                                                                       infer_datetime_format=True, errors='coerce')
-            # Skipping build number conversion. This produces non-deterministic results atm
-            # if "Build Number" in current_df.columns:
-            #     releaseinfo_dataframe["Build Number"] = pd.to_numeric(current_df["Build Number"],
-            #                                                           errors='coerce', downcast="integer")
-            list_of_release_df.append(releaseinfo_dataframe)
-            # Fun stuff may happen with dataframes if not erased before the next iteration
-            del df_header, current_df, releaseinfo_dataframe
+            # KB2143838 handling of vCenter 6.7 releases with VCSA and Windows
+            if self.id == 2143838 and table_id == 1:
+                vcenter67_table = df[table_id]
+                product_editions = ["VCSA", "Windows"]
+                for product_edition in product_editions:
+                    split_df = self.split_kb2143838(vcenter67_table, product_edition)
+                    reformatted_df = transform_kb2143838(split_df)
+                    list_of_release_df.append(reformatted_df)
+                    del split_df
+            elif self.id == 2143838 and table_id == 0:
+                vcenter7_table = df[table_id]
+                reformatted_df = transform_kb2143838(vcenter7_table)
+                list_of_release_df.append(reformatted_df)
+            else:
+                # Since some HTML table have no header, we need to reassign the first row as heading
+                if "Version" not in df[table_id].columns:
+                    df_header = df[table_id][:1]
+                    current_df = df[table_id][1:]
+                    current_df.columns = df_header.values.tolist()[0]
+                    # Moving the del up here
+                    del df_header
+                else:
+                    current_df = df[table_id]
+                releaseinfo_dataframe = current_df
+                # Get the data types right, especially the date format='%m/%d/%Y'
+                if "Release Date" in current_df.columns:
+                    releaseinfo_dataframe["Release Date"] = pd.to_datetime(current_df["Release Date"],
+                                                                           infer_datetime_format=True, errors='coerce')
+                list_of_release_df.append(releaseinfo_dataframe)
+                # Fun stuff may happen with dataframes if not erased before the next iteration
+                del current_df, releaseinfo_dataframe
         return list_of_release_df
+
+
+    def split_kb2143838(self, dataframe, product_edition):
+        """Return the rows of one product edition as a table with proper headers.
+
+        Selects every row whose first column (label 0) equals
+        ``product_edition``. The first matching row supplies the column names
+        and the remaining rows are the data. The column named after the
+        edition is renamed to "Edition".
+
+        Args:
+            dataframe: Header-less table whose column 0 holds the edition.
+            product_edition: Edition to extract, e.g. "VCSA" or "Windows".
+
+        Returns:
+            A new dataframe; the input dataframe is left unmodified.
+
+        Raises:
+            IndexError: If no row matches ``product_edition``.
+        """
+        edition_rows = dataframe[dataframe[0] == product_edition]
+        header = edition_rows.iloc[0].tolist()
+        # Work on an explicit copy: relabelling a slice of the caller's frame
+        # in place is the chained-assignment pattern pandas warns about.
+        edition_df = edition_rows.iloc[1:].copy()
+        edition_df.columns = header
+        return edition_df.rename(columns={product_edition: "Edition"})
+
+def transform_kb2143838(dataframe):
+    """Special handling of KB2143838 (vCenter).
+
+    Adds a "Build Number" column aliasing "Client/MOB/vpxd.log" and splits the
+    "Version" column, whose values look like "<version> (<release name>)",
+    into "Version" and "Release Name". The original combined text is kept
+    under "Version - Release Date".
+
+    Args:
+        dataframe: Table parsed from the KB article. The "Build Number"
+            alias is added to this object in place.
+
+    Returns:
+        A renamed copy carrying the split columns when a "Version" column
+        exists, otherwise the input dataframe itself.
+    """
+    # When you access the vCenter API the values from this column are
+    # returned, so alias it as Build Number.
+    if "Client/MOB/vpxd.log" in dataframe.columns:
+        dataframe["Build Number"] = dataframe["Client/MOB/vpxd.log"]
+    # Without a Version column there is nothing to split; return the frame
+    # as-is rather than referencing a result that was never created.
+    if "Version" not in dataframe.columns:
+        return dataframe
+    result = dataframe.rename(columns={"Version": "Version - Release Date"})
+    # n=1 splits on the first "(" only, so the result has at most two
+    # columns regardless of how many parentheses a value contains.
+    parts = result["Version - Release Date"].str.split(pat="(", n=1, expand=True)
+    # NOTE(review): the version text keeps the blank that preceded "(".
+    if 0 in parts.columns:
+        result["Version"] = parts[0]
+    else:
+        result["Version"] = result["Version - Release Date"]
+    # The second column only exists when at least one value contained "(".
+    if 1 in parts.columns:
+        result["Release Name"] = parts[1].str.strip(")")
+    else:
+        result["Release Name"] = None
+    return result
diff --git a/main.py b/main.py
@@ -22,7 +22,7 @@
 __contact__ = "[email protected]"
 __license__ = "GPLv3"
 __status__ = "beta"
-__version__ = "0.0.3"
+__version__ = "0.1.0"
 
 # Imports
 from data_handling import create_json_output

diff --git a/templates/README.md b/templates/README.md
@@ -2,7 +2,9 @@
 
 This repo contains the release data of VMware products in a machine-readable format (currently JSON). It enables you to use the data more easily in scripts or other automation use-cases.  
 The data is generated by a GitHub action running daily from [my transformation script](https://github.com/dominikzorgnotti/transform-vmware-product-builds-to-json).  
-*note: It's still early days: Currently, only a subset of product data is available until this hits a more stable release.
+
+## Data standardization
+Since v0.1.0 of the transformation script the raw data from the published KB is transformed to establish a standard in terms of column headers, formatting, etc.
 
 ## Output format and folder structures
 Different use-cases, different ways to organize data.