From 26c5d4389e4e17c5b3d9ed381076a620e9d805ce Mon Sep 17 00:00:00 2001 From: dominikzorgnotti Date: Mon, 22 Mar 2021 23:06:57 +0100 Subject: [PATCH] Added handling of dell vxrail releases in KB52075-vxrail --- README.md | 53 ++++++++++++++++++++++++------------ data_handling.py | 12 ++++++++- kb_data.py | 55 +++++++++++++++++++++++++++++++++++--- main.py | 6 ++++- templates/README.md | 65 +++++++++++++++++++++++++++++---------------- 5 files changed, 145 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 549dce6..0976fc6 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,66 @@ # Transform VMware product builds -This Python code parses VMware product release data from the kb article [Correlating build numbers and versions of VMware products (1014508)](https://kb.vmware.com/s/article/1014508?lang=en_US) +This Python code parses VMware product release data from the kb +article [Correlating build numbers and versions of VMware products (1014508)](https://kb.vmware.com/s/article/1014508?lang=en_US) and transforms them into a machine-readable format. -A combination of parsing with [beautiful soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and data handling with [Pandas](https://pandas.pydata.org/) is the used to achieve the goal. -A process is scheduled with GitHub actions to run daily, the results will be pushed to the repo [Machine-readable VMware release data](https://github.com/dominikzorgnotti/vmware_product_releases_machine-readable). +A combination of parsing with [beautiful soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and data handling +with [Pandas](https://pandas.pydata.org/) is used to achieve the goal. A process is scheduled with GitHub actions to +run daily, the results will be pushed to the +repo [Machine-readable VMware release data](https://github.com/dominikzorgnotti/vmware_product_releases_machine-readable) +. 
## Data standardization -Since v0.1.0 the raw data from the published KB is transformed to establish a standard in terms of column headers, formatting, etc. -KB 2143832 (ESXi) is used as the model for providing a standardized information set. + +Since v0.1.0 the raw data from the published KB is transformed to establish a standard in terms of column headers, +formatting, etc. KB 2143832 (ESXi) is used as the model for providing a standardized information set. ### Uniform column names + Columns describing the same data set have different labels, e.g. "Build number", "Build Number", "BuildNumber". In case of the example, the columns will be renamed to "Build Number" per KB 2143832. ### Multi-value columns + Columns may have more than one value, e.g. "Build Number - Version" in KB2143850 (vRealize Automation). -In this case, two additional columns (Version, Build Number) will be added to the table each containing just a single Value. +In this case, two additional columns (Version, Build Number) will be added to the table each containing just a single +Value. ### Merged tables -Roadmap: There may be more than one table that hold the version information, e.g. in KB2143838 (vCenter Server). -A merge operation will attempt to provide a unified table. + +Roadmap: There may be more than one table that hold the version information, e.g. in KB2143838 (vCenter Server). A merge +operation will attempt to provide a unified table. 
### Nested tables, merged columns/rows -Since v0.2.0: For vCenter build information (KB2143838), this release based on PR #5 offers merged tables: + +Since v0.2.0: For vCenter build information (KB2143838), this release based on PR #5 offers merged tables: The KB article contains three tables: + - Release information for VCSA 7 - Release information for VCSA/Windows VC 6.7 - Release information for VCSA/Windows before that The merged output available is now: + - one table for all VCSA releases - one table for all Windows releases -- one table for all releases -Unicode issues are addressed as well +- one table for all releases. Unicode issues are addressed as well. + +### Non-standard tables + +Since v0.4.0 the script can handle KB52075 (vxrail). This table has none of the default column names and also multi-line +column headers. ## Output format and folder structures -The way the output is currently structured is: -- Directory: based on Pandas options to handle [json data orientation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) + +The way the output is currently structured is: + +- Directory: based on Pandas options to + handle [json data orientation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) - Files: KB(a)_(b)_table(c)_release_as_(d)" - - a: knowledge base article id - the unique ID for the KB article - - b: product name - The first product from the meta data, all in lower case and spaces replaced by underscores - - c: An id to identify multiple html tables on the section (starting at 0) - - d: json data orientation - see above + - a: knowledge base article id - the unique ID for the KB article + - b: product name - The first product from the meta data, all in lower case and spaces replaced by underscores + - c: An id to identify multiple html tables on the section (starting at 0) + - d: json data orientation - see above ## Disclaimer diff --git a/data_handling.py 
b/data_handling.py index fb86e4b..733aeef 100644 --- a/data_handling.py +++ b/data_handling.py @@ -38,6 +38,9 @@ def create_json_output(kb_dataobject, output_base_dir: str, record_type: str): filename = f"kb{kb_dataobject.id}_{kb_dataobject.fmt_product}_table{table_id}_release_as-{record_type}.json" if "Build Number" in dataframe.columns and record_type == "index": dataframe = transform_index(dataframe) + # Adding vxrail handling + elif kb_dataobject.id == 52075 and record_type == "index": + dataframe = transform_kb52075_index(dataframe) try: dataframe.to_json( f"{outputdir}{os.sep}{filename}", @@ -63,13 +66,20 @@ def create_json_output(kb_dataobject, output_base_dir: str, record_type: str): def transform_index(dataframe): - """Takes a dataframe as an input and re-creates the index based on the build number. Destructive to the dataframe + """Takes a dataframe as an input and re-creates the index based on the build number. Destructive to the dataframe as duplicates are erased""" dataframe.drop_duplicates(subset="Build Number", keep=False, inplace=True) dataframe.reset_index(drop=True) dataframe.set_index("Build Number", inplace=True) return dataframe +def transform_kb52075_index(dataframe): + """Takes the vxrail dataframe as an input and re-creates the index based on the VxRail release. 
Destructive to the dataframe + as duplicates are erased""" + dataframe.drop_duplicates(subset="VxRail Release", keep=False, inplace=True) + dataframe.reset_index(drop=True) + dataframe.set_index("VxRail Release", inplace=True) + return dataframe def standardize_columns(dataframe): """Takes a dataframe as an input and renames the columns to a common standard""" diff --git a/kb_data.py b/kb_data.py index 2063aca..2c252b0 100644 --- a/kb_data.py +++ b/kb_data.py @@ -22,7 +22,7 @@ __contact__ = "dominik@why-did-it.fail" __license__ = "GPLv3" __status__ = "beta" -__version__ = "0.3.0" +__version__ = "0.4.0" import pandas as pd @@ -140,9 +140,10 @@ def parse_releasedata(self): vcenter_le65_table = df[table_id][1:] vcenter_le65_table.columns = df_header.values.tolist()[0] # Get the data types right, especially the date format='%m/%d/%Y' - vcenter_le65_table["Release Date"] = pd.to_datetime(vcenter_le65_table["Release Date"], infer_datetime_format=True, - errors='coerce') - #Filter VCSA releases by keyword "Appliance", for Windows negate the search + vcenter_le65_table["Release Date"] = pd.to_datetime(vcenter_le65_table["Release Date"], + infer_datetime_format=True, + errors='coerce') + # Filter VCSA releases by keyword "Appliance", for Windows negate the search vcsa_le65 = vcenter_le65_table[vcenter_le65_table["Version"].str.contains("appliance", case=False)] vcsa_le65["Edition"] = "VCSA" winvc_le65 = vcenter_le65_table[~vcenter_le65_table["Version"].str.contains("appliance", case=False)] @@ -240,3 +241,49 @@ def transform_kb2143850(self, dataframe): dataframe[r"Build Number - Version"] = dataframe[r"Build Number - Version"].str.normalize("NFKD") dataframe[["Build Number", "Version"]] = dataframe[r"Build Number - Version"].str.split(r" - ", expand=True) return dataframe + + +# vxrail releases +class Kb52075(KbData): + def __init__(self, kb_id): + super().__init__(kb_id) + self.list_of_dframes = self.parse_releasedata() + + def get_first_product_name(self): + """Overriding 
function with hardcoded value as it isn't in the meta section""" + return "Dell VxRAIL" + + def parse_releasedata(self): + """Accepts the html data for product releases from the KB article for parsing with pandas.""" + df = pd.read_html(self.raw_html_resolution, flavor="bs4") + # Contains a list of all tables converted to dataframes in the resolution section + list_of_release_df = [] + for table_id in range(len(df)): + if table_id == 0: + # The HTML table have no header, we need to reassign the first row as heading + df_header = df[table_id][:1] + # The header has more than one column header, let's merge. I am not even attempting to automate this. + df_header.at[0, 6] = "External vCSA - Min" + df_header.at[0, 7] = "External vCSA - Recommended" + df_header.at[0, 8] = "External vCSA - Max" + current_df = df[table_id][2:] + current_df.columns = df_header.values.tolist()[0] + # Moving the del up here + del df_header + # Normalize unicode with none breaking space in some rows + current_df["Esxi (Version - Build #)"] = current_df["Esxi (Version - Build #)"].str.normalize("NFKD") + # Atomize multi-value columns + current_df[["ESXi version", "ESXi build number"]] = current_df["Esxi (Version - Build #)"].str.split( + pat=r"-", expand=True) + current_df["ESXi version"] = current_df["ESXi version"].str.strip() + current_df["ESXi build number"] = current_df["ESXi build number"].str.strip() + current_df[["VxRail manager version", "VxRail manager build number"]] = current_df[ + "VxRail manager"].str.split( + pat=r"-", expand=True) + current_df["VxRail manager build number"] = current_df["VxRail manager build number"].str.strip() + current_df["VxRail manager version"] = current_df["VxRail manager version"].str.strip() + current_df.reset_index(drop=True, inplace=True) + list_of_release_df.append(current_df) + else: + print("Unknown table added, please add handling") + return list_of_release_df diff --git a/main.py b/main.py index 82ccb75..e29cdbd 100644 --- a/main.py +++ b/main.py 
@@ -26,7 +26,7 @@ # Imports from data_handling import create_json_output -from kb_data import KbData, Kb2143838, Kb2143850 +from kb_data import KbData, Kb2143838, Kb2143850, Kb52075 from webparsing import parse_kb_article_ids import os import logging @@ -48,10 +48,14 @@ logging.info(f"Creating object for KB id {kb_id}") # Handle specific KBs by using extra Classes. # KB2143838: vCenter + # KB2143850: vRA + # KB52075: vxrail if kb_id == 2143838: kb_article = Kb2143838(kb_id) elif kb_id == 2143850: kb_article = Kb2143850(kb_id) + elif kb_id == 52075: + kb_article = Kb52075(kb_id) else: try: # Pass on the KB id to the data object to fill it diff --git a/templates/README.md b/templates/README.md index 53bdf71..399efbb 100644 --- a/templates/README.md +++ b/templates/README.md @@ -1,55 +1,72 @@ # Machine-readable VMware release data -This repo contains the release data of VMware products in a machine-readable format (currently JSON). It enables you to use the data easier in scripts or other automation use-cases. -The data is generated by a GitHub action running daily from [my transformation script](https://github.com/dominikzorgnotti/transform-vmware-product-builds-to-json). +This repo contains the release data of VMware products in a machine-readable format (currently JSON). It enables you to +use the data easier in scripts or other automation use-cases. +The data is generated by a GitHub action running daily +from [my transformation script](https://github.com/dominikzorgnotti/transform-vmware-product-builds-to-json). ## Data standardization -Since v0.1.0 the raw data from the published KB is transformed to establish a standard in terms of column headers, formatting, etc. -KB 2143832 (ESXi) is used as the model for providing a standardized information set. + +Since v0.1.0 the raw data from the published KB is transformed to establish a standard in terms of column headers, +formatting, etc. 
KB 2143832 (ESXi) is used as the model for providing a standardized information set. ### Uniform column names + Columns describing the same data set have different labels, e.g. "Build number", "Build Number", "BuildNumber". In case of the example, the columns will be renamed to "Build Number" per KB 2143832. ### Multi-value columns + Columns may have more than one value, e.g. "Build Number - Version" in KB2143850 (vRealize Automation). -In this case, two additional columns (Version, Build Number) will be added to the table each containing just a single Value. +In this case, two additional columns (Version, Build Number) will be added to the table each containing just a single +Value. ### Merged tables -Since v0.2.0: For vCenter build information (KB2143838), this release based on PR #5 offers merged tables: + +Since v0.2.0: For vCenter build information (KB2143838), this release based on PR #5 offers merged tables: The KB article contains three tables: + - Release information for VCSA 7 - Release information for VCSA/Windows VC 6.7 - Release information for VCSA/Windows before that The merged output available is now: + - one table for all VCSA releases - one table for all Windows releases -- one table for all releases -Unicode issues are addressed as well - +- one table for all releases. Unicode issues are addressed as well. ### Nested tables, merged columns/rows + Roadmap: Tables may have nested tables (e.g. KB52520 - VCF). A decomposition is needed to provide the information in a usable format. + +### Non-standard tables +Since v0.4.0 the script can handle KB52075 (vxrail). This table has none of the default column names and also multi-line column headers. + ## Output format and folder structures + Different use-cases, different ways to organize data. 
-The way the output is currently structured is: -- Directory: based on Pandas options to handle [json data orientation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) +The way the output is currently structured is: + +- Directory: based on Pandas options to + handle [json data orientation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) - Files: KB(a)_(b)_table(c)_release_as_(d)" - - a: knowledge base article id - the unique ID for the KB article - - b: product name - The first product from the meta data, all in lower case and spaces replaced by underscores - - c: An id to identify multiple html tables on the section (starting at 0) - - d: json data orientation - see above + - a: knowledge base article id - the unique ID for the KB article + - b: product name - The first product from the meta data, all in lower case and spaces replaced by underscores + - c: An id to identify multiple html tables on the section (starting at 0) + - d: json data orientation - see above I will try to maintain a stable folder structure and data output formats but reserve the right to change it at any time. 
## Source -Please review the original source of data [Correlating build numbers and versions of VMware products (1014508)](https://kb.vmware.com/s/article/1014508?lang=en_US) -| Product name | Article | +Please review the original source of +data [Correlating build numbers and versions of VMware products (1014508)](https://kb.vmware.com/s/article/1014508?lang=en_US) + +| Product name | Article | :-----:|:-----: -VMware Converter Standalone | Build numbers and versions of VMware Converter Standalone (2143828) | +VMware Converter Standalone | Build numbers and versions of VMware Converter Standalone (2143828) | VMware Data Recovery | Build numbers and versions of VMware Data Recovery (2143852) | VMware ESXi/ESX | Build numbers and versions of VMware ESXi/ESX (2143832) | VMware Horizon View | Build numbers and versions of VMware Horizon View (2143853) | @@ -63,19 +80,21 @@ VMware vCloud Networking and Security | Build numbers and versions of VMware vCl VMware vRealize Automation | Build numbers and versions of VMware vRealize Automation (2143850) | VMware vRealize Orchestrator | Build numbers and versions of VMware vRealize Orchestrator (2143846) | VMware vRealize Operations Manager | Build numbers and versions of VMware vRealize Operations Manager (2145975) | -VMware vSAN | Build numbers and versions of VMware vSAN (2150753) | +VMware vSAN | Build numbers and versions of VMware vSAN (2150753) | VMware vShield | Build numbers and versions of VMware vShield (2143849) | -VMware vSphere Update Manager | Build numbers and versions of VMware vSphere Update Manager (2143837) | +VMware vSphere Update Manager | Build numbers and versions of VMware vSphere Update Manager (2143837) | VMware vSphere Replication Appliance | Build numbers and versions of VMware vSphere Replication Appliance (2143840) | VMware vSphere Storage Appliance | Build numbers and versions of VMware vSphere Storage Appliance (2145727) | VxRAIL | Correlating VxRAIL Release with VMware build numbers (52075) | 
VMware Cloud Foundation | Correlating VMware Cloud Foundation version with the versions of its constituent products (52520) | VMware vRealize Network Insight (vRNI) | Build numbers and versions of VMware vRealize Network Insight (67245) | -Based on feedback: I can only with data that is published in the KB articles. It may not include pulled releases or beta builds. +Based on feedback: I can only work with data that is published in the KB articles. It may not include pulled releases or beta +builds. ## Disclaimer This is not an official VMware repository. It is not linked in any capacity to my employment at VMware. -All data that is processed is provided by VMware. I am just transforming the data VMware publishes in the knowledgebase and provide them in a different output format. -Errors can and will be made, please use this at your own discretion and test it out. +All data that is processed is provided by VMware. I am just transforming the data VMware publishes in the knowledgebase +and providing them in a different output format. Errors can and will be made, please use this at your own discretion and +test it out.