Skip to content

Commit

Permalink
Merge pull request #4 from dominikzorgnotti/handling-kb2143838
Browse files Browse the repository at this point in the history
Advanced data standardization
Closing #3
  • Loading branch information
dominikzorgnotti authored Mar 12, 2021
2 parents aadd2b5 + cb58e9b commit ef8bc83
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 19 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ and transforms them into a machine-readable format.
A combination of parsing with [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and data handling with [Pandas](https://pandas.pydata.org/) is used to achieve the goal.
A process is scheduled with GitHub Actions to run daily; the results are pushed to the repo [Machine-readable VMware release data](https://github.com/dominikzorgnotti/vmware_product_releases_machine-readable).

## Data standardization
Since v0.1.0 the raw data from the published KB is transformed to establish a standard in terms of column headers, formatting, etc.

## Output format and folder structures
The way the output is currently structured is:
- Directory: based on Pandas options to handle [json data orientation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html)
Expand Down
19 changes: 18 additions & 1 deletion data_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
__contact__ = "[email protected]"
__license__ = "GPLv3"
__status__ = "beta"
__version__ = "0.0.3"
__version__ = "0.1.0"


import os
Expand All @@ -37,6 +37,16 @@ def create_json_output(kb_dataobject, output_base_dir: str, record_type: str):
table_id = 0
for dataframe in kb_dataobject.list_of_dframes:
filename = f"kb{kb_dataobject.id}_{kb_dataobject.fmt_product}_table{table_id}_release_as-{record_type}.json"
# vRA KB
if kb_dataobject.id == 2143850:
dataframe = transform_kb2143850(dataframe)
# General data optimization
if ("BuildNumber" in dataframe.columns):
dataframe.rename(columns={"BuildNumber": "Build Number"}, inplace=True)
if ("Build number" in dataframe.columns):
dataframe.rename(columns={"Build number": "Build Number"}, inplace=True)
if "ReleaseDate" in dataframe.columns:
dataframe.rename(columns={"ReleaseDate": "Release Date"}, inplace=True)
if "Build Number" in dataframe.columns and record_type == "index":
dataframe = transform_index(dataframe)
dataframe.to_json(
Expand All @@ -53,3 +63,10 @@ def transform_index(dataframe):
dataframe.reset_index(drop=True)
dataframe.set_index("Build Number", inplace=True)
return dataframe


def transform_kb2143850(dataframe):
    """Special handling of KB2143850 (vRA).

    If the frame has a combined "Build Number - Version" column, each value
    is split at the first " - " into separate "Build Number" and "Version"
    columns. The combined column is kept.

    Args:
        dataframe: pandas DataFrame holding one release table.

    Returns:
        The same DataFrame object. It is modified in place when the combined
        column is present and returned untouched otherwise.
    """
    combined_column = "Build Number - Version"
    if combined_column not in dataframe:
        return dataframe

    combined = dataframe[combined_column]
    # n=1 caps the result at two parts, so a version string that itself
    # contains " - " cannot produce more columns than are assigned below.
    parts = combined.str.split(" - ", n=1, expand=True)

    # Assign column by column instead of as a two-column block: the split
    # yields fewer than two columns when no row contains the separator
    # (or when the frame has no rows at all).
    dataframe["Build Number"] = parts[0] if 0 in parts.columns else combined
    dataframe["Version"] = parts[1] if 1 in parts.columns else None
    return dataframe
75 changes: 59 additions & 16 deletions kb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def __init__(self, kb_id):
self.fmt_product = self.format_product_name()
self.raw_html_resolution = self.get_resolution_section()
self.list_of_dframes = self.parse_releasedata()
self.list_of_merged_frames = None

def get_resolution_section(self):
"""Extracts the resolution section from the KB article content section"""
Expand Down Expand Up @@ -66,21 +67,63 @@ def parse_releasedata(self):
df = pd.read_html(self.raw_html_resolution, flavor="bs4")
# Contains a list of all tables converted to dataframes in the resolution section
list_of_release_df = []
# Since the HTML table has no header, we need to reassign the first row as heading
dict_of_releases = {}
for table_id in range(len(df)):
df_header = df[table_id][:1]
current_df = df[table_id][1:]
current_df.columns = df_header.values.tolist()[0]
# Get the data types right, especially the date format='%m/%d/%Y'
releaseinfo_dataframe = current_df
if "Release Date" in current_df.columns:
releaseinfo_dataframe["Release Date"] = pd.to_datetime(current_df["Release Date"],
infer_datetime_format=True, errors='coerce')
# Skipping build number conversion. This produces non-deterministic results atm
# if "Build Number" in current_df.columns:
# releaseinfo_dataframe["Build Number"] = pd.to_numeric(current_df["Build Number"],
# errors='coerce', downcast="integer")
list_of_release_df.append(releaseinfo_dataframe)
# Fun stuff may happen with dataframes if not erased before the next iteration
del df_header, current_df, releaseinfo_dataframe
# KB2143838 handling of vCenter 6.7 releases with VCSA and Windows
if self.id == 2143838 and table_id == 1:
vcenter67_table = df[table_id]
product_editions = ["VCSA", "Windows"]
for product_edition in product_editions:
split_df = self.split_kb2143838(vcenter67_table, product_edition)
reformatted_df = transform_kb2143838(split_df)
list_of_release_df.append(reformatted_df)
del split_df
elif self.id == 2143838 and table_id == 0:
vcenter7_table = df[table_id]
reformatted_df = transform_kb2143838(vcenter7_table)
list_of_release_df.append(reformatted_df)
else:
# Since some HTML table have no header, we need to reassign the first row as heading
if "Version" not in df[table_id].columns:
df_header = df[table_id][:1]
current_df = df[table_id][1:]
current_df.columns = df_header.values.tolist()[0]
# Moving the del up here
del df_header
else:
current_df = df[table_id]
releaseinfo_dataframe = current_df
# Get the data types right, especially the date format='%m/%d/%Y'
if "Release Date" in current_df.columns:
releaseinfo_dataframe["Release Date"] = pd.to_datetime(current_df["Release Date"],
infer_datetime_format=True, errors='coerce')
list_of_release_df.append(releaseinfo_dataframe)
# Fun stuff may happen with dataframes if not erased before the next iteration
del current_df, releaseinfo_dataframe
return list_of_release_df


def split_kb2143838(self, dataframe, product_edition):
    """Split out the rows of one product edition (VCSA, Windows) from a headerless table.

    Rows whose first column (label 0) equals ``product_edition`` are selected.
    The first selected row supplies the column names and is dropped from the
    data; the column named after the edition is then renamed to "Edition".

    Args:
        dataframe: pandas DataFrame with positional column labels, where
            column 0 holds the edition marker.
        product_edition: Edition marker to select, e.g. "VCSA" or "Windows".

    Returns:
        A new DataFrame with the remaining rows of that edition. The original
        row index is kept and the input frame is not modified.

    Raises:
        IndexError: If no row matches ``product_edition``, because there is
            then no header row to take the column names from.
    """
    edition_rows = dataframe[dataframe[0] == product_edition]
    header = edition_rows[:1].values.tolist()[0]
    # Work on an explicit copy so that relabelling the columns never acts on
    # a slice of the filtered frame (the chained-assignment pattern pandas
    # warns about).
    tempdf = edition_rows[1:].copy()
    tempdf.columns = header
    return tempdf.rename(columns={product_edition: "Edition"})

def transform_kb2143838(dataframe):
    """Special handling of KB2143838 (vCenter).

    Two adjustments are made:

    * If a "Client/MOB/vpxd.log" column exists, it is copied into a
      "Build Number" column on the input frame (in place).
    * If a "Version" column exists, it is renamed to
      "Version - Release Date" and its values are split at the first "(":
      the text before it becomes the new "Version" column, the text after
      it (with ")" stripped from both ends) becomes "Release Name".

    Args:
        dataframe: pandas DataFrame holding one vCenter release table.

    Returns:
        A new DataFrame with the split columns when "Version" is present,
        otherwise the input frame itself.
    """
    # When you access the vCenter API the values from this column are returned, alias it as Build Number
    if "Client/MOB/vpxd.log" in dataframe.columns:
        dataframe["Build Number"] = dataframe["Client/MOB/vpxd.log"]

    # Without a "Version" column there is nothing to split; hand the frame
    # back instead of falling through to an undefined result.
    if "Version" not in dataframe.columns:
        return dataframe

    tempdf = dataframe.rename(columns={"Version": "Version - Release Date"})
    combined = tempdf["Version - Release Date"]
    # n=1 limits the split to two parts even if a value contains several "(".
    parts = combined.str.split(pat="(", n=1, expand=True)

    # Assign column by column: the split yields fewer than two columns when
    # no row contains "(" (or when the frame has no rows).
    tempdf["Version"] = parts[0] if 0 in parts.columns else combined
    if 1 in parts.columns:
        tempdf["Release Name"] = parts[1].str.strip(")")
    else:
        tempdf["Release Name"] = None
    return tempdf
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
__contact__ = "[email protected]"
__license__ = "GPLv3"
__status__ = "beta"
__version__ = "0.0.3"
__version__ = "0.1.0"

# Imports
from data_handling import create_json_output
Expand Down
4 changes: 3 additions & 1 deletion templates/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

This repo contains the release data of VMware products in a machine-readable format (currently JSON). It lets you use the data more easily in scripts or other automation use cases.
The data is generated by a GitHub Actions workflow that runs daily using [my transformation script](https://github.com/dominikzorgnotti/transform-vmware-product-builds-to-json).
*note: It's still early days: Currently, only a subset of product data is available until this hits a more stable release.

## Data standardization
Since v0.1.0 of the transformation script the raw data from the published KB is transformed to establish a standard in terms of column headers, formatting, etc.

## Output format and folder structures
Different use-cases, different ways to organize data.
Expand Down

0 comments on commit ef8bc83

Please sign in to comment.