Skip to content

Commit

Permalink
Merge pull request #5 from dominikzorgnotti/merge-tables-2143838
Browse files Browse the repository at this point in the history
KB2143838: Unified datasets
  • Loading branch information
dominikzorgnotti authored Mar 14, 2021
2 parents ef8bc83 + bf68764 commit b2365eb
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 78 deletions.
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,23 @@ A process is scheduled with GitHub actions to run daily, the results will be pus

## Data standardization
Since v0.1.0 the raw data from the published KB is transformed to establish a standard in terms of column headers, formatting, etc.
KB 2143832 (ESXi) is used as the model for providing a standardized information set.

### Uniform column names
Columns describing the same data set have different labels, e.g. "Build number", "Build Number", "BuildNumber".
In case of the example, the columns will be renamed to "Build Number" per KB 2143832.

### Multi-value columns
Columns may have more than one value, e.g. "Build Number - Version" in KB2143850 (vRealize Automation).
In this case, two additional columns (Version, Build Number) will be added to the table, each containing just a single value.

### Merged tables
Roadmap: There may be more than one table that holds the version information, e.g. in KB2143838 (vCenter Server).
A merge operation will attempt to provide a unified table.

### Nested tables, merged columns/rows
Roadmap: Tables may have nested tables (e.g. KB52520 - VCF).
A decomposition is needed to provide the information in a usable format.

## Output format and folder structures
The way the output is currently structured is:
Expand Down
58 changes: 35 additions & 23 deletions data_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,38 +22,46 @@
__contact__ = "[email protected]"
__license__ = "GPLv3"
__status__ = "beta"
__version__ = "0.1.0"

__version__ = "0.2.0"

import os
import pandas as pd


def create_json_output(kb_dataobject, output_base_dir: str, record_type: str):
"""Write the dataframes of a KB object as JSON files below ``output_base_dir/record_type``.

``kb_dataobject`` supplies ``id``, ``fmt_product``, ``list_of_dframes`` and
``list_of_merged_frames``. ``record_type`` is used both as the name of the
sub-directory and as the ``orient`` argument of ``DataFrame.to_json``.
A ``ValueError`` raised while serializing a table is printed and the next
table is processed.

NOTE(review): the lines of this function appear to interleave two revisions
(indentation is missing and the per-table ``to_json`` call occurs twice) --
confirm the statement order against the checked-in file.
"""
# One output directory per JSON orientation, created on demand
outputdir = os.path.join(output_base_dir, record_type)
if not os.path.exists(outputdir):
os.makedirs(outputdir)
# TODO Code repeat make this DRY anytime soon
# table_id only numbers the output files
table_id = 0
for dataframe in kb_dataobject.list_of_dframes:
filename = f"kb{kb_dataobject.id}_{kb_dataobject.fmt_product}_table{table_id}_release_as-{record_type}.json"
# vRA KB
if kb_dataobject.id == 2143850:
dataframe = transform_kb2143850(dataframe)
# General data optimization
if ("BuildNumber" in dataframe.columns):
dataframe.rename(columns={"BuildNumber": "Build Number"}, inplace=True)
if ("Build number" in dataframe.columns):
dataframe.rename(columns={"Build number": "Build Number"}, inplace=True)
if "ReleaseDate" in dataframe.columns:
dataframe.rename(columns={"ReleaseDate": "Release Date"}, inplace=True)
# transform_index is only applied for the "index" orientation
if "Build Number" in dataframe.columns and record_type == "index":
dataframe = transform_index(dataframe)
dataframe.to_json(
f"{outputdir}{os.sep}{filename}",
indent=4, orient=record_type, date_format="iso"
)
table_id += 1
try:
dataframe.to_json(
f"{outputdir}{os.sep}{filename}",
indent=4, orient=record_type, date_format="iso"
)
except ValueError as err:
print(f"{kb_dataobject.id}: Error for json {record_type} in table {table_id}: {err}")
finally:
# Advance the counter even when serialization failed
table_id += 1
# Merged tables get a "merged" filename infix and their own counter.
# NOTE(review): assumes every KB object defines list_of_merged_frames -- TODO confirm for the base class
if kb_dataobject.list_of_merged_frames:
table_id = 0
for dataframe in kb_dataobject.list_of_merged_frames:
filename = f"kb{kb_dataobject.id}_{kb_dataobject.fmt_product}_merged{table_id}_release_as-{record_type}.json"
if "Build Number" in dataframe.columns and record_type == "index":
dataframe = transform_index(dataframe)
try:
dataframe.to_json(
f"{outputdir}{os.sep}{filename}",
indent=4, orient=record_type, date_format="iso"
)
except ValueError as err:
print(f"{kb_dataobject.id}: Error for json {record_type} in merged table {table_id}: {err}")
finally:
# Advance the counter even when serialization failed
table_id += 1


def transform_index(dataframe):
Expand All @@ -65,8 +73,12 @@ def transform_index(dataframe):
return dataframe


def transform_kb2143850(dataframe):
    """Special handling of KB2143850 (vRA).

    Splits the combined "Build Number - Version" column into the two columns
    "Build Number" and "Version". The split happens at the first " - " only,
    so a version text that itself contains " - " stays intact. When no row
    contains the separator, "Version" is filled with None rather than the
    column assignment raising a ValueError.

    Args:
        dataframe: table that may contain a "Build Number - Version" column.

    Returns:
        The same dataframe object; the two columns are added in place when the
        combined column exists, otherwise it is returned untouched.
    """
    combined_label = "Build Number - Version"
    if combined_label not in dataframe:
        return dataframe
    # n=1 guarantees at most two result columns regardless of the cell content
    pieces = dataframe[combined_label].str.split(" - ", n=1, expand=True)
    dataframe["Build Number"] = pieces[0] if 0 in pieces.columns else None
    dataframe["Version"] = pieces[1] if 1 in pieces.columns else None
    return dataframe
def standardize_columns(dataframe):
    """Rename known column-label variants to their common spelling.

    "BuildNumber" and "Build number" become "Build Number"; "ReleaseDate"
    becomes "Release Date". The rename is applied in place and the very same
    dataframe object is returned.
    """
    label_variants = {
        "BuildNumber": "Build Number",
        "Build number": "Build Number",
        "ReleaseDate": "Release Date",
    }
    # Restrict the mapping to labels that are really there
    present = {
        variant: standard
        for variant, standard in label_variants.items()
        if variant in dataframe.columns
    }
    if present:
        dataframe.rename(columns=present, inplace=True)
    return dataframe
188 changes: 146 additions & 42 deletions kb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,19 @@
this program. If not, see <http://www.gnu.org/licenses/>.
"""

__author__ = "Dominik Zorgnotti"
__contact__ = "[email protected]"
__created__ = "2020-02-26"
__deprecated__ = False
__contact__ = "[email protected]"
__license__ = "GPLv3"
__status__ = "beta"
__version__ = "0.2.0"

import pandas as pd

from data_handling import standardize_columns
from webparsing import get_kb_webdata
import html5lib
import pandas as pd

# YOLO as I am okay with overwriting DF data regardless of the results
pd.options.mode.chained_assignment = None # default='warn'
Expand Down Expand Up @@ -69,61 +77,157 @@ def parse_releasedata(self):
list_of_release_df = []
dict_of_releases = {}
for table_id in range(len(df)):
# KB2143838 handling of vCenter 6.7 releases with VCSA and Windows
if self.id == 2143838 and table_id == 1:
# Since some HTML table have no header, we need to reassign the first row as heading
if "Version" not in df[table_id].columns:
df_header = df[table_id][:1]
current_df = df[table_id][1:]
current_df.columns = df_header.values.tolist()[0]
# Moving the del up here
del df_header
else:
current_df = df[table_id]
releaseinfo_dataframe = standardize_columns(current_df)
# Get the data types right, especially the date format='%m/%d/%Y'
if "Release Date" in current_df.columns:
releaseinfo_dataframe["Release Date"] = pd.to_datetime(current_df["Release Date"],
infer_datetime_format=True, errors='coerce')
list_of_release_df.append(releaseinfo_dataframe)
# Fun stuff may happen with dataframes if not erased before the next iteration
del current_df, releaseinfo_dataframe
return list_of_release_df


# You might read this and say "he's drunk!". Alas, it's pure desperation.

# vCenter releases
class Kb2143838(KbData):
def __init__(self, kb_id):
"""Initialise the base KB object, then parse the tables and build the merged views."""
super().__init__(kb_id)
# Order matters: merge_tables_kb2143838() reads self.list_of_dframes
self.list_of_dframes = self.parse_releasedata()
self.list_of_merged_frames = self.merge_tables_kb2143838()

def parse_releasedata(self):
"""Parse the HTML tables of the KB resolution section into a list of dataframes.

The tables are read with ``pd.read_html`` from ``self.raw_html_resolution``
and handled by their position: table 0 is tagged with the "VCSA" edition,
table 1 is split once per edition ("VCSA", "Windows"), table 2 gets its
first row promoted to the header and is tagged "Windows". "Release Date"
is converted with ``pd.to_datetime(..., errors='coerce')``.

NOTE(review): the lines of this method appear to interleave two revisions
(e.g. a plain ``transform_kb2143838(split_df)`` call directly followed by
the ``self.`` variant, and a second ``table_id == 0`` branch) -- confirm the
statement order against the checked-in file.
"""
df = pd.read_html(self.raw_html_resolution, flavor="bs4")
# Contains a list of all tables converted to dataframes in the resolution section
list_of_release_df = []
for table_id in range(len(df)):
if table_id == 0:
vcenter7_table = df[table_id]
reformatted_df = self.transform_kb2143838(vcenter7_table)
reformatted_df["Edition"] = "VCSA"
reformatted_df["Release Date"] = pd.to_datetime(reformatted_df["Release Date"],
infer_datetime_format=True,
errors='coerce')
list_of_release_df.append(reformatted_df)
elif table_id == 1:
vcenter67_table = df[table_id]
# One output dataframe per edition found in this table
product_editions = ["VCSA", "Windows"]
for product_edition in product_editions:
split_df = self.split_kb2143838(vcenter67_table, product_edition)
reformatted_df = transform_kb2143838(split_df)
reformatted_df = self.transform_kb2143838(split_df)
reformatted_df["Release Date"] = pd.to_datetime(reformatted_df["Release Date"],
infer_datetime_format=True,
errors='coerce')
list_of_release_df.append(reformatted_df)
del split_df
elif self.id == 2143838 and table_id == 0:
vcenter7_table = df[table_id]
reformatted_df = transform_kb2143838(vcenter7_table)
list_of_release_df.append(reformatted_df)
else:
# Since some HTML table have no header, we need to reassign the first row as heading
if "Version" not in df[table_id].columns:
df_header = df[table_id][:1]
current_df = df[table_id][1:]
current_df.columns = df_header.values.tolist()[0]
# Moving the del up here
del df_header
else:
current_df = df[table_id]
releaseinfo_dataframe = current_df
elif table_id == 2:
# The HTML table have no header, we need to reassign the first row as heading
df_header = df[table_id][:1]
current_df = df[table_id][1:]
current_df.columns = df_header.values.tolist()[0]
# Moving the del up here
del df_header
current_df["Edition"] = "Windows"
# Get the data types right, especially the date format='%m/%d/%Y'
if "Release Date" in current_df.columns:
releaseinfo_dataframe["Release Date"] = pd.to_datetime(current_df["Release Date"],
infer_datetime_format=True, errors='coerce')
list_of_release_df.append(releaseinfo_dataframe)
# Fun stuff may happen with dataframes if not erased before the next iteration
del current_df, releaseinfo_dataframe
current_df["Release Date"] = pd.to_datetime(current_df["Release Date"], infer_datetime_format=True,
errors='coerce')
list_of_release_df.append(current_df)
else:
# Any table beyond the known positions is only reported, not parsed
print("Unknown table added, please add handling")
return list_of_release_df


def split_kb2143838(self, dataframe, product_edition):
    """Extract the rows of one product edition (VCSA, Windows) from a header-less table.

    Rows whose first column (label 0) equals ``product_edition`` are selected.
    The first selected row supplies the column labels, the remaining rows are
    returned. Any label equal to ``product_edition`` is replaced by "Edition".
    """
    edition_rows = dataframe.loc[dataframe[0] == product_edition]
    header_labels = edition_rows.iloc[:1].values.tolist()[0]
    release_rows = edition_rows.iloc[1:]
    release_rows.columns = [
        "Edition" if label == product_edition else label
        for label in header_labels
    ]
    return release_rows

def transform_kb2143838(dataframe):
    """Special handling of KB2143838 (vCenter).

    Aliases the "Client/MOB/vpxd.log" column as "Build Number" and splits the
    "Version" column at its first "(" into "Version" and "Release Name".
    The original text is kept in a column named "Version - Release Date".

    Args:
        dataframe: table as parsed from the KB article.

    Returns:
        A renamed copy carrying the additional columns, or the input dataframe
        itself when it has no "Version" column.
    """
    # When you access the vCenter API the values from this column are returned, alias it as Build Number
    if "Client/MOB/vpxd.log" in dataframe.columns:
        dataframe["Build Number"] = dataframe["Client/MOB/vpxd.log"]
    if "Version" not in dataframe.columns:
        # Nothing to split: hand the frame back instead of touching an unset local
        return dataframe
    tempdf = dataframe.rename(columns={"Version": "Version - Release Date"})
    # n=1 caps the result at two columns even if a cell contains several "("
    pieces = tempdf["Version - Release Date"].str.split(pat="(", n=1, expand=True)
    tempdf["Version"] = pieces[0] if 0 in pieces.columns else None
    # Column 1 only exists when at least one row contained a "("
    tempdf["Release Name"] = pieces[1].str.strip(")") if 1 in pieces.columns else None
    return tempdf
def transform_kb2143838(self, dataframe):
    """Special handling of KB2143838 (vCenter).

    Aliases the "Client/MOB/vpxd.log" column as "Build Number" and splits the
    "Version" column at its first "(" into "Version" (surrounding whitespace
    removed) and "Release Name" (closing parenthesis removed). The original
    text is kept in a column named "Version - Release Name".

    Args:
        dataframe: table as parsed from the KB article; its "Version" column is
            unicode-normalized in place.

    Returns:
        A renamed copy carrying the additional columns, or the input dataframe
        itself when it has no "Version" column.
    """
    # When you access the vCenter API the values from this column are returned, alias it as Build Number
    if "Client/MOB/vpxd.log" in dataframe.columns:
        dataframe["Build Number"] = dataframe["Client/MOB/vpxd.log"]
    if "Version" not in dataframe.columns:
        # Nothing to split: hand the frame back instead of touching an unset local
        return dataframe
    # Normalize unicode: some rows contain a non-breaking space
    dataframe["Version"] = dataframe["Version"].str.normalize("NFKD")
    tempdf = dataframe.rename(columns={"Version": "Version - Release Name"})
    # n=1 caps the result at two columns even if a cell contains several "("
    pieces = tempdf["Version - Release Name"].str.split(pat="(", n=1, expand=True)
    tempdf["Version"] = pieces[0].str.strip() if 0 in pieces.columns else None
    # Column 1 only exists when at least one row contained a "("
    tempdf["Release Name"] = pieces[1].str.strip(")") if 1 in pieces.columns else None
    return tempdf

def merge_tables_kb2143838(self):
    """Merge the per-table dataframes of ``self.list_of_dframes`` into unified tables.

    The first four entries of ``self.list_of_dframes`` are read by position:
    0 and 1 are combined into the VCSA table, 2 and 3 into the Windows table.

    Returns:
        A list of three dataframes, each with a fresh 0..n-1 index:
        [VCSA builds, Windows builds, all builds (VCSA followed by Windows)].

    Raises:
        IndexError: if fewer than four dataframes were parsed.
    """
    frames = self.list_of_dframes
    vc7x_vcsa, vc67_vcsa = frames[0], frames[1]
    vc67_win, vc_win_only = frames[2], frames[3]
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x;
    # ignore_index=True does what the former reset_index(drop=True) did
    merged_vcsa_builds = pd.concat([vc7x_vcsa, vc67_vcsa], ignore_index=True)
    merged_windows_builds = pd.concat([vc67_win, vc_win_only], ignore_index=True)
    merged_vc_all_builds = pd.concat(
        [merged_vcsa_builds, merged_windows_builds], ignore_index=True
    )
    return [merged_vcsa_builds, merged_windows_builds, merged_vc_all_builds]


# vRA releases
class Kb2143850(KbData):
    """KB2143850 (vRA) release data with the combined build/version column split up."""

    def __init__(self, kb_id):
        """Initialise the base KB object and parse its release table."""
        super().__init__(kb_id)
        self.list_of_dframes = self.parse_releasedata()

    def parse_releasedata(self):
        """Parse the HTML tables of the KB resolution section into dataframes.

        Only the first table is handled: its first row is promoted to the
        column header, the combined "Build Number - Version" column is split
        and "Release Date" is converted to datetimes. Every further table is
        reported on stdout and skipped.

        Returns:
            A list with one dataframe per handled table.
        """
        tables = pd.read_html(self.raw_html_resolution, flavor="bs4")
        # Contains a list of all tables converted to dataframes in the resolution section
        list_of_release_df = []
        for table_id, table in enumerate(tables):
            if table_id != 0:
                print("Unknown table added, please add handling")
                continue
            # The HTML table has no header, so the first row becomes the heading
            header_labels = table.iloc[:1].values.tolist()[0]
            # Copy so the column assignments below never write into a slice of the parsed table
            current_df = table.iloc[1:].copy()
            current_df.columns = header_labels
            current_df = self.transform_kb2143850(current_df)
            # Get the data types right; unparseable dates are coerced instead of raising
            current_df["Release Date"] = pd.to_datetime(current_df["Release Date"], infer_datetime_format=True,
                                                        errors='coerce')
            list_of_release_df.append(current_df)
        return list_of_release_df

    def transform_kb2143850(self, dataframe):
        """Special handling of KB2143850 (vRA).

        Splits the combined "Build Number - Version" column into "Build Number"
        and "Version". The split happens at the first " - " only, so a version
        text that itself contains " - " stays intact. When no row contains the
        separator, "Version" is filled with None rather than the column
        assignment raising a ValueError.

        Returns:
            The same dataframe object, extended in place when the combined
            column exists.
        """
        combined_label = "Build Number - Version"
        if combined_label not in dataframe:
            return dataframe
        # n=1 guarantees at most two result columns regardless of the cell content
        pieces = dataframe[combined_label].str.split(" - ", n=1, expand=True)
        dataframe["Build Number"] = pieces[0] if 0 in pieces.columns else None
        dataframe["Version"] = pieces[1] if 1 in pieces.columns else None
        return dataframe
27 changes: 16 additions & 11 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
__contact__ = "[email protected]"
__license__ = "GPLv3"
__status__ = "beta"
__version__ = "0.1.0"
__version__ = "0.2.0"

# Imports
from data_handling import create_json_output
from kb_data import KbData
from kb_data import KbData, Kb2143838, Kb2143850
from webparsing import parse_kb_article_ids
import os
import logging
Expand All @@ -46,14 +46,19 @@
# Driver loop: build one KB data object per article id returned for MASTERKBID and
# write it out once per entry of JSONRECORDS.
# NOTE(review): the lines below appear to interleave two revisions (the KbData
# construction and the output loop both occur twice) -- confirm the statement
# order against the checked-in file.
vmware_release_kbs = parse_kb_article_ids(MASTERKBID)
for kb_id in vmware_release_kbs:
logging.info(f"Creating object for KB id {kb_id}")
# Pass on the KB id to the data object to fill it
try:
kb_article = KbData(kb_id=kb_id)
except ValueError as err:
# NOTE(review): kb_article is assigned inside the try; if the constructor raised,
# kb_article.id here is not the object for this kb_id -- verify
print(f"cannot handle data from {kb_article.id} without breaking: {err}")
# Create outputs
for record_type in JSONRECORDS:
# Handle specific KBs by using extra Classes.
# KB2143838: vCenter
if kb_id == 2143838:
kb_article = Kb2143838(kb_id)
elif kb_id == 2143850:
kb_article = Kb2143850(kb_id)
else:
try:
create_json_output(kb_dataobject=kb_article, output_base_dir=OUTPUTBASEDIR, record_type=record_type)
# Pass on the KB id to the data object to fill it
kb_article = KbData(kb_id=kb_id)
except ValueError as err:
print(f"cannot create json data out as {record_type} from {kb_article.id} without breaking: {err}")
# NOTE(review): same concern as above -- kb_article may not belong to this kb_id when the constructor raised
print(f"cannot handle data from {kb_article.id} without breaking: {err}")
# Create outputs
for record_type in JSONRECORDS:
create_json_output(kb_dataobject=kb_article, output_base_dir=OUTPUTBASEDIR, record_type=record_type)

Loading

0 comments on commit b2365eb

Please sign in to comment.