Skip to content

Commit

Permalink
Update transform.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ian-cho authored Feb 5, 2025
1 parent 6579ed2 commit 7e84256
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions transforms/universal/bloom/dpk_bloom/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
:param table: Pyarrow table
:return: a table with an additional is_in_GneissWeb column
"""
# make sure that the table contains "contents" column
TransformUtils.validate_columns(table=table, required=[self.doc_text_column])
# The check that the table contains the "contents" column is disabled (commented out) below.
# TransformUtils.validate_columns(table=table, required=[self.doc_text_column]) # Commented out to prevent errors when the Parquet file does not contain the "contents" column.
self.df = table.to_pandas()
df_id_list = []
for i in range(len(self.df)):
id_ = self.df.iloc[i]['id']
id_ = self.df.iloc[i]['id'] # Please make sure the UUID column in the Parquet file is named "id".
df_id_list.append(id_)

data_bloom_label = self._apply_model(df_id_list, self.batch_size)
Expand Down Expand Up @@ -153,4 +153,4 @@ def apply_input_params(self, args: Namespace) -> bool:
self.params["inference_engine"] = args.inference_engine
self.params["batch_size"] = args.batch_size
logger.info(f"bloom params are {self.params} ")
return True
return True

0 comments on commit 7e84256

Please sign in to comment.