From 1357d3a41fb6a29fc4519be12389e1f5f6280abf Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Tue, 6 Sep 2022 18:46:25 +0300
Subject: [PATCH 01/10] - rebase on master

---
 fedot/preprocessing/data_types.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index 1229b368d7..3537c08a5f 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -5,6 +5,7 @@
 import pandas as pd

 from fedot.core.log import LoggerAdapter, default_log
+from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum

@@ -82,15 +83,15 @@ def convert_data_for_fit(self, data: InputData):
         data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features,
                                                                               target=data.target,
                                                                               task=data.task)
-
-        self._into_numeric_features_transformation_for_fit(data)
-        # Launch conversion float and integer features into categorical
-        self._into_categorical_features_transformation_for_fit(data)
         # Save info about features and target types
         self.features_types = copy(data.supplementary_data.column_types['features'])
         self.target_types = copy(data.supplementary_data.column_types['target'])

-        self._retain_columns_info_without_types_conflicts(data)
+        if data.data_type is DataTypesEnum.table:
+            self._into_numeric_features_transformation_for_fit(data)
+            # Launch conversion of float and integer features into categorical
+            self._into_categorical_features_transformation_for_fit(data)
+            self._retain_columns_info_without_types_conflicts(data)
         return data

     def convert_data_for_predict(self, data: InputData):
@@ -103,11 +104,11 @@ def convert_data_for_predict(self, data: InputData):
         data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features,
                                                                               target=data.target,
                                                                               task=data.task)
-
-        # Convert column types
-        self._into_numeric_features_transformation_for_predict(data)
-        self._into_categorical_features_transformation_for_predict(data)
-        self._retain_columns_info_without_types_conflicts(data)
+        if data.data_type is DataTypesEnum.table:
+            # Convert column types
+            self._into_numeric_features_transformation_for_predict(data)
+            self._into_categorical_features_transformation_for_predict(data)
+            self._retain_columns_info_without_types_conflicts(data)
         return data

     def remove_incorrect_features(self, table: np.array, converted_columns: dict):
From 404e8880ba01db3b3564397a4cbad3e3601b0f75 Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Tue, 20 Sep 2022 13:36:57 +0300
Subject: [PATCH 02/10] - now tfidf tries to fit on every potential text col - reduced fraction of unique values - added min vocabulary size param - added class labels property to multimodal

---
 fedot/core/constants.py                  |  3 +-
 fedot/core/data/data_detection.py        | 29 +++++++++++++++----
 .../data/default_operation_params.json   |  2 +-
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/fedot/core/constants.py b/fedot/core/constants.py
index f76af8e533..7f28e05e5c 100644
--- a/fedot/core/constants.py
+++ b/fedot/core/constants.py
@@ -14,7 +14,8 @@
 MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100

-FRACTION_OF_UNIQUE_VALUES = 0.95
+FRACTION_OF_UNIQUE_VALUES = 0.7
+MIN_VOCABULARY_SIZE = 20

 default_data_split_ratio_by_task = {
     TaskTypesEnum.classification: 0.8,

diff --git a/fedot/core/data/data_detection.py b/fedot/core/data/data_detection.py
index 8d315c9ccf..f372938ec6 100644
--- a/fedot/core/data/data_detection.py
+++ b/fedot/core/data/data_detection.py
@@ -3,8 +3,10 @@
 import numpy as np
 import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer

-from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES
+from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES, MIN_VOCABULARY_SIZE
+from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository

 ALLOWED_NAN_PERCENT = 0.9

@@ -75,11 +77,15 @@ def _column_contains_text(self, column: pd.Series) -> bool:
         :param column: pandas series with data
         :return: True if column contains text
         """
-        if column.dtype == object and not self._is_float_compatible(column):
-            unique_num = len(column.unique())
-            nan_num = pd.isna(column).sum()
-            return unique_num / len(column) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
-                else (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES
+        if column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
+            try:
+                params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
+                tfidf_vectorizer = TfidfVectorizer(**params)
+                tfidf_vectorizer.fit(np.where(pd.isna(column), '', column))
+                if len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE:
+                    return True
+            except ValueError:
+                print(f'Column {column.name} possibly contains text, but it is not possible to vectorize it')
         return False

     @staticmethod
@@ -96,6 +102,17 @@ def _is_float_compatible(column: pd.Series) -> bool:
         failed_ratio = failed_objects_number / non_nan_all_objects_number
         return failed_ratio < 0.5

+    @staticmethod
+    def _has_unique_values(column: pd.Series) -> bool:
+        """
+        :param column: pandas series with data
+        :return: True if the fraction of unique column values is above the threshold
+        """
+        unique_num = len(column.unique())
+        nan_num = pd.isna(column).sum()
+        return unique_num / len(column) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
+            else (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES
+

 class TimeSeriesDataDetector(DataDetector):
     """
diff --git a/fedot/core/repository/data/default_operation_params.json b/fedot/core/repository/data/default_operation_params.json
index 6b6a5495bd..a4a3676032 100644
--- a/fedot/core/repository/data/default_operation_params.json
+++ b/fedot/core/repository/data/default_operation_params.json
@@ -138,7 +138,7 @@
     "model_name": "glove-twitter-25"
   },
   "tfidf": {
-    "min_df": 0.1,
+    "min_df": 0.01,
     "max_df": 0.9
   },
   "fast_ica": {
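The vocabulary-size check above can be exercised outside FEDOT. A minimal sketch of the same heuristic, with the tf-idf parameters inlined rather than read from DefaultOperationParamsRepository (the helper name looks_like_text and the toy columns are ours, for illustration only):

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

MIN_VOCABULARY_SIZE = 20

def looks_like_text(column: pd.Series) -> bool:
    # Mirror _column_contains_text: fit a tf-idf vocabulary and treat the
    # column as text only if the vocabulary is large enough
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.9)
    try:
        vectorizer.fit(np.where(pd.isna(column), '', column))
    except ValueError:  # raised when pruning leaves an empty vocabulary
        return False
    return len(vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE

reviews = pd.Series([f'a long free-form review mentioning topic {i} and detail {i * 7}'
                     for i in range(50)])
codes = pd.Series(['AA-1', 'BB-2', 'CC-3'] * 17)
print(looks_like_text(reviews), looks_like_text(codes))  # expected: True False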
From 75341b1375e932b8128daecb6d874ffcb754cc98 Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Wed, 21 Sep 2022 16:31:39 +0300
Subject: [PATCH 03/10] - added autodetection of columns with links - tests adapted for new functionality

---
 fedot/core/constants.py                        |  2 +-
 fedot/core/data/data_detection.py              | 29 +++++++++++++++++--
 fedot/core/data/multi_modal.py                 |  6 ++--
 ...multimodal_data_with_complicated_types.csv  |  4 +--
 test/unit/data/test_multimodal_data.py         |  4 +--
 5 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/fedot/core/constants.py b/fedot/core/constants.py
index 7f28e05e5c..010a308eaf 100644
--- a/fedot/core/constants.py
+++ b/fedot/core/constants.py
@@ -14,7 +14,7 @@
 MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100

-FRACTION_OF_UNIQUE_VALUES = 0.7
+FRACTION_OF_UNIQUE_VALUES = 0.6
 MIN_VOCABULARY_SIZE = 20

 default_data_split_ratio_by_task = {

diff --git a/fedot/core/data/data_detection.py b/fedot/core/data/data_detection.py
index f372938ec6..e3437b0269 100644
--- a/fedot/core/data/data_detection.py
+++ b/fedot/core/data/data_detection.py
@@ -40,12 +40,29 @@ def define_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
                 text_columns.append(column_name)
         return text_columns

+    def define_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
+        """
+        :param data_frame: pandas dataframe with data
+        :return: list of link columns' names
+        """
+        link_columns = []
+        for column_name in data_frame.columns:
+            if self.is_link(data_frame[column_name]):
+                link_columns.append(column_name)
+        return link_columns
+
     @staticmethod
     def is_full_of_nans(text_data: np.array) -> bool:
         if np.sum(pd.isna(text_data)) / len(text_data) > ALLOWED_NAN_PERCENT:
             return True
         return False

+    @staticmethod
+    def is_link(text_data: np.array) -> bool:
+        if str(next(el for el in text_data if el is not None)).startswith('http'):
+            return True
+        return False
+
     @staticmethod
     def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
         """ Prepares MultiModal text data in a form of dictionary
@@ -72,12 +89,18 @@ def _column_contains_text(self, column: pd.Series) -> bool:
         Column contains text if:
         1. it's not float or float compatible (e.g. ['1.2', '2.3', '3.4', ...] is float too)
-        2. fraction of unique values (except nans) is more than 0.95
+        2. fraction of unique values (except nans) is more than 0.6
+        3. size of tfidf vocabulary is more than 20
+
+        If the size of the tfidf vocabulary is less than 20, the column probably
+        contains text too, but it cannot be vectorized and used in a model

         :param column: pandas series with data
-        :return: True if column contains text
+        :return: True if the column contains text; False otherwise (including link columns)
         """
-        if column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
+        if self.is_link(column):
+            return False
+        elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
             try:
                 params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
                 tfidf_vectorizer = TfidfVectorizer(**params)

diff --git a/fedot/core/data/multi_modal.py b/fedot/core/data/multi_modal.py
index 1239b17392..3b05f3fd8e 100644
--- a/fedot/core/data/multi_modal.py
+++ b/fedot/core/data/multi_modal.py
@@ -170,14 +170,16 @@ def from_csv(cls,
         if not text_columns:
             text_columns = text_data_detector.define_text_columns(data_frame)
+        link_columns = text_data_detector.define_link_columns(data_frame)
+        columns_to_drop = text_columns + link_columns

         data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns)
-        data_frame_table = data_frame.drop(columns=text_columns)
+        data_frame_table = data_frame.drop(columns=columns_to_drop)
         table_features, target = process_target_and_features(data_frame_table, target_columns)

         data_part_transformation_func = partial(array_to_input_data, idx=idx, target_array=target, task=task)

-        # create labels for text data sources and remove source if there are many nans
+        # create labels for text data sources and remove a source if there are many nans or the text is a link
         sources = dict((text_data_detector.new_key_name(data_part_key),
                         data_part_transformation_func(features_array=data_part, data_type=DataTypesEnum.text))
                        for (data_part_key, data_part) in data_text.items()

diff --git a/test/data/multimodal_data_with_complicated_types.csv b/test/data/multimodal_data_with_complicated_types.csv
index d81bc7516e..fee5c23c15 100644
--- a/test/data/multimodal_data_with_complicated_types.csv
+++ b/test/data/multimodal_data_with_complicated_types.csv
@@ -5,7 +5,7 @@
 3,3,,4,4,3,"make a type specimen book. It has survived not only five centuries, but also",4, a ,True,,,0,2,yes
 4,4,,5,5,0,"the leap into electronic typesetting, remaining essentially unchanged. It was",5, b ,,,,0,3,no
 5,5,,6,6,0,popularised in the 1960s with the release of Letraset sheets containing Lorem ,6, c ,False,,,0,4,no
-6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,sample text,sample text,1,5,no
+6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,"Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.","Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.",1,5,no
 7,7,inf,8,8,1,PageMaker including versions of Lorem Ipsum.,1, b ,,,4,0,6,no
 8,inf,inf,9,9,2,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots ",2,,True,,,1,7,no
 9,9,inf,10,10,2,"in a piece of classical Latin literature from 45 BC, making it over 2000 years ",3, c ,False,,,0,8,yes
@@ -16,4 +16,4 @@
 14,14,,3,3,2,"1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good ",1,a,False,,,,error,no
 15,15,,4,4,1,"and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of",2,a ,False,,,,13,no
 16,16,2,5,12,0,"ethics, very popular during the Renaissance. The first line of Lorem Ipsum,",3, d ,True,,,1,16,yes
-17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,another sample text,0,17,no
+17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",0,17,no

diff --git a/test/unit/data/test_multimodal_data.py b/test/unit/data/test_multimodal_data.py
index 4ecdafb931..baa87757bb 100644
--- a/test/unit/data/test_multimodal_data.py
+++ b/test/unit/data/test_multimodal_data.py
@@ -43,7 +43,7 @@ def test_multi_modal_data():
     multi_modal.target = new_target

     assert np.array_equal(multi_modal.target, new_target)

-
+# TODO: make a test for text column autodetection
 def test_multimodal_data_from_csv():
     """
     Checking correctness of MultiModalData import from csv file.
@@ -54,7 +54,7 @@ def test_multimodal_data_from_csv():
     text_data = np.array(df['description'])
     table_data = np.array(df.drop(columns=['id', 'description', 'variety']))
     target = np.array(df['variety']).reshape(-1, 1)
-    actual_data = MultiModalData.from_csv(path)
+    actual_data = MultiModalData.from_csv(path, text_columns=['description'])
     actual_text_features = actual_data['data_source_text/description'].features
     actual_table_features = actual_data['data_source_table'].features
     actual_target = actual_data.target
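With this patch, a column of URLs is excluded from autodetected text and dropped from the table part via columns_to_drop. A hypothetical check of the new detector (the column values are made up):

import pandas as pd
from fedot.core.data.data_detection import TextDataDetector

detector = TextDataDetector()
df = pd.DataFrame({
    'id': ['1', '2', '3'],
    'url': ['http://example.com/a', 'http://example.com/b', 'http://example.com/c'],
})
print(detector.define_link_columns(df))  # expected: ['url']
# In from_csv above, such columns join columns_to_drop, so they reach
# neither the table features nor the text data sources.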
From 2cdb32c0abb6204a5a52853701cbb8149b2a708a Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Thu, 29 Sep 2022 14:32:57 +0200
Subject: [PATCH 04/10] - fixed bug with crash on tuning

---
 fedot/core/pipelines/tuning/search_space.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fedot/core/pipelines/tuning/search_space.py b/fedot/core/pipelines/tuning/search_space.py
index 177ec09391..ae988f7863 100644
--- a/fedot/core/pipelines/tuning/search_space.py
+++ b/fedot/core/pipelines/tuning/search_space.py
@@ -286,7 +286,7 @@ def get_parameters_dict(self):
             },
             'tfidf': {
                 'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3)]]),
-                'min_df': (hp.uniform, [0.0001, 0.1]),
+                'min_df': (hp.uniform, [0.0001, 0.01]),
                 'max_df': (hp.uniform, [0.9, 0.99])
             },
         }
From f6e9828cb32f78cf638b76d3599ce54527d582aa Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Thu, 6 Oct 2022 14:50:11 +0200
Subject: [PATCH 05/10] - added ngram_range=(1,4) to search space - minor changes

---
 fedot/core/constants.py                     |  2 +-
 fedot/core/data/data.py                     |  2 +-
 fedot/core/data/data_detection.py           | 50 ++++++++++++---------
 fedot/core/data/multi_modal.py              | 10 ++---
 fedot/core/pipelines/tuning/search_space.py |  2 +-
 5 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/fedot/core/constants.py b/fedot/core/constants.py
index 010a308eaf..2cd765938b 100644
--- a/fedot/core/constants.py
+++ b/fedot/core/constants.py
@@ -14,7 +14,7 @@
 MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100

-FRACTION_OF_UNIQUE_VALUES = 0.6
+FRACTION_OF_UNIQUE_VALUES_IN_TEXT = 0.6
 MIN_VOCABULARY_SIZE = 20

 default_data_split_ratio_by_task = {

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index 4cf5b6f2f9..095855b648 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -334,7 +334,7 @@ def num_classes(self) -> Optional[int]:
         return len(unique_values) if unique_values is not None else None

     @property
-    def class_labels(self) -> Optional[int]:
+    def class_labels(self) -> Optional[List[Union[int, str, float]]]:
         """Returns unique class labels that are present in the target"""
         if self.task.task_type == TaskTypesEnum.classification and self.target is not None:
             return np.unique(self.target)

diff --git a/fedot/core/data/data_detection.py b/fedot/core/data/data_detection.py
index e3437b0269..75d759f7ff 100644
--- a/fedot/core/data/data_detection.py
+++ b/fedot/core/data/data_detection.py
@@ -1,11 +1,13 @@
 from abc import abstractmethod
 from typing import List
+import re

 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer

-from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES, MIN_VOCABULARY_SIZE
+from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES_IN_TEXT, MIN_VOCABULARY_SIZE
+from fedot.core.log import default_log
 from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository

 ALLOWED_NAN_PERCENT = 0.9

@@ -28,27 +30,25 @@ class TextDataDetector(DataDetector):
     """
     Class for detecting text data during its import.
     """
+    def __init__(self):
+        self.logger = default_log(prefix='FEDOT logger')
+        super().__init__()

-    def define_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
+    def find_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
         """
         :param data_frame: pandas dataframe with data
         :return: list of text columns' names
         """
-        text_columns = []
-        for column_name in data_frame.columns:
-            if self._column_contains_text(data_frame[column_name]):
-                text_columns.append(column_name)
+        text_columns = [column_name for column_name in data_frame.columns
+                        if self._column_contains_text(data_frame[column_name])]
         return text_columns

-    def define_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
+    def find_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
         """
         :param data_frame: pandas dataframe with data
         :return: list of link columns' names
         """
-        link_columns = []
-        for column_name in data_frame.columns:
-            if self.is_link(data_frame[column_name]):
-                link_columns.append(column_name)
+        link_columns = [column_name for column_name in data_frame.columns if self.is_link(data_frame[column_name])]
         return link_columns

     @staticmethod
@@ -59,9 +59,9 @@ def is_full_of_nans(text_data: np.array) -> bool:
     @staticmethod
     def is_link(text_data: np.array) -> bool:
-        if str(next(el for el in text_data if el is not None)).startswith('http'):
-            return True
-        return False
+        link_pattern = \
+            '[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'
+        return re.search(link_pattern, str(next(el for el in text_data if el is not None))) is not None

     @staticmethod
     def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
@@ -101,14 +101,14 @@ def _column_contains_text(self, column: pd.Series) -> bool:
         if self.is_link(column):
             return False
         elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
+            params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
+            tfidf_vectorizer = TfidfVectorizer(**params)
             try:
-                params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
-                tfidf_vectorizer = TfidfVectorizer(**params)
+                # TODO: grey zone columns (neither text nor numerical) are not processed yet; they should be dropped
                 tfidf_vectorizer.fit(np.where(pd.isna(column), '', column))
-                if len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE:
-                    return True
+                return len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE
             except ValueError:
-                print(f'Column {column.name} possibly contains text, but it is not possible to vectorize it')
+                self.logger.warning(f"Column {column.name} possibly contains text, but it's impossible to vectorize it")
         return False

     @staticmethod
@@ -133,8 +133,13 @@ def _has_unique_values(column: pd.Series) -> bool:
         """
         unique_num = len(column.unique())
         nan_num = pd.isna(column).sum()
-        return unique_num / len(column) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
-            else (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES
+        # fraction of unique values in the column if there are no nans
+        frac_unique_is_bigger_than_threshold = unique_num / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
+        # fraction of unique values in the column if there are nans
+        frac_unique_is_bigger_than_threshold_with_nans = \
+            (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
+        return frac_unique_is_bigger_than_threshold if nan_num == 0 \
+            else frac_unique_is_bigger_than_threshold_with_nans

 class TimeSeriesDataDetector(DataDetector):
@@ -154,10 +159,13 @@ def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
         multi_modal_ts_data = {}
         for column_name in columns:
             feature_ts = np.array(dataframe[column_name])
+            idx = list(dataframe['datetime'])

             # Will be the same
             multi_modal_ts_data.update({column_name: feature_ts})

+        multi_modal_ts_data['idx'] = np.asarray(idx)
+
         return multi_modal_ts_data

     @staticmethod

diff --git a/fedot/core/data/multi_modal.py b/fedot/core/data/multi_modal.py
index 3b05f3fd8e..7dc002378d 100644
--- a/fedot/core/data/multi_modal.py
+++ b/fedot/core/data/multi_modal.py
@@ -59,10 +59,8 @@ def data_type(self):
     @property
     def num_classes(self) -> Optional[int]:
-        if self.task.task_type == TaskTypesEnum.classification:
-            return len(np.unique(self.target))
-        else:
-            return None
+        unique_values = self.class_labels
+        return len(unique_values) if unique_values is not None else None

     @property
     def class_labels(self) -> Optional[List[Union[int, str, float]]]:
@@ -168,9 +166,9 @@ def from_csv(cls,
         text_columns = [text_columns] if isinstance(text_columns, str) else text_columns

         if not text_columns:
-            text_columns = text_data_detector.define_text_columns(data_frame)
+            text_columns = text_data_detector.find_text_columns(data_frame)

-        link_columns = text_data_detector.define_link_columns(data_frame)
+        link_columns = text_data_detector.find_link_columns(data_frame)
         columns_to_drop = text_columns + link_columns
         data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns)

diff --git a/fedot/core/pipelines/tuning/search_space.py b/fedot/core/pipelines/tuning/search_space.py
index ae988f7863..55b2fa5445 100644
--- a/fedot/core/pipelines/tuning/search_space.py
+++ b/fedot/core/pipelines/tuning/search_space.py
@@ -285,7 +285,7 @@ def get_parameters_dict(self):
                                            'glove-wiki-gigaword-100', 'word2vec-ruscorpora-300']])
             },
             'tfidf': {
-                'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3)]]),
+                'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3), (1, 4)]]),
                 'min_df': (hp.uniform, [0.0001, 0.01]),
                 'max_df': (hp.uniform, [0.9, 0.99])
             },
From 57bfc8607165245263d6895676533c867334b609 Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Thu, 6 Oct 2022 15:04:47 +0200
Subject: [PATCH 06/10] - minor changes

---
 fedot/core/constants.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fedot/core/constants.py b/fedot/core/constants.py
index 2cd765938b..90ce0b8d7b 100644
--- a/fedot/core/constants.py
+++ b/fedot/core/constants.py
@@ -14,6 +14,7 @@
 MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100

+# constants for text processing
 FRACTION_OF_UNIQUE_VALUES_IN_TEXT = 0.6
 MIN_VOCABULARY_SIZE = 20
From 507d136ffafbb1acc625f34b8d5f3a2247c07948 Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Sat, 8 Oct 2022 14:23:27 +0200
Subject: [PATCH 07/10] - max_features of tfidf is set to 1e5 to reduce memory consumption

---
 fedot/core/repository/data/default_operation_params.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fedot/core/repository/data/default_operation_params.json b/fedot/core/repository/data/default_operation_params.json
index a4a3676032..0430f8868c 100644
--- a/fedot/core/repository/data/default_operation_params.json
+++ b/fedot/core/repository/data/default_operation_params.json
@@ -139,7 +139,8 @@
   },
   "tfidf": {
     "min_df": 0.01,
-    "max_df": 0.9
+    "max_df": 0.9,
+    "max_features": 100000
   },
   "fast_ica": {
     "whiten": "unit-variance"
From edffc875968f483141089e3fa24fc42c74e93889 Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Thu, 13 Oct 2022 18:30:53 +0200
Subject: [PATCH 08/10] - word2vec trained outline

---
 fedot/core/data/multi_modal.py          |  1 +
 .../data_operations/text_pretrained.py  | 26 ++++++++++++++++---
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/fedot/core/data/multi_modal.py b/fedot/core/data/multi_modal.py
index 7dc002378d..7ec06bb650 100644
--- a/fedot/core/data/multi_modal.py
+++ b/fedot/core/data/multi_modal.py
@@ -169,6 +169,7 @@ def from_csv(cls,
             text_columns = text_data_detector.find_text_columns(data_frame)

         link_columns = text_data_detector.find_link_columns(data_frame)
+        # TODO: log the drop of link columns
         columns_to_drop = text_columns + link_columns
         data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns)
         data_frame_table = data_frame.drop(columns=columns_to_drop)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py
index 7cfb0e0c2a..041bc12534 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py
@@ -13,7 +13,7 @@
 try:
     import gensim.downloader as api
-    from gensim.models import KeyedVectors
+    from gensim.models import KeyedVectors, Word2Vec
 except ModuleNotFoundError:
     warn_requirement('gensim')
     api = None
@@ -50,18 +50,21 @@ def transform(self, input_data: InputData) -> OutputData:
         return output_data

     @staticmethod
-    def vectorize_avg(text: str, embeddings):
+    def vectorize_avg(text: np.array, embeddings) -> np.array:
         """ Method converts text to an average of token vectors

-        :param text: str with text data
+        :param text: np.array with text data
         :param embeddings: gensim pretrained embeddings
         :return features: one-dimensional np.array with numbers
         """
+        def _arr2string(array: np.array) -> str:
+            return np.array2string(array).replace('[', '').replace(']', '').replace('"', '')
+
         embedding_dim = embeddings.vectors.shape[1]
         features = np.zeros([embedding_dim], dtype='float32')
         num_words = 0
-        for word in text.split():
+        for word in _arr2string(text).split():
             if word in embeddings:
                 features += embeddings[f'{word}']
                 num_words += 1
@@ -79,3 +82,18 @@ def _download_model_resources(self):
         if os.path.exists(model_path):
             self.logger.info('Embeddings are already downloaded. Loading model...')
             self.model = KeyedVectors.load_word2vec_format(model_path, binary=False)
+
+
+class TrainedEmbeddingsImplementation(DataOperationImplementation):
+
+    def __init__(self, **params: Optional[dict]):
+        self.params = params
+        self.logger = default_log(prefix='FEDOT logger')
+        super().__init__()
+
+    def fit(self, input_data: InputData):
+        pass
+
+    def transform(self, input_data: InputData) -> OutputData:
+        pass
+
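The reworked vectorize_avg converts a text cell to the mean of its in-vocabulary token vectors. The averaging logic can be checked against a toy gensim model (a sketch: the two-word KeyedVectors stands in for the downloaded glove-twitter-25 weights, and the final division follows the docstring's "average of token vectors"; gensim 4.x API assumed):

import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=3)
kv.add_vectors(['hello', 'world'], np.array([[1., 0., 1.], [0., 2., 1.]], dtype='float32'))

def vectorize_avg(text: str, embeddings) -> np.ndarray:
    # Sum the vectors of known tokens and skip out-of-vocabulary ones
    features = np.zeros([embeddings.vectors.shape[1]], dtype='float32')
    num_words = 0
    for word in text.split():
        if word in embeddings:
            features += embeddings[word]
            num_words += 1
    return features / num_words if num_words else features

print(vectorize_avg('hello brave world', kv))  # -> [0.5 1.  1. ]; 'brave' is skipped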
From c0ed0d7643d11ed9c4d70d07f3619c2d92201b8d Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Tue, 13 Dec 2022 15:17:55 +0300
Subject: [PATCH 09/10] Fixed R2 calculation

---
 fedot/core/composer/metrics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fedot/core/composer/metrics.py b/fedot/core/composer/metrics.py
index 40a344414a..0b600f4a51 100644
--- a/fedot/core/composer/metrics.py
+++ b/fedot/core/composer/metrics.py
@@ -214,6 +214,7 @@ class R2(QualityMetric):
     default_value = 0

     @staticmethod
+    @from_maximised_metric
     def metric(reference: InputData, predicted: OutputData) -> float:
         return r2_score(y_true=reference.target, y_pred=predicted.predict)

From 86ae96fde5d0f99dc3c095c8b3d93909a9f20f3c Mon Sep 17 00:00:00 2001
From: andreygetmanov
Date: Mon, 19 Dec 2022 16:51:45 +0300
Subject: [PATCH 10/10] Fixed bug with regression test

---
 test/unit/composer/test_quality_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit/composer/test_quality_metrics.py b/test/unit/composer/test_quality_metrics.py
index b0c460f391..fa1ad018d2 100644
--- a/test/unit/composer/test_quality_metrics.py
+++ b/test/unit/composer/test_quality_metrics.py
@@ -91,7 +91,7 @@ def test_regression_quality_metric(data_setup):
     for metric in RegressionMetricsEnum:
         metric_function = MetricsRepository().metric_by_id(metric)
         metric_value = metric_function(pipeline=pipeline, reference_data=train)
-        assert metric_value > 0
+        assert 0 < abs(metric_value) < sys.maxsize
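The last two patches belong together: once R2 is wrapped in from_maximised_metric, FEDOT reports the negated sklearn score (the framework minimizes metric values), so a well-fitted regression pipeline yields a negative number and the test has to assert on abs(metric_value). A worked sketch (the decorator body is an assumption inferred from this usage, not quoted from fedot/core/composer/metrics.py):

from sklearn.metrics import r2_score

def from_maximised_metric(metric_func):
    # FEDOT minimizes metrics, so a maximized score is negated
    def wrapper(*args, **kwargs):
        return -metric_func(*args, **kwargs)
    return wrapper

@from_maximised_metric
def r2_metric(y_true, y_pred):
    return r2_score(y_true=y_true, y_pred=y_pred)

value = r2_metric([1.0, 2.0, 3.0], [1.1, 1.9, 3.2])
print(value)             # about -0.97: a good fit, reported as a negative value
assert 0 < abs(value)    # mirrors the adjusted regression test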