diff --git a/fedot/core/composer/metrics.py b/fedot/core/composer/metrics.py index 40a344414a..0b600f4a51 100644 --- a/fedot/core/composer/metrics.py +++ b/fedot/core/composer/metrics.py @@ -214,6 +214,7 @@ class R2(QualityMetric): default_value = 0 @staticmethod + @from_maximised_metric def metric(reference: InputData, predicted: OutputData) -> float: return r2_score(y_true=reference.target, y_pred=predicted.predict) diff --git a/fedot/core/constants.py b/fedot/core/constants.py index f76af8e533..90ce0b8d7b 100644 --- a/fedot/core/constants.py +++ b/fedot/core/constants.py @@ -14,7 +14,9 @@ MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100 -FRACTION_OF_UNIQUE_VALUES = 0.95 +# constants for text processing +FRACTION_OF_UNIQUE_VALUES_IN_TEXT = 0.6 +MIN_VOCABULARY_SIZE = 20 default_data_split_ratio_by_task = { TaskTypesEnum.classification: 0.8, diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 4cf5b6f2f9..095855b648 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -334,7 +334,7 @@ def num_classes(self) -> Optional[int]: return len(unique_values) if unique_values is not None else None @property - def class_labels(self) -> Optional[int]: + def class_labels(self) -> Optional[List[Union[int, str, float]]]: """Returns unique class labels that are present in the target""" if self.task.task_type == TaskTypesEnum.classification and self.target is not None: return np.unique(self.target) diff --git a/fedot/core/data/data_detection.py b/fedot/core/data/data_detection.py index 8d315c9ccf..75d759f7ff 100644 --- a/fedot/core/data/data_detection.py +++ b/fedot/core/data/data_detection.py @@ -1,10 +1,14 @@ from abc import abstractmethod from typing import List +import re import numpy as np import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer -from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES +from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES_IN_TEXT, MIN_VOCABULARY_SIZE +from fedot.core.log import default_log +from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository ALLOWED_NAN_PERCENT = 0.9 @@ -26,24 +30,39 @@ class TextDataDetector(DataDetector): """ Class for detecting text data during its import. 
""" + def __init__(self): + self.logger = default_log(prefix='FEDOT logger') + super().__init__() - def define_text_columns(self, data_frame: pd.DataFrame) -> List[str]: + def find_text_columns(self, data_frame: pd.DataFrame) -> List[str]: """ :param data_frame: pandas dataframe with data :return: list of text columns' names """ - text_columns = [] - for column_name in data_frame.columns: - if self._column_contains_text(data_frame[column_name]): - text_columns.append(column_name) + text_columns = [column_name for column_name in data_frame.columns + if self._column_contains_text(data_frame[column_name])] return text_columns + def find_link_columns(self, data_frame: pd.DataFrame) -> List[str]: + """ + :param data_frame: pandas dataframe with data + :return: list of link columns' names + """ + link_columns = [column_name for column_name in data_frame.columns if self.is_link(data_frame[column_name])] + return link_columns + @staticmethod def is_full_of_nans(text_data: np.array) -> bool: if np.sum(pd.isna(text_data)) / len(text_data) > ALLOWED_NAN_PERCENT: return True return False + @staticmethod + def is_link(text_data: np.array) -> bool: + link_pattern = \ + '[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)' + return re.search(link_pattern, str(next(el for el in text_data if el is not None))) is not None + @staticmethod def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict: """ Prepares MultiModal text data in a form of dictionary @@ -70,16 +89,26 @@ def _column_contains_text(self, column: pd.Series) -> bool: Column contains text if: 1. it's not float or float compatible (e.g. ['1.2', '2.3', '3.4', ...] is float too) - 2. fraction of unique values (except nans) is more than 0.95 + 2. fraction of unique values (except nans) is more than 0.6 + 3. size of tfidf vocabulary is more than 20 + + If size of tfidf vocabulary is less than 20, then it is probably + text column too, but it cannot be vectorized and used in model :param column: pandas series with data - :return: True if column contains text + :return: True if column contains text, False otherwise or if column contains links """ - if column.dtype == object and not self._is_float_compatible(column): - unique_num = len(column.unique()) - nan_num = pd.isna(column).sum() - return unique_num / len(column) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \ - else (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES + if self.is_link(column): + return False + elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column): + params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf') + tfidf_vectorizer = TfidfVectorizer(**params) + try: + # TODO now grey zone columns (not text, not numerical) are not processed. 
Need to drop them + tfidf_vectorizer.fit(np.where(pd.isna(column), '', column)) + return len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE + except ValueError: + self.logger.warning(f"Column {column.name} possibly contains text, but it's impossible to vectorize it") return False @staticmethod @@ -96,6 +125,22 @@ def _is_float_compatible(column: pd.Series) -> bool: failed_ratio = failed_objects_number / non_nan_all_objects_number return failed_ratio < 0.5 + @staticmethod + def _has_unique_values(column: pd.Series) -> bool: + """ + :param column: pandas series with data + :return: True if number of unique column values > threshold + """ + unique_num = len(column.unique()) + nan_num = pd.isna(column).sum() + # fraction of unique values in column if there is no nans + frac_unique_is_bigger_than_threshold = unique_num / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT + # fraction of unique values in column if there are nans + frac_unique_is_bigger_than_threshold_with_nans = \ + (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT + return frac_unique_is_bigger_than_threshold if nan_num == 0 \ + else frac_unique_is_bigger_than_threshold_with_nans + class TimeSeriesDataDetector(DataDetector): """ @@ -114,10 +159,13 @@ def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict multi_modal_ts_data = {} for column_name in columns: feature_ts = np.array(dataframe[column_name]) + idx = list(dataframe['datetime']) # Will be the same multi_modal_ts_data.update({column_name: feature_ts}) + multi_modal_ts_data['idx'] = np.asarray(idx) + return multi_modal_ts_data @staticmethod diff --git a/fedot/core/data/multi_modal.py b/fedot/core/data/multi_modal.py index 1239b17392..7ec06bb650 100644 --- a/fedot/core/data/multi_modal.py +++ b/fedot/core/data/multi_modal.py @@ -59,10 +59,8 @@ def data_type(self): @property def num_classes(self) -> Optional[int]: - if self.task.task_type == TaskTypesEnum.classification: - return len(np.unique(self.target)) - else: - return None + unique_values = self.class_labels + return len(unique_values) if unique_values is not None else None @property def class_labels(self) -> Optional[List[Union[int, str, float]]]: @@ -168,16 +166,19 @@ def from_csv(cls, text_columns = [text_columns] if isinstance(text_columns, str) else text_columns if not text_columns: - text_columns = text_data_detector.define_text_columns(data_frame) + text_columns = text_data_detector.find_text_columns(data_frame) + link_columns = text_data_detector.find_link_columns(data_frame) + # TODO log drop of link columns + columns_to_drop = text_columns + link_columns data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns) - data_frame_table = data_frame.drop(columns=text_columns) + data_frame_table = data_frame.drop(columns=columns_to_drop) table_features, target = process_target_and_features(data_frame_table, target_columns) data_part_transformation_func = partial(array_to_input_data, idx=idx, target_array=target, task=task) - # create labels for text data sources and remove source if there are many nans + # create labels for text data sources and remove source if there are many nans or text is link sources = dict((text_data_detector.new_key_name(data_part_key), data_part_transformation_func(features_array=data_part, data_type=DataTypesEnum.text)) for (data_part_key, data_part) in data_text.items() diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py 
b/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py index 7cfb0e0c2a..041bc12534 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/text_pretrained.py @@ -13,7 +13,7 @@ try: import gensim.downloader as api - from gensim.models import KeyedVectors + from gensim.models import KeyedVectors, Word2Vec except ModuleNotFoundError: warn_requirement('gensim') api = None @@ -50,18 +50,21 @@ def transform(self, input_data: InputData) -> OutputData: return output_data @staticmethod - def vectorize_avg(text: str, embeddings): + def vectorize_avg(text: np.array, embeddings) -> np.array: """ Method converts text to an average of token vectors - :param text: str with text data + :param text: np.array with text data :param embeddings: gensim pretrained embeddings :return features: one-dimensional np.array with numbers """ + def _arr2string(array: np.array) -> str: + return np.array2string(array).replace('[', '').replace(']', '').replace('"', '') + embedding_dim = embeddings.vectors.shape[1] features = np.zeros([embedding_dim], dtype='float32') num_words = 0 - for word in text.split(): + for word in _arr2string(text).split(): if word in embeddings: features += embeddings[f'{word}'] num_words += 1 @@ -79,3 +82,18 @@ def _download_model_resources(self): if os.path.exists(model_path): self.logger.info('Embeddings are already downloaded. Loading model...') self.model = KeyedVectors.load_word2vec_format(model_path, binary=False) + + +class TrainedEmbeddingsImplementation(DataOperationImplementation): + + def __init__(self, **params: Optional[dict]): + self.params = params + self.logger = default_log(prefix='FEDOT logger') + super().__init__() + + def fit(self, input_data: InputData): + pass + + def transform(self, input_data: InputData) -> OutputData: + pass + diff --git a/fedot/core/pipelines/tuning/search_space.py b/fedot/core/pipelines/tuning/search_space.py index 177ec09391..55b2fa5445 100644 --- a/fedot/core/pipelines/tuning/search_space.py +++ b/fedot/core/pipelines/tuning/search_space.py @@ -285,8 +285,8 @@ def get_parameters_dict(self): 'glove-wiki-gigaword-100', 'word2vec-ruscorpora-300']]) }, 'tfidf': { - 'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3)]]), - 'min_df': (hp.uniform, [0.0001, 0.1]), + 'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3), (1, 4)]]), + 'min_df': (hp.uniform, [0.0001, 0.01]), 'max_df': (hp.uniform, [0.9, 0.99]) }, } diff --git a/fedot/core/repository/data/default_operation_params.json b/fedot/core/repository/data/default_operation_params.json index 6b6a5495bd..0430f8868c 100644 --- a/fedot/core/repository/data/default_operation_params.json +++ b/fedot/core/repository/data/default_operation_params.json @@ -138,8 +138,9 @@ "model_name": "glove-twitter-25" }, "tfidf": { - "min_df": 0.1, - "max_df": 0.9 + "min_df": 0.01, + "max_df": 0.9, + "max_features": 100000 }, "fast_ica": { "whiten": "unit-variance" diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 1229b368d7..3537c08a5f 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -5,6 +5,7 @@ import pandas as pd from fedot.core.log import LoggerAdapter, default_log +from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum @@ -82,15 +83,15 @@ def convert_data_for_fit(self, data: InputData): 
data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) - - self._into_numeric_features_transformation_for_fit(data) - # Launch conversion float and integer features into categorical - self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types self.features_types = copy(data.supplementary_data.column_types['features']) self.target_types = copy(data.supplementary_data.column_types['target']) - self._retain_columns_info_without_types_conflicts(data) + if data.data_type is DataTypesEnum.table: + # Launch conversion float and integer features into categorical + self._into_numeric_features_transformation_for_fit(data) + self._into_categorical_features_transformation_for_fit(data) + self._retain_columns_info_without_types_conflicts(data) return data def convert_data_for_predict(self, data: InputData): @@ -103,11 +104,11 @@ def convert_data_for_predict(self, data: InputData): data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) - - # Convert column types - self._into_numeric_features_transformation_for_predict(data) - self._into_categorical_features_transformation_for_predict(data) - self._retain_columns_info_without_types_conflicts(data) + if data.data_type is DataTypesEnum.table: + # Convert column types + self._into_numeric_features_transformation_for_predict(data) + self._into_categorical_features_transformation_for_predict(data) + self._retain_columns_info_without_types_conflicts(data) return data def remove_incorrect_features(self, table: np.array, converted_columns: dict): diff --git a/test/data/multimodal_data_with_complicated_types.csv b/test/data/multimodal_data_with_complicated_types.csv index d81bc7516e..fee5c23c15 100644 --- a/test/data/multimodal_data_with_complicated_types.csv +++ b/test/data/multimodal_data_with_complicated_types.csv @@ -5,7 +5,7 @@ 3,3,,4,4,3,"make a type specimen book. It has survived not only five centuries, but also",4, a ,True,,,0,2,yes 4,4,,5,5,0,"the leap into electronic typesetting, remaining essentially unchanged. It was",5, b ,,,,0,3,no 5,5,,6,6,0,popularised in the 1960s with the release of Letraset sheets containing Lorem ,6, c ,False,,,0,4,no -6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,sample text,sample text,1,5,no +6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,"Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. 
Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.","Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.",1,5,no 7,7,inf,8,8,1,PageMaker including versions of Lorem Ipsum.,1, b ,,,4,0,6,no 8,inf,inf,9,9,2,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots ",2,,True,,,1,7,no 9,9,inf,10,10,2,"in a piece of classical Latin literature from 45 BC, making it over 2000 years ",3, c ,False,,,0,8,yes @@ -16,4 +16,4 @@ 14,14,,3,3,2,"1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good ",1,a,False,,,,error,no 15,15,,4,4,1,"and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of",2,a ,False,,,,13,no 16,16,2,5,12,0,"ethics, very popular during the Renaissance. The first line of Lorem Ipsum,",3, d ,True,,,1,16,yes -17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,another sample text,0,17,no +17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. 
The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",0,17,no diff --git a/test/unit/composer/test_quality_metrics.py b/test/unit/composer/test_quality_metrics.py index b0c460f391..fa1ad018d2 100644 --- a/test/unit/composer/test_quality_metrics.py +++ b/test/unit/composer/test_quality_metrics.py @@ -91,7 +91,7 @@ def test_regression_quality_metric(data_setup): for metric in RegressionMetricsEnum: metric_function = MetricsRepository().metric_by_id(metric) metric_value = metric_function(pipeline=pipeline, reference_data=train) - assert metric_value > 0 + assert 0 < abs(metric_value) < sys.maxsize def test_data_preparation_for_multi_target_correct(multi_target_data_setup): diff --git a/test/unit/data/test_multimodal_data.py b/test/unit/data/test_multimodal_data.py index 4ecdafb931..baa87757bb 100644 --- a/test/unit/data/test_multimodal_data.py +++ b/test/unit/data/test_multimodal_data.py @@ -43,7 +43,7 @@ def test_multi_modal_data(): multi_modal.target = new_target assert np.array_equal(multi_modal.target, new_target) - +# TODO make test of text columns autodetection def test_multimodal_data_from_csv(): """ Checking correctness of MultiModalData import from csv file. @@ -54,7 +54,7 @@ def test_multimodal_data_from_csv(): text_data = np.array(df['description']) table_data = np.array(df.drop(columns=['id', 'description', 'variety'])) target = np.array(df['variety']).reshape(-1, 1) - actual_data = MultiModalData.from_csv(path) + actual_data = MultiModalData.from_csv(path, text_columns=['description']) actual_text_features = actual_data['data_source_text/description'].features actual_table_features = actual_data['data_source_table'].features actual_target = actual_data.target
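Below is a minimal, self-contained sketch of the text-column heuristic this patch introduces in TextDataDetector._column_contains_text, assuming plain pandas/scikit-learn and a helper name of my own (looks_like_text): a column is treated as text when the fraction of unique non-nan values exceeds FRACTION_OF_UNIQUE_VALUES_IN_TEXT (0.6) and a TF-IDF vocabulary fitted on it is larger than MIN_VOCABULARY_SIZE (20). The TF-IDF parameters mirror the updated default_operation_params.json; the real detector additionally rejects link columns first and pulls these parameters from DefaultOperationParamsRepository.

    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    FRACTION_OF_UNIQUE_VALUES_IN_TEXT = 0.6
    MIN_VOCABULARY_SIZE = 20

    def looks_like_text(column: pd.Series) -> bool:
        # 1. the fraction of unique non-nan values must be high enough for free text
        nan_num = pd.isna(column).sum()
        non_nan_num = len(column) - nan_num
        if non_nan_num == 0:
            return False
        unique_num = len(column.unique()) - (1 if nan_num else 0)
        if unique_num / non_nan_num <= FRACTION_OF_UNIQUE_VALUES_IN_TEXT:
            return False
        # 2. the fitted TF-IDF vocabulary must be large enough to be worth vectorizing
        vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.9, max_features=100_000)
        try:
            vectorizer.fit(np.where(pd.isna(column), '', column))
        except ValueError:
            # the column possibly contains text, but it cannot be vectorized
            return False
        return len(vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE

Autodetection can still be bypassed the way the adjusted test does, by naming the text columns explicitly (the file path below is a placeholder):

    MultiModalData.from_csv('multimodal_data.csv', text_columns=['description'])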