Improvement of automatic text detection #903

Open

wants to merge 10 commits into base: master
1 change: 1 addition & 0 deletions fedot/core/composer/metrics.py
@@ -214,6 +214,7 @@ class R2(QualityMetric):
default_value = 0

@staticmethod
@from_maximised_metric
def metric(reference: InputData, predicted: OutputData) -> float:
return r2_score(y_true=reference.target, y_pred=predicted.predict)

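For context, `from_maximised_metric` is presumably the decorator FEDOT uses to turn a "higher is better" score into one the composer can minimise; the sketch below illustrates that assumption with a decorator of the same shape, not FEDOT's actual implementation.

# Illustrative sketch only: negate a maximised metric so a minimising
# optimiser can consume it. Names here are assumptions, not FEDOT's code.
from functools import wraps

from sklearn.metrics import r2_score


def negate_maximised_metric(metric):
    @wraps(metric)
    def wrapper(*args, **kwargs):
        return -metric(*args, **kwargs)
    return wrapper


@negate_maximised_metric
def r2_metric(y_true, y_pred) -> float:
    return r2_score(y_true=y_true, y_pred=y_pred)


print(r2_metric([1, 2, 3], [1, 2, 3]))  # -1.0: a perfect fit becomes the minimum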
4 changes: 3 additions & 1 deletion fedot/core/constants.py
@@ -14,7 +14,9 @@

MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100

FRACTION_OF_UNIQUE_VALUES = 0.95
# constants for text processing
FRACTION_OF_UNIQUE_VALUES_IN_TEXT = 0.6
MIN_VOCABULARY_SIZE = 20

default_data_split_ratio_by_task = {
TaskTypesEnum.classification: 0.8,
2 changes: 1 addition & 1 deletion fedot/core/data/data.py
@@ -334,7 +334,7 @@ def num_classes(self) -> Optional[int]:
return len(unique_values) if unique_values is not None else None

@property
def class_labels(self) -> Optional[int]:
def class_labels(self) -> Optional[List[Union[int, str, float]]]:
"""Returns unique class labels that are present in the target"""
if self.task.task_type == TaskTypesEnum.classification and self.target is not None:
return np.unique(self.target)
74 changes: 61 additions & 13 deletions fedot/core/data/data_detection.py
@@ -1,10 +1,14 @@
from abc import abstractmethod
from typing import List

import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES
from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES_IN_TEXT, MIN_VOCABULARY_SIZE
from fedot.core.log import default_log
from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository

ALLOWED_NAN_PERCENT = 0.9

@@ -26,24 +30,39 @@ class TextDataDetector(DataDetector):
"""
Class for detecting text data during its import.
"""
def __init__(self):
self.logger = default_log(prefix='FEDOT logger')
super().__init__()

def define_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
def find_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
"""
:param data_frame: pandas dataframe with data
:return: list of text columns' names
"""
text_columns = []
for column_name in data_frame.columns:
if self._column_contains_text(data_frame[column_name]):
text_columns.append(column_name)
text_columns = [column_name for column_name in data_frame.columns
if self._column_contains_text(data_frame[column_name])]
return text_columns

def find_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
"""
:param data_frame: pandas dataframe with data
:return: list of link columns' names
"""
link_columns = [column_name for column_name in data_frame.columns if self.is_link(data_frame[column_name])]
return link_columns

@staticmethod
def is_full_of_nans(text_data: np.array) -> bool:
if np.sum(pd.isna(text_data)) / len(text_data) > ALLOWED_NAN_PERCENT:
return True
return False

@staticmethod
def is_link(text_data: np.array) -> bool:
@Dreamlone (Collaborator) commented on Oct 1, 2022:

First, tests covering this functionality should be written, but as I understand from the PR description, they are already in progress.

Second, the PR description says that "Columns with links (they don't contain useful information and sometimes lead to a FEDOT fail) are removed automatically". This raises the question of whether it is worth keying on links specifically and putting all of them into the "columns to drop" category (by the way, doesn't the column indexing in supplementary data get out of sync after they are dropped?). I can easily imagine a case where a text column has only two unique values and both of them are links; after One-Hot Encoding the information in that column could still be useful. So it makes more sense to identify the properties of hyperlink columns that actually hinder ML algorithms and drop every column with those properties.

For example, if the problem is that the link is always the same, then it would be enough to simply drop every column whose cells contain an unchanging set of characters, regardless of what that content is. If the links are unique for each object, then it may make sense to drop every column whose text is unique and at the same time is not raw material for NLP algorithms (e.g. the cells contain no meaningful phrases or whitespace). Or is the problem specifically that the character sequence "http" appears in a cell?

I agree with the comments already left on this PR.

link_pattern = \
'[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'
return re.search(link_pattern, str(next(el for el in text_data if el is not None))) is not None
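
For illustration, the constant-column check suggested in the comment above could look roughly like this (a hypothetical helper, not part of the diff):

# Hypothetical sketch of the reviewer's alternative: drop columns whose cells
# never change, regardless of whether the repeated value is a link or plain text.
import pandas as pd


def column_is_constant(column: pd.Series) -> bool:
    non_nan = column.dropna()
    return len(non_nan) > 0 and non_nan.nunique() == 1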

@staticmethod
def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
""" Prepares MultiModal text data in a form of dictionary
@@ -70,16 +89,26 @@ def _column_contains_text(self, column: pd.Series) -> bool:
Column contains text if:
1. it's not float or float compatible
(e.g. ['1.2', '2.3', '3.4', ...] is float too)
2. fraction of unique values (except nans) is more than 0.95
2. fraction of unique values (except nans) is more than 0.6
3. size of tfidf vocabulary is more than 20

If the size of the tfidf vocabulary is less than 20, the column is probably
a text column too, but it cannot be vectorized and used in a model

:param column: pandas series with data
:return: True if column contains text
:return: True if the column contains text; False otherwise, including when the column contains links
"""
if column.dtype == object and not self._is_float_compatible(column):
unique_num = len(column.unique())
nan_num = pd.isna(column).sum()
return unique_num / len(column) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
else (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES
if self.is_link(column):
return False
elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
tfidf_vectorizer = TfidfVectorizer(**params)
try:
# TODO now grey zone columns (not text, not numerical) are not processed. Need to drop them
tfidf_vectorizer.fit(np.where(pd.isna(column), '', column))
return len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE
except ValueError:
A collaborator commented:

Can we move the unrelated actions out of the try block and do something like this:

# useful actions
try:
    # useful actions where errors are caught
except:
    # error handling
else:
    # useful actions

The PR author (Collaborator) replied:

Can we move the unrelated actions out of the try block and do something like this:

Like this, for example?

self.logger.warning(f"Column {column.name} possibly contains text, but it's impossible to vectorize it")
return False

@staticmethod
@@ -96,6 +125,22 @@ def _is_float_compatible(column: pd.Series) -> bool:
failed_ratio = failed_objects_number / non_nan_all_objects_number
return failed_ratio < 0.5

@staticmethod
def _has_unique_values(column: pd.Series) -> bool:
"""
:param column: pandas series with data
:return: True if the fraction of unique column values is greater than the threshold
"""
unique_num = len(column.unique())
nan_num = pd.isna(column).sum()
# fraction of unique values in the column when there are no nans
frac_unique_is_bigger_than_threshold = unique_num / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
# fraction of unique values in column if there are nans
frac_unique_is_bigger_than_threshold_with_nans = \
(unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
return frac_unique_is_bigger_than_threshold if nan_num == 0 \
else frac_unique_is_bigger_than_threshold_with_nans


class TimeSeriesDataDetector(DataDetector):
"""
@@ -114,10 +159,13 @@ def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict
multi_modal_ts_data = {}
for column_name in columns:
feature_ts = np.array(dataframe[column_name])
idx = list(dataframe['datetime'])

# Will be the same
multi_modal_ts_data.update({column_name: feature_ts})

multi_modal_ts_data['idx'] = np.asarray(idx)

return multi_modal_ts_data

@staticmethod
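A rough usage sketch of the detection logic above, assuming the `find_text_columns`/`find_link_columns` API introduced in this diff and an illustrative dataframe:

import pandas as pd

from fedot.core.data.data_detection import TextDataDetector

# Illustrative frame: a numeric column, a free-text column and a link column
df = pd.DataFrame({
    'age': [23, 31, 45],
    'review': ['the product arrived quickly and works well',
               'packaging was damaged but support replaced it quickly',
               'would not recommend it, stopped working after one week'],
    'homepage': ['http://example.com/a', 'http://example.com/b', 'http://example.com/c'],
})

detector = TextDataDetector()
link_columns = detector.find_link_columns(df)   # expected to contain 'homepage'
# On realistically sized data 'review' should be detected; a frame this small
# may fall below the MIN_VOCABULARY_SIZE threshold and be skipped.
text_columns = detector.find_text_columns(df)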
15 changes: 8 additions & 7 deletions fedot/core/data/multi_modal.py
@@ -59,10 +59,8 @@ def data_type(self):

@property
def num_classes(self) -> Optional[int]:
if self.task.task_type == TaskTypesEnum.classification:
return len(np.unique(self.target))
else:
return None
unique_values = self.class_labels
return len(unique_values) if unique_values is not None else None

@property
def class_labels(self) -> Optional[List[Union[int, str, float]]]:
@@ -168,16 +166,19 @@ def from_csv(cls,
text_columns = [text_columns] if isinstance(text_columns, str) else text_columns

if not text_columns:
text_columns = text_data_detector.define_text_columns(data_frame)
text_columns = text_data_detector.find_text_columns(data_frame)

link_columns = text_data_detector.find_link_columns(data_frame)
# TODO log drop of link columns
columns_to_drop = text_columns + link_columns
data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns)
data_frame_table = data_frame.drop(columns=text_columns)
data_frame_table = data_frame.drop(columns=columns_to_drop)
table_features, target = process_target_and_features(data_frame_table, target_columns)

data_part_transformation_func = partial(array_to_input_data,
idx=idx, target_array=target, task=task)

# create labels for text data sources and remove source if there are many nans
# create labels for text data sources and remove a source if there are many nans or the text consists of links
sources = dict((text_data_detector.new_key_name(data_part_key),
data_part_transformation_func(features_array=data_part, data_type=DataTypesEnum.text))
for (data_part_key, data_part) in data_text.items()
@@ -13,7 +13,7 @@

try:
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.models import KeyedVectors, Word2Vec
except ModuleNotFoundError:
warn_requirement('gensim')
api = None
@@ -50,18 +50,21 @@ def transform(self, input_data: InputData) -> OutputData:
return output_data

@staticmethod
def vectorize_avg(text: str, embeddings):
def vectorize_avg(text: np.array, embeddings) -> np.array:
""" Method converts text to an average of token vectors

:param text: str with text data
:param text: np.array with text data
:param embeddings: gensim pretrained embeddings
:return features: one-dimensional np.array with numbers
"""
def _arr2string(array: np.array) -> str:
return np.array2string(array).replace('[', '').replace(']', '').replace('"', '')

embedding_dim = embeddings.vectors.shape[1]
features = np.zeros([embedding_dim], dtype='float32')
num_words = 0

for word in text.split():
for word in _arr2string(text).split():
if word in embeddings:
features += embeddings[f'{word}']
num_words += 1
@@ -79,3 +82,18 @@ def _download_model_resources(self):
if os.path.exists(model_path):
self.logger.info('Embeddings are already downloaded. Loading model...')
self.model = KeyedVectors.load_word2vec_format(model_path, binary=False)


class TrainedEmbeddingsImplementation(DataOperationImplementation):

def __init__(self, **params: Optional[dict]):
self.params = params
self.logger = default_log(prefix='FEDOT logger')
super().__init__()

def fit(self, input_data: InputData):
pass

def transform(self, input_data: InputData) -> OutputData:
pass
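
The averaging idea behind `vectorize_avg` can be shown in isolation with a hand-built gensim `KeyedVectors` (a toy sketch using the gensim 4.x API, independent of FEDOT's pretrained-model download path):

import numpy as np
from gensim.models import KeyedVectors

# Toy three-dimensional "embeddings" for a handful of words
embeddings = KeyedVectors(vector_size=3)
embeddings.add_vectors(['good', 'fast', 'delivery'],
                       np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]))


def average_vector(text: str, vectors: KeyedVectors) -> np.ndarray:
    """Average the vectors of known tokens; unknown tokens are skipped."""
    found = [vectors[word] for word in text.split() if word in vectors]
    if not found:
        return np.zeros(vectors.vector_size, dtype='float32')
    return np.mean(found, axis=0)


print(average_vector('good fast unknown_word', embeddings))  # [0.5 0.5 0. ]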

4 changes: 2 additions & 2 deletions fedot/core/pipelines/tuning/search_space.py
@@ -285,8 +285,8 @@ def get_parameters_dict(self):
'glove-wiki-gigaword-100', 'word2vec-ruscorpora-300']])
},
'tfidf': {
'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3)]]),
'min_df': (hp.uniform, [0.0001, 0.1]),
'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3), (1, 4)]]),
'min_df': (hp.uniform, [0.0001, 0.01]),
'max_df': (hp.uniform, [0.9, 0.99])
},
}
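Outside FEDOT's tuner wrappers, the widened tfidf search space above corresponds to plain hyperopt roughly as follows (a standalone sketch with a dummy objective):

from hyperopt import fmin, hp, tpe

# Mirror of the updated 'tfidf' search space
tfidf_space = {
    'ngram_range': hp.choice('ngram_range', [(1, 1), (1, 2), (1, 3), (1, 4)]),
    'min_df': hp.uniform('min_df', 0.0001, 0.01),
    'max_df': hp.uniform('max_df', 0.9, 0.99),
}


def objective(params: dict) -> float:
    # A real objective would fit TfidfVectorizer(**params) plus a model and
    # return the validation loss; a dummy loss keeps the sketch self-contained.
    return params['min_df'] + (1 - params['max_df'])


best = fmin(objective, tfidf_space, algo=tpe.suggest, max_evals=10)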
5 changes: 3 additions & 2 deletions fedot/core/repository/data/default_operation_params.json
@@ -138,8 +138,9 @@
"model_name": "glove-twitter-25"
},
"tfidf": {
"min_df": 0.1,
"max_df": 0.9
"min_df": 0.01,
"max_df": 0.9,
"max_features": 100000
},
"fast_ica": {
"whiten": "unit-variance"
21 changes: 11 additions & 10 deletions fedot/preprocessing/data_types.py
@@ -5,6 +5,7 @@
import pandas as pd

from fedot.core.log import LoggerAdapter, default_log
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum


@@ -82,15 +83,15 @@ def convert_data_for_fit(self, data: InputData):
data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features,
target=data.target,
task=data.task)

self._into_numeric_features_transformation_for_fit(data)
# Launch conversion float and integer features into categorical
self._into_categorical_features_transformation_for_fit(data)
# Save info about features and target types
self.features_types = copy(data.supplementary_data.column_types['features'])
self.target_types = copy(data.supplementary_data.column_types['target'])

self._retain_columns_info_without_types_conflicts(data)
if data.data_type is DataTypesEnum.table:
# Launch conversion float and integer features into categorical
self._into_numeric_features_transformation_for_fit(data)
self._into_categorical_features_transformation_for_fit(data)
self._retain_columns_info_without_types_conflicts(data)
return data

def convert_data_for_predict(self, data: InputData):
@@ -103,11 +104,11 @@ def convert_data_for_predict(self, data: InputData):
data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features,
target=data.target,
task=data.task)

# Convert column types
self._into_numeric_features_transformation_for_predict(data)
self._into_categorical_features_transformation_for_predict(data)
self._retain_columns_info_without_types_conflicts(data)
if data.data_type is DataTypesEnum.table:
# Convert column types
self._into_numeric_features_transformation_for_predict(data)
self._into_categorical_features_transformation_for_predict(data)
self._retain_columns_info_without_types_conflicts(data)
return data

def remove_incorrect_features(self, table: np.array, converted_columns: dict):
4 changes: 2 additions & 2 deletions test/data/multimodal_data_with_complicated_types.csv
@@ -5,7 +5,7 @@
3,3,,4,4,3,"make a type specimen book. It has survived not only five centuries, but also",4, a ,True,,,0,2,yes
4,4,,5,5,0,"the leap into electronic typesetting, remaining essentially unchanged. It was",5, b ,,,,0,3,no
5,5,,6,6,0,popularised in the 1960s with the release of Letraset sheets containing Lorem ,6, c ,False,,,0,4,no
6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,sample text,sample text,1,5,no
6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,"Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.","Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.",1,5,no
7,7,inf,8,8,1,PageMaker including versions of Lorem Ipsum.,1, b ,,,4,0,6,no
8,inf,inf,9,9,2,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots ",2,,True,,,1,7,no
9,9,inf,10,10,2,"in a piece of classical Latin literature from 45 BC, making it over 2000 years ",3, c ,False,,,0,8,yes
@@ -16,4 +16,4 @@
14,14,,3,3,2,"1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good ",1,a,False,,,,error,no
15,15,,4,4,1,"and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of",2,a ,False,,,,13,no
16,16,2,5,12,0,"ethics, very popular during the Renaissance. The first line of Lorem Ipsum,",3, d ,True,,,1,16,yes
17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,another sample text,0,17,no
17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",0,17,no
2 changes: 1 addition & 1 deletion test/unit/composer/test_quality_metrics.py
@@ -91,7 +91,7 @@ def test_regression_quality_metric(data_setup):
for metric in RegressionMetricsEnum:
metric_function = MetricsRepository().metric_by_id(metric)
metric_value = metric_function(pipeline=pipeline, reference_data=train)
assert metric_value > 0
assert 0 < abs(metric_value) < sys.maxsize


def test_data_preparation_for_multi_target_correct(multi_target_data_setup):
4 changes: 2 additions & 2 deletions test/unit/data/test_multimodal_data.py
@@ -43,7 +43,7 @@ def test_multi_modal_data():
multi_modal.target = new_target
assert np.array_equal(multi_modal.target, new_target)


# TODO: add a test for text column autodetection
def test_multimodal_data_from_csv():
"""
Checking correctness of MultiModalData import from csv file.
@@ -54,7 +54,7 @@ def test_multimodal_data_from_csv():
text_data = np.array(df['description'])
table_data = np.array(df.drop(columns=['id', 'description', 'variety']))
target = np.array(df['variety']).reshape(-1, 1)
actual_data = MultiModalData.from_csv(path)
actual_data = MultiModalData.from_csv(path, text_columns=['description'])
actual_text_features = actual_data['data_source_text/description'].features
actual_table_features = actual_data['data_source_table'].features
actual_target = actual_data.target
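A possible shape for the TODO'd autodetection test, assuming the enlarged multimodal fixture above is rich enough for the detector (paths and helper names follow the existing FEDOT tests and are assumptions here):

from pathlib import Path

from fedot.core.data.multi_modal import MultiModalData
from fedot.core.utils import fedot_project_root


def test_multimodal_data_text_columns_autodetection():
    """Text columns should be found automatically when text_columns is not passed."""
    path = Path(fedot_project_root(), 'test/data/multimodal_data_with_complicated_types.csv')
    data = MultiModalData.from_csv(path)

    # at least one text data source is expected, and link-like columns should be dropped
    text_sources = [key for key in data if key.startswith('data_source_text')]
    assert len(text_sources) >= 1
    assert 'data_source_table' in data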