Improvement of automatic text detection #903

Open

wants to merge 10 commits into base: master
1 change: 1 addition & 0 deletions fedot/core/composer/metrics.py
@@ -214,6 +214,7 @@ class R2(QualityMetric):
default_value = 0

@staticmethod
@from_maximised_metric
def metric(reference: InputData, predicted: OutputData) -> float:
return r2_score(y_true=reference.target, y_pred=predicted.predict)

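For context, `from_maximised_metric` is presumably the decorator FEDOT uses to turn a "higher is better" score into one the composer can minimise; the sketch below illustrates that assumption with a decorator of the same shape, not FEDOT's actual implementation.

# Illustrative sketch only: negate a maximised metric so a minimising
# optimiser can consume it. Names here are assumptions, not FEDOT's code.
from functools import wraps

from sklearn.metrics import r2_score


def negate_maximised_metric(metric):
    @wraps(metric)
    def wrapper(*args, **kwargs):
        return -metric(*args, **kwargs)
    return wrapper


@negate_maximised_metric
def r2_metric(y_true, y_pred) -> float:
    return r2_score(y_true=y_true, y_pred=y_pred)


print(r2_metric([1, 2, 3], [1, 2, 3]))  # -1.0: a perfect fit becomes the minimum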
4 changes: 3 additions & 1 deletion fedot/core/constants.py
@@ -14,7 +14,9 @@

MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100

FRACTION_OF_UNIQUE_VALUES = 0.95
# constants for text processing
FRACTION_OF_UNIQUE_VALUES_IN_TEXT = 0.6
MIN_VOCABULARY_SIZE = 20

default_data_split_ratio_by_task = {
TaskTypesEnum.classification: 0.8,
2 changes: 1 addition & 1 deletion fedot/core/data/data.py
@@ -334,7 +334,7 @@ def num_classes(self) -> Optional[int]:
return len(unique_values) if unique_values is not None else None

@property
def class_labels(self) -> Optional[int]:
def class_labels(self) -> Optional[List[Union[int, str, float]]]:
"""Returns unique class labels that are present in the target"""
if self.task.task_type == TaskTypesEnum.classification and self.target is not None:
return np.unique(self.target)
74 changes: 61 additions & 13 deletions fedot/core/data/data_detection.py
@@ -1,10 +1,14 @@
from abc import abstractmethod
from typing import List

import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES
from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES_IN_TEXT, MIN_VOCABULARY_SIZE
from fedot.core.log import default_log
from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository

ALLOWED_NAN_PERCENT = 0.9

@@ -26,24 +30,39 @@ class TextDataDetector(DataDetector):
"""
Class for detecting text data during its import.
"""
def __init__(self):
self.logger = default_log(prefix='FEDOT logger')
super().__init__()

def define_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
def find_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
"""
:param data_frame: pandas dataframe with data
:return: list of text columns' names
"""
text_columns = []
for column_name in data_frame.columns:
if self._column_contains_text(data_frame[column_name]):
text_columns.append(column_name)
text_columns = [column_name for column_name in data_frame.columns
if self._column_contains_text(data_frame[column_name])]
return text_columns

def find_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
"""
:param data_frame: pandas dataframe with data
:return: list of link columns' names
"""
link_columns = [column_name for column_name in data_frame.columns if self.is_link(data_frame[column_name])]
return link_columns

@staticmethod
def is_full_of_nans(text_data: np.array) -> bool:
if np.sum(pd.isna(text_data)) / len(text_data) > ALLOWED_NAN_PERCENT:
return True
return False

@staticmethod
def is_link(text_data: np.array) -> bool:
@Dreamlone (Collaborator) commented on Oct 1, 2022:

First, tests covering this functionality should be written, but as I understand from the PR description, they are already in progress.

Second, the PR description says that "Columns with links (they don't contain useful information and sometimes lead to a FEDOT fail) are removed automatically". This raises the question of whether it is worth keying on links specifically and putting all of them into the "columns to drop" category (by the way, doesn't the column indexing in supplementary data get out of sync after they are dropped?). I can easily imagine a case where a text column has only two unique values and both of them are links; after One-Hot Encoding the information in that column could still be useful. So it makes more sense to identify the properties of hyperlink columns that actually hinder ML algorithms and drop every column with those properties.

For example, if the problem is that the link is always the same, then it would be enough to simply drop every column whose cells contain an unchanging set of characters, regardless of what that content is. If the links are unique for each object, then it may make sense to drop every column whose text is unique and at the same time is not raw material for NLP algorithms (e.g. the cells contain no meaningful phrases or whitespace). Or is the problem specifically that the character sequence "http" appears in a cell?

I agree with the comments already left on this PR.

link_pattern = \
'[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'
return re.search(link_pattern, str(next(el for el in text_data if el is not None))) is not None
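
For illustration, the constant-column check suggested in the comment above could look roughly like this (a hypothetical helper, not part of the diff):

# Hypothetical sketch of the reviewer's alternative: drop columns whose cells
# never change, regardless of whether the repeated value is a link or plain text.
import pandas as pd


def column_is_constant(column: pd.Series) -> bool:
    non_nan = column.dropna()
    return len(non_nan) > 0 and non_nan.nunique() == 1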

@staticmethod
def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
""" Prepares MultiModal text data in a form of dictionary
@@ -70,16 +89,26 @@ def _column_contains_text(self, column: pd.Series) -> bool:
Column contains text if:
1. it's not float or float compatible
(e.g. ['1.2', '2.3', '3.4', ...] is float too)
2. fraction of unique values (except nans) is more than 0.95
2. fraction of unique values (except nans) is more than 0.6
3. size of tfidf vocabulary is more than 20

If the size of the tfidf vocabulary is less than 20, the column is probably
a text column too, but it cannot be vectorized and used in a model

:param column: pandas series with data
:return: True if column contains text
:return: True if the column contains text; False otherwise, including when the column contains links
"""
if column.dtype == object and not self._is_float_compatible(column):
unique_num = len(column.unique())
nan_num = pd.isna(column).sum()
return unique_num / len(column) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
else (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES
if self.is_link(column):
return False
elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
tfidf_vectorizer = TfidfVectorizer(**params)
try:
# TODO now grey zone columns (not text, not numerical) are not processed. Need to drop them
tfidf_vectorizer.fit(np.where(pd.isna(column), '', column))
return len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE
except ValueError:
A collaborator commented:

Can we move the unrelated actions out of the try block and do something like this:

# useful actions
try:
    # useful actions where errors are caught
except:
    # error handling
else:
    # useful actions

The PR author (Collaborator) replied:

Can we move the unrelated actions out of the try block and do something like this:

Like this, for example?

self.logger.warning(f"Column {column.name} possibly contains text, but it's impossible to vectorize it")
return False

@staticmethod
@@ -96,6 +125,22 @@ def _is_float_compatible(column: pd.Series) -> bool:
failed_ratio = failed_objects_number / non_nan_all_objects_number
return failed_ratio < 0.5

@staticmethod
def _has_unique_values(column: pd.Series) -> bool:
"""
:param column: pandas series with data
:return: True if the fraction of unique column values is greater than the threshold
"""
unique_num = len(column.unique())
nan_num = pd.isna(column).sum()
# fraction of unique values in the column when there are no nans
frac_unique_is_bigger_than_threshold = unique_num / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
# fraction of unique values in column if there are nans
frac_unique_is_bigger_than_threshold_with_nans = \
(unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
return frac_unique_is_bigger_than_threshold if nan_num == 0 \
else frac_unique_is_bigger_than_threshold_with_nans


class TimeSeriesDataDetector(DataDetector):
"""
@@ -114,10 +159,13 @@ def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict
multi_modal_ts_data = {}
for column_name in columns:
feature_ts = np.array(dataframe[column_name])
idx = list(dataframe['datetime'])

# Will be the same
multi_modal_ts_data.update({column_name: feature_ts})

multi_modal_ts_data['idx'] = np.asarray(idx)

return multi_modal_ts_data

@staticmethod
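A rough usage sketch of the detection logic above, assuming the `find_text_columns`/`find_link_columns` API introduced in this diff and an illustrative dataframe:

import pandas as pd

from fedot.core.data.data_detection import TextDataDetector

# Illustrative frame: a numeric column, a free-text column and a link column
df = pd.DataFrame({
    'age': [23, 31, 45],
    'review': ['the product arrived quickly and works well',
               'packaging was damaged but support replaced it quickly',
               'would not recommend it, stopped working after one week'],
    'homepage': ['http://example.com/a', 'http://example.com/b', 'http://example.com/c'],
})

detector = TextDataDetector()
link_columns = detector.find_link_columns(df)   # expected to contain 'homepage'
# On realistically sized data 'review' should be detected; a frame this small
# may fall below the MIN_VOCABULARY_SIZE threshold and be skipped.
text_columns = detector.find_text_columns(df)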
15 changes: 8 additions & 7 deletions fedot/core/data/multi_modal.py
@@ -59,10 +59,8 @@ def data_type(self):

@property
def num_classes(self) -> Optional[int]:
if self.task.task_type == TaskTypesEnum.classification:
return len(np.unique(self.target))
else:
return None
unique_values = self.class_labels
return len(unique_values) if unique_values is not None else None

@property
def class_labels(self) -> Optional[List[Union[int, str, float]]]:
@@ -168,16 +166,19 @@ def from_csv(cls,
text_columns = [text_columns] if isinstance(text_columns, str) else text_columns

if not text_columns:
text_columns = text_data_detector.define_text_columns(data_frame)
text_columns = text_data_detector.find_text_columns(data_frame)

link_columns = text_data_detector.find_link_columns(data_frame)
# TODO log drop of link columns
columns_to_drop = text_columns + link_columns
data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns)
data_frame_table = data_frame.drop(columns=text_columns)
data_frame_table = data_frame.drop(columns=columns_to_drop)
table_features, target = process_target_and_features(data_frame_table, target_columns)

data_part_transformation_func = partial(array_to_input_data,
idx=idx, target_array=target, task=task)

# create labels for text data sources and remove source if there are many nans
# create labels for text data sources and remove a source if there are many nans or the text consists of links
sources = dict((text_data_detector.new_key_name(data_part_key),
data_part_transformation_func(features_array=data_part, data_type=DataTypesEnum.text))
for (data_part_key, data_part) in data_text.items()
@@ -13,7 +13,7 @@

try:
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.models import KeyedVectors, Word2Vec
except ModuleNotFoundError:
warn_requirement('gensim')
api = None
@@ -50,18 +50,21 @@ def transform(self, input_data: InputData) -> OutputData:
return output_data

@staticmethod
def vectorize_avg(text: str, embeddings):
def vectorize_avg(text: np.array, embeddings) -> np.array:
""" Method converts text to an average of token vectors

:param text: str with text data
:param text: np.array with text data
:param embeddings: gensim pretrained embeddings
:return features: one-dimensional np.array with numbers
"""
def _arr2string(array: np.array) -> str:
return np.array2string(array).replace('[', '').replace(']', '').replace('"', '')

embedding_dim = embeddings.vectors.shape[1]
features = np.zeros([embedding_dim], dtype='float32')
num_words = 0

for word in text.split():
for word in _arr2string(text).split():
if word in embeddings:
features += embeddings[f'{word}']
num_words += 1
@@ -79,3 +82,18 @@ def _download_model_resources(self):
if os.path.exists(model_path):
self.logger.info('Embeddings are already downloaded. Loading model...')
self.model = KeyedVectors.load_word2vec_format(model_path, binary=False)


class TrainedEmbeddingsImplementation(DataOperationImplementation):

def __init__(self, **params: Optional[dict]):
self.params = params
self.logger = default_log(prefix='FEDOT logger')
super().__init__()

def fit(self, input_data: InputData):
pass

def transform(self, input_data: InputData) -> OutputData:
pass
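
The averaging idea behind `vectorize_avg` can be shown in isolation with a hand-built gensim `KeyedVectors` (a toy sketch using the gensim 4.x API, independent of FEDOT's pretrained-model download path):

import numpy as np
from gensim.models import KeyedVectors

# Toy three-dimensional "embeddings" for a handful of words
embeddings = KeyedVectors(vector_size=3)
embeddings.add_vectors(['good', 'fast', 'delivery'],
                       np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]))


def average_vector(text: str, vectors: KeyedVectors) -> np.ndarray:
    """Average the vectors of known tokens; unknown tokens are skipped."""
    found = [vectors[word] for word in text.split() if word in vectors]
    if not found:
        return np.zeros(vectors.vector_size, dtype='float32')
    return np.mean(found, axis=0)


print(average_vector('good fast unknown_word', embeddings))  # [0.5 0.5 0. ]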

4 changes: 2 additions & 2 deletions fedot/core/pipelines/tuning/search_space.py
@@ -285,8 +285,8 @@ def get_parameters_dict(self):
'glove-wiki-gigaword-100', 'word2vec-ruscorpora-300']])
},
'tfidf': {
'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3)]]),
'min_df': (hp.uniform, [0.0001, 0.1]),
'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3), (1, 4)]]),
'min_df': (hp.uniform, [0.0001, 0.01]),
'max_df': (hp.uniform, [0.9, 0.99])
},
}
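Outside FEDOT's tuner wrappers, the widened tfidf search space above corresponds to plain hyperopt roughly as follows (a standalone sketch with a dummy objective):

from hyperopt import fmin, hp, tpe

# Mirror of the updated 'tfidf' search space
tfidf_space = {
    'ngram_range': hp.choice('ngram_range', [(1, 1), (1, 2), (1, 3), (1, 4)]),
    'min_df': hp.uniform('min_df', 0.0001, 0.01),
    'max_df': hp.uniform('max_df', 0.9, 0.99),
}


def objective(params: dict) -> float:
    # A real objective would fit TfidfVectorizer(**params) plus a model and
    # return the validation loss; a dummy loss keeps the sketch self-contained.
    return params['min_df'] + (1 - params['max_df'])


best = fmin(objective, tfidf_space, algo=tpe.suggest, max_evals=10)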
5 changes: 3 additions & 2 deletions fedot/core/repository/data/default_operation_params.json
@@ -138,8 +138,9 @@
"model_name": "glove-twitter-25"
},
"tfidf": {
"min_df": 0.1,
"max_df": 0.9
"min_df": 0.01,
"max_df": 0.9,
"max_features": 100000
},
"fast_ica": {
"whiten": "unit-variance"
21 changes: 11 additions & 10 deletions fedot/preprocessing/data_types.py
@@ -5,6 +5,7 @@
import pandas as pd

from fedot.core.log import LoggerAdapter, default_log
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum


@@ -82,15 +83,15 @@ def convert_data_for_fit(self, data: InputData):
data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features,
target=data.target,
task=data.task)

self._into_numeric_features_transformation_for_fit(data)
# Launch conversion float and integer features into categorical
self._into_categorical_features_transformation_for_fit(data)
# Save info about features and target types
self.features_types = copy(data.supplementary_data.column_types['features'])
self.target_types = copy(data.supplementary_data.column_types['target'])

self._retain_columns_info_without_types_conflicts(data)
if data.data_type is DataTypesEnum.table:
# Launch conversion float and integer features into categorical
self._into_numeric_features_transformation_for_fit(data)
self._into_categorical_features_transformation_for_fit(data)
self._retain_columns_info_without_types_conflicts(data)
return data

def convert_data_for_predict(self, data: InputData):
@@ -103,11 +104,11 @@ def convert_data_for_predict(self, data: InputData):
data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features,
target=data.target,
task=data.task)

# Convert column types
self._into_numeric_features_transformation_for_predict(data)
self._into_categorical_features_transformation_for_predict(data)
self._retain_columns_info_without_types_conflicts(data)
if data.data_type is DataTypesEnum.table:
# Convert column types
self._into_numeric_features_transformation_for_predict(data)
self._into_categorical_features_transformation_for_predict(data)
self._retain_columns_info_without_types_conflicts(data)
return data

def remove_incorrect_features(self, table: np.array, converted_columns: dict):
4 changes: 2 additions & 2 deletions test/data/multimodal_data_with_complicated_types.csv
@@ -5,7 +5,7 @@
3,3,,4,4,3,"make a type specimen book. It has survived not only five centuries, but also",4, a ,True,,,0,2,yes
4,4,,5,5,0,"the leap into electronic typesetting, remaining essentially unchanged. It was",5, b ,,,,0,3,no
5,5,,6,6,0,popularised in the 1960s with the release of Letraset sheets containing Lorem ,6, c ,False,,,0,4,no
6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,sample text,sample text,1,5,no
6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,"Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.","Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.",1,5,no
7,7,inf,8,8,1,PageMaker including versions of Lorem Ipsum.,1, b ,,,4,0,6,no
8,inf,inf,9,9,2,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots ",2,,True,,,1,7,no
9,9,inf,10,10,2,"in a piece of classical Latin literature from 45 BC, making it over 2000 years ",3, c ,False,,,0,8,yes
@@ -16,4 +16,4 @@
14,14,,3,3,2,"1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good ",1,a,False,,,,error,no
15,15,,4,4,1,"and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of",2,a ,False,,,,13,no
16,16,2,5,12,0,"ethics, very popular during the Renaissance. The first line of Lorem Ipsum,",3, d ,True,,,1,16,yes
17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,another sample text,0,17,no
17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",0,17,no
2 changes: 1 addition & 1 deletion test/unit/composer/test_quality_metrics.py
@@ -91,7 +91,7 @@ def test_regression_quality_metric(data_setup):
for metric in RegressionMetricsEnum:
metric_function = MetricsRepository().metric_by_id(metric)
metric_value = metric_function(pipeline=pipeline, reference_data=train)
assert metric_value > 0
assert 0 < abs(metric_value) < sys.maxsize


def test_data_preparation_for_multi_target_correct(multi_target_data_setup):
4 changes: 2 additions & 2 deletions test/unit/data/test_multimodal_data.py
@@ -43,7 +43,7 @@ def test_multi_modal_data():
multi_modal.target = new_target
assert np.array_equal(multi_modal.target, new_target)


# TODO: add a test for text column autodetection
def test_multimodal_data_from_csv():
"""
Checking correctness of MultiModalData import from csv file.
@@ -54,7 +54,7 @@ def test_multimodal_data_from_csv():
text_data = np.array(df['description'])
table_data = np.array(df.drop(columns=['id', 'description', 'variety']))
target = np.array(df['variety']).reshape(-1, 1)
actual_data = MultiModalData.from_csv(path)
actual_data = MultiModalData.from_csv(path, text_columns=['description'])
actual_text_features = actual_data['data_source_text/description'].features
actual_table_features = actual_data['data_source_table'].features
actual_target = actual_data.target
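A possible shape for the TODO'd autodetection test, assuming the enlarged multimodal fixture above is rich enough for the detector (paths and helper names follow the existing FEDOT tests and are assumptions here):

from pathlib import Path

from fedot.core.data.multi_modal import MultiModalData
from fedot.core.utils import fedot_project_root


def test_multimodal_data_text_columns_autodetection():
    """Text columns should be found automatically when text_columns is not passed."""
    path = Path(fedot_project_root(), 'test/data/multimodal_data_with_complicated_types.csv')
    data = MultiModalData.from_csv(path)

    # at least one text data source is expected, and link-like columns should be dropped
    text_sources = [key for key in data if key.startswith('data_source_text')]
    assert len(text_sources) >= 1
    assert 'data_source_table' in data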