diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index 61d746d4f..ec47578c9 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -80,7 +80,7 @@ By "extraction", we typically mean the process of converting a data format from
 * ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
 * ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
 
-You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
+ You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
 
 .. code-block:: python
 
@@ -133,13 +133,33 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
 Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
 
 **Note:** The JusTextExtractor, ResiliparseExtractor, and TrafilaturaExtractor classes each have their own unique parameters which are specific to their extraction algorithms. Please see the docstrings for each class for more details.
 
+You can set your own dictionary of stop words by language to be used when extracting text:
+
+.. code-block:: python
+
+    from nemo_curator.download import download_common_crawl
+
+    # Change the default stop list used
+    stop_lists = {"ENGLISH": frozenset(["the", "and", "is", "in", "for", "where", "when", "to", "at"])}
+
+    common_crawl = download_common_crawl(
+        "/extracted/output/folder",
+        "2020-50",
+        "2021-04",
+        output_type="jsonl",
+        stop_lists=stop_lists,
+    )
+
+This may be desirable to further customize your text extraction pipeline, or to enable text extraction support for languages not included by jusText and NeMo Curator.
+
 The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
 
 NeMo Curator's Common Crawl extraction process looks like this under the hood:
 
- 1. Decode the HTML within the record from binary to text.
- 2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
- 3. Finally, the extract the relevant text with `jusText `_, `Resiliparse `_, or `Trafilatura `_ from the HTML and write it out as a single string within the 'text' field of a json entry within a `.jsonl` file.
+1. Decode the HTML within the record from binary to text.
+2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
+3. Finally, extract the relevant text with `jusText `_, `Resiliparse `_, or `Trafilatura `_ from the HTML and write it out as a single string within the "text" field of a JSON entry within a ``.jsonl`` file.
+
 * ``download_wikipedia`` will download and extract the latest wikipedia dump. Files are downloaded using ``wget``. Wikipedia might download slower than the other datasets. This is because they limit the number of downloads that can occur per-ip address.
 
 ..
code-block:: python diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py index badc9840e..f23ad3a79 100644 --- a/nemo_curator/download/commoncrawl.py +++ b/nemo_curator/download/commoncrawl.py @@ -16,6 +16,7 @@ import os import subprocess import unicodedata +import warnings from abc import ABC, abstractmethod from copy import deepcopy from typing import Literal, Optional @@ -40,6 +41,8 @@ from nemo_curator.utils.download_utils import get_common_crawl_urls from nemo_curator.utils.file_utils import expand_outdir_and_mkdir +NON_SPACED_LANGUAGES = ["THAI", "CHINESE", "JAPANESE", "KOREAN"] + def decode_html(html_bytes): # Convert from bytes to text using utf-8 encoding @@ -76,7 +79,7 @@ def lang_detect(decoded_html): class HTMLExtractorAlgorithm(ABC): @abstractmethod - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): pass @@ -90,6 +93,7 @@ def __init__( max_link_density=0.2, max_heading_distance=200, no_headings=False, + is_boilerplate=None, logger=None, ): """ @@ -123,6 +127,9 @@ def __init__( max_link_density: Maximum allowed link density in the text. max_heading_distance: Maximum distance from a heading to consider text for extraction. no_headings: If True, text extraction will ignore headings. + is_boilerplate: If True, text extraction will ignore boilerplate content. + Default is True for space-separated languages and False for non-space-separated languages + (Thai, Chinese, Japanese, and Korean). logger: Optional logger instance for logging messages. """ @@ -133,9 +140,10 @@ def __init__( self.max_link_density = max_link_density self.max_heading_distance = max_heading_distance self.no_headings = no_headings + self.is_boilerplate = is_boilerplate self.logger = logger - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): # Segment the HTML into paragraphs try: # Form the DOM tree @@ -149,6 +157,7 @@ def extract_text(self, html, stop_words): if self.logger is not None: self.logger.info("Could not segment paragaphs in the document") return + paragraphs = handler.paragraphs # Context free classification @@ -175,7 +184,21 @@ def extract_text(self, html, stop_words): self.max_heading_distance, ) - return [p.text for p in paragraphs if not p.is_boilerplate] + if self.is_boilerplate is None: + if language in NON_SPACED_LANGUAGES: + warnings.warn("Disabling is_boilerplate check for jusText extraction.") + is_boilerplate = False + else: + is_boilerplate = True + + else: + is_boilerplate = self.is_boilerplate + + if is_boilerplate: + return [p.text for p in paragraphs if not p.is_boilerplate] + + else: + return [p.text for p in paragraphs] class ResiliparseExtractor(HTMLExtractorAlgorithm): @@ -212,26 +235,34 @@ def __init__( self.main_content = main_content self.alt_texts = alt_texts - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): text = extract_plain_text( html, main_content=self.main_content, alt_texts=self.alt_texts ) paragraphs = list(filter(None, text.split("\n"))) - result = [] - for paragraph in paragraphs: - words = paragraph.split() - length = len(words) - if length == 0: - continue - stopwords = [word for word in words if word in stop_words] - stopword_density = len(stopwords) / length - if stopword_density >= self.required_stopword_density: - result.append(paragraph) + if language in NON_SPACED_LANGUAGES: + warnings.warn( + "stopword_density is ignored for non-space-separated languages." 
+ ) + result = paragraphs + else: + result = [] + + for paragraph in paragraphs: + words = paragraph.split() + length = len(words) + + if length == 0: + continue + + stopwords = [word for word in words if word in stop_words] + stopword_density = len(stopwords) / length + + if stopword_density >= self.required_stopword_density: + result.append(paragraph) - if len(result) == 0: - return None return result @@ -300,7 +331,7 @@ def __init__( self.max_repetitions = max_repetitions self.extract_kwargs = extract_kwargs - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): trafilatura_config = deepcopy(TRAFILATURA_DEFAULT_CONFIG) trafilatura_config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = str( self.min_extracted_size @@ -328,17 +359,29 @@ def extract_text(self, html, stop_words): if text is not None: paragraphs = list(filter(None, text.split("\n"))) - result = [] - for paragraph in paragraphs: - words = paragraph.split() - length = len(words) - if length == 0: - continue - stopwords = [word for word in words if word in stop_words] - stopword_density = len(stopwords) / length - if stopword_density >= self.required_stopword_density: - result.append(paragraph) + if language in NON_SPACED_LANGUAGES: + warnings.warn( + "stopword_density is ignored for non-space-separated languages." + ) + result = paragraphs + + else: + result = [] + + for paragraph in paragraphs: + words = paragraph.split() + length = len(words) + + if length == 0: + continue + + stopwords = [word for word in words if word in stop_words] + stopword_density = len(stopwords) / length + + if stopword_density >= self.required_stopword_density: + result.append(paragraph) + else: return None @@ -357,12 +400,35 @@ def get_stop_list_dict(languages=[]): "Norwegian_Nynorsk": "NORWEGIAN_N", "Waray_Waray": "WARAY_PHILIPPINES", } + + # List obtained from https://github.com/stopwords-iso/stopwords-ja + from .ja_stopwords import ja_stopwords + + # List obtained from https://github.com/stopwords-iso/stopwords-th + from .th_stopwords import th_stopwords + + # List obtained from https://github.com/stopwords-iso/stopwords-zh + from .zh_stopwords import zh_stopwords + + custom_stopwords = { + "THAI": th_stopwords, + "CHINESE": zh_stopwords, + "JAPANESE": ja_stopwords, + } + if len(languages) == 0: languages = justext.get_stoplists() - # Remove latin as it yields a lot of low quality documents - languages_no_latin = list(languages) - languages_no_latin.remove("Latin") - languages = frozenset(languages_no_latin) + + # Remove Latin as it yields a lot of low quality documents + languages = list(languages) + languages.remove("Latin") + + # Manually add Thai, Chinese, and Japanese + languages.append("THAI") + languages.append("CHINESE") + languages.append("JAPANESE") + + languages = frozenset(languages) stop_list_dict = {} for language in languages: @@ -370,12 +436,11 @@ def get_stop_list_dict(languages=[]): lang_key = lang_map[language] else: lang_key = language.upper() - stop_list_dict[lang_key] = justext.get_stoplist(language) - - # List obtained from https://github.com/stopwords-iso/stopwords-th - from .thai_stopwords import thai_stopwords - stop_list_dict["THAI"] = thai_stopwords + if lang_key in custom_stopwords: + stop_list_dict[lang_key] = custom_stopwords[lang_key] + else: + stop_list_dict[lang_key] = justext.get_stoplist(language) return stop_list_dict @@ -484,8 +549,12 @@ def iterate(self, file_path): class CommonCrawlWARCExtractor(DocumentExtractor): - def __init__(self, algorithm=JusTextExtractor()): - 
self._stop_lists = get_stop_list_dict() + def __init__(self, algorithm=JusTextExtractor(), stop_lists=None): + if stop_lists is not None: + self._stop_lists = stop_lists + else: + self._stop_lists = get_stop_list_dict() + self.algorithm = algorithm super().__init__() @@ -496,7 +565,7 @@ def extract(self, content): lang = lang_detect(html) text = None if lang in self._stop_lists: - text = self.algorithm.extract_text(html, self._stop_lists[lang]) + text = self.algorithm.extract_text(html, self._stop_lists[lang], lang) if text is not None: if len(text) > 0: text = "\n\n".join(text) @@ -512,6 +581,7 @@ def download_common_crawl( end_snapshot: str, output_type: Literal["jsonl", "parquet"] = "jsonl", algorithm=JusTextExtractor(), + stop_lists=None, news: bool = False, aws: bool = False, raw_download_dir: Optional[str] = None, @@ -536,6 +606,10 @@ def download_common_crawl( • This is not used for the output file, but is used to check if an extracted output already exists. algorithm: The text extraction algorithm instance to use for HTML processing. • This can be a JusTextExtractor (default), ResiliparseExtractor, or TrafilaturaExtractor object. + stop_lists: A dictionary stop lists, where the keys are languages (e.g., "ENGLISH") + and the values are Python frozensets denoting the list of stop words for that language. + If None, it defaults to jusText's stop lists: https://github.com/miso-belica/jusText/tree/main/justext/stoplists, + with added Thai, Chinese, and Japanese support. news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset. • This also means snapshot identifiers should follow the 'YYYY-MM' format. aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd; @@ -577,7 +651,7 @@ def download_common_crawl( expand_outdir_and_mkdir(raw_download_dir) downloader = CommonCrawlWARCDownloader(raw_download_dir, aws=aws) iterator = CommonCrawlWARCIterator() - extractor = CommonCrawlWARCExtractor(algorithm=algorithm) + extractor = CommonCrawlWARCExtractor(algorithm=algorithm, stop_lists=stop_lists) output_format = { "text": str, diff --git a/nemo_curator/download/ja_stopwords.py b/nemo_curator/download/ja_stopwords.py new file mode 100644 index 000000000..d5190d6ac --- /dev/null +++ b/nemo_curator/download/ja_stopwords.py @@ -0,0 +1,138 @@ +ja_stopwords = frozenset( + [ + "あそこ", + "あっ", + "あの", + "あのかた", + "あの人", + "あり", + "あります", + "ある", + "あれ", + "い", + "いう", + "います", + "いる", + "う", + "うち", + "え", + "お", + "および", + "おり", + "おります", + "か", + "かつて", + "から", + "が", + "き", + "ここ", + "こちら", + "こと", + "この", + "これ", + "これら", + "さ", + "さらに", + "し", + "しかし", + "する", + "ず", + "せ", + "せる", + "そこ", + "そして", + "その", + "その他", + "その後", + "それ", + "それぞれ", + "それで", + "た", + "ただし", + "たち", + "ため", + "たり", + "だ", + "だっ", + "だれ", + "つ", + "て", + "で", + "でき", + "できる", + "です", + "では", + "でも", + "と", + "という", + "といった", + "とき", + "ところ", + "として", + "とともに", + "とも", + "と共に", + "どこ", + "どの", + "な", + "ない", + "なお", + "なかっ", + "ながら", + "なく", + "なっ", + "など", + "なに", + "なら", + "なり", + "なる", + "なん", + "に", + "において", + "における", + "について", + "にて", + "によって", + "により", + "による", + "に対して", + "に対する", + "に関する", + "の", + "ので", + "のみ", + "は", + "ば", + "へ", + "ほか", + "ほとんど", + "ほど", + "ます", + "また", + "または", + "まで", + "も", + "もの", + "ものの", + "や", + "よう", + "より", + "ら", + "られ", + "られる", + "れ", + "れる", + "を", + "ん", + "何", + "及び", + "彼", + "彼女", + "我々", + "特に", + "私", + "私達", + "貴方", + "貴方方", + ] +) diff --git a/nemo_curator/download/thai_stopwords.py 
b/nemo_curator/download/th_stopwords.py similarity index 98% rename from nemo_curator/download/thai_stopwords.py rename to nemo_curator/download/th_stopwords.py index 0ef24737b..1680d1191 100644 --- a/nemo_curator/download/thai_stopwords.py +++ b/nemo_curator/download/th_stopwords.py @@ -1,4 +1,4 @@ -thai_stopwords = frozenset( +th_stopwords = frozenset( [ "กล่าว", "กว่า", diff --git a/nemo_curator/download/zh_stopwords.py b/nemo_curator/download/zh_stopwords.py new file mode 100644 index 000000000..05de06820 --- /dev/null +++ b/nemo_curator/download/zh_stopwords.py @@ -0,0 +1,798 @@ +zh_stopwords = frozenset( + [ + "、", + "。", + "〈", + "〉", + "《", + "》", + "一", + "一个", + "一些", + "一何", + "一切", + "一则", + "一方面", + "一旦", + "一来", + "一样", + "一种", + "一般", + "一转眼", + "七", + "万一", + "三", + "上", + "上下", + "下", + "不", + "不仅", + "不但", + "不光", + "不单", + "不只", + "不外乎", + "不如", + "不妨", + "不尽", + "不尽然", + "不得", + "不怕", + "不惟", + "不成", + "不拘", + "不料", + "不是", + "不比", + "不然", + "不特", + "不独", + "不管", + "不至于", + "不若", + "不论", + "不过", + "不问", + "与", + "与其", + "与其说", + "与否", + "与此同时", + "且", + "且不说", + "且说", + "两者", + "个", + "个别", + "中", + "临", + "为", + "为了", + "为什么", + "为何", + "为止", + "为此", + "为着", + "乃", + "乃至", + "乃至于", + "么", + "之", + "之一", + "之所以", + "之类", + "乌乎", + "乎", + "乘", + "九", + "也", + "也好", + "也罢", + "了", + "二", + "二来", + "于", + "于是", + "于是乎", + "云云", + "云尔", + "五", + "些", + "亦", + "人", + "人们", + "人家", + "什", + "什么", + "什么样", + "今", + "介于", + "仍", + "仍旧", + "从", + "从此", + "从而", + "他", + "他人", + "他们", + "他们们", + "以", + "以上", + "以为", + "以便", + "以免", + "以及", + "以故", + "以期", + "以来", + "以至", + "以至于", + "以致", + "们", + "任", + "任何", + "任凭", + "会", + "似的", + "但", + "但凡", + "但是", + "何", + "何以", + "何况", + "何处", + "何时", + "余外", + "作为", + "你", + "你们", + "使", + "使得", + "例如", + "依", + "依据", + "依照", + "便于", + "俺", + "俺们", + "倘", + "倘使", + "倘或", + "倘然", + "倘若", + "借", + "借傥然", + "假使", + "假如", + "假若", + "做", + "像", + "儿", + "先不先", + "光", + "光是", + "全体", + "全部", + "八", + "六", + "兮", + "共", + "关于", + "关于具体地说", + "其", + "其一", + "其中", + "其二", + "其他", + "其余", + "其它", + "其次", + "具体地说", + "具体说来", + "兼之", + "内", + "再", + "再其次", + "再则", + "再有", + "再者", + "再者说", + "再说", + "冒", + "冲", + "况且", + "几", + "几时", + "凡", + "凡是", + "凭", + "凭借", + "出于", + "出来", + "分", + "分别", + "则", + "则甚", + "别", + "别人", + "别处", + "别是", + "别的", + "别管", + "别说", + "到", + "前后", + "前此", + "前者", + "加之", + "加以", + "区", + "即", + "即令", + "即使", + "即便", + "即如", + "即或", + "即若", + "却", + "去", + "又", + "又及", + "及", + "及其", + "及至", + "反之", + "反而", + "反过来", + "反过来说", + "受到", + "另", + "另一方面", + "另外", + "另悉", + "只", + "只当", + "只怕", + "只是", + "只有", + "只消", + "只要", + "只限", + "叫", + "叮咚", + "可", + "可以", + "可是", + "可见", + "各", + "各个", + "各位", + "各种", + "各自", + "同", + "同时", + "后", + "后者", + "向", + "向使", + "向着", + "吓", + "吗", + "否则", + "吧", + "吧哒", + "含", + "吱", + "呀", + "呃", + "呕", + "呗", + "呜", + "呜呼", + "呢", + "呵", + "呵呵", + "呸", + "呼哧", + "咋", + "和", + "咚", + "咦", + "咧", + "咱", + "咱们", + "咳", + "哇", + "哈", + "哈哈", + "哉", + "哎", + "哎呀", + "哎哟", + "哗", + "哟", + "哦", + "哩", + "哪", + "哪个", + "哪些", + "哪儿", + "哪天", + "哪年", + "哪怕", + "哪样", + "哪边", + "哪里", + "哼", + "哼唷", + "唉", + "唯有", + "啊", + "啐", + "啥", + "啦", + "啪达", + "啷当", + "喂", + "喏", + "喔唷", + "喽", + "嗡", + "嗡嗡", + "嗬", + "嗯", + "嗳", + "嘎", + "嘎登", + "嘘", + "嘛", + "嘻", + "嘿", + "嘿嘿", + "四", + "因", + "因为", + "因了", + "因此", + "因着", + "因而", + "固然", + "在", + "在下", + "在于", + "地", + "基于", + "处在", + "多", + "多么", + "多少", + "大", + "大家", + "她", + "她们", + "好", + "如", + "如上", + "如上所述", + "如下", + "如何", + "如其", + "如同", + "如是", + 
"如果", + "如此", + "如若", + "始而", + "孰料", + "孰知", + "宁", + "宁可", + "宁愿", + "宁肯", + "它", + "它们", + "对", + "对于", + "对待", + "对方", + "对比", + "将", + "小", + "尔", + "尔后", + "尔尔", + "尚且", + "就", + "就是", + "就是了", + "就是说", + "就算", + "就要", + "尽", + "尽管", + "尽管如此", + "岂但", + "己", + "已", + "已矣", + "巴", + "巴巴", + "年", + "并", + "并且", + "庶乎", + "庶几", + "开外", + "开始", + "归", + "归齐", + "当", + "当地", + "当然", + "当着", + "彼", + "彼时", + "彼此", + "往", + "待", + "很", + "得", + "得了", + "怎", + "怎么", + "怎么办", + "怎么样", + "怎奈", + "怎样", + "总之", + "总的来看", + "总的来说", + "总的说来", + "总而言之", + "恰恰相反", + "您", + "惟其", + "慢说", + "我", + "我们", + "或", + "或则", + "或是", + "或曰", + "或者", + "截至", + "所", + "所以", + "所在", + "所幸", + "所有", + "才", + "才能", + "打", + "打从", + "把", + "抑或", + "拿", + "按", + "按照", + "换句话说", + "换言之", + "据", + "据此", + "接着", + "故", + "故此", + "故而", + "旁人", + "无", + "无宁", + "无论", + "既", + "既往", + "既是", + "既然", + "日", + "时", + "时候", + "是", + "是以", + "是的", + "更", + "曾", + "替", + "替代", + "最", + "月", + "有", + "有些", + "有关", + "有及", + "有时", + "有的", + "望", + "朝", + "朝着", + "本", + "本人", + "本地", + "本着", + "本身", + "来", + "来着", + "来自", + "来说", + "极了", + "果然", + "果真", + "某", + "某个", + "某些", + "某某", + "根据", + "欤", + "正值", + "正如", + "正巧", + "正是", + "此", + "此地", + "此处", + "此外", + "此时", + "此次", + "此间", + "毋宁", + "每", + "每当", + "比", + "比及", + "比如", + "比方", + "没奈何", + "沿", + "沿着", + "漫说", + "点", + "焉", + "然则", + "然后", + "然而", + "照", + "照着", + "犹且", + "犹自", + "甚且", + "甚么", + "甚或", + "甚而", + "甚至", + "甚至于", + "用", + "用来", + "由", + "由于", + "由是", + "由此", + "由此可见", + "的", + "的确", + "的话", + "直到", + "相对而言", + "省得", + "看", + "眨眼", + "着", + "着呢", + "矣", + "矣乎", + "矣哉", + "离", + "秒", + "称", + "竟而", + "第", + "等", + "等到", + "等等", + "简言之", + "管", + "类如", + "紧接着", + "纵", + "纵令", + "纵使", + "纵然", + "经", + "经过", + "结果", + "给", + "继之", + "继后", + "继而", + "综上所述", + "罢了", + "者", + "而", + "而且", + "而况", + "而后", + "而外", + "而已", + "而是", + "而言", + "能", + "能否", + "腾", + "自", + "自个儿", + "自从", + "自各儿", + "自后", + "自家", + "自己", + "自打", + "自身", + "至", + "至于", + "至今", + "至若", + "致", + "般的", + "若", + "若夫", + "若是", + "若果", + "若非", + "莫不然", + "莫如", + "莫若", + "虽", + "虽则", + "虽然", + "虽说", + "被", + "要", + "要不", + "要不是", + "要不然", + "要么", + "要是", + "譬喻", + "譬如", + "让", + "许多", + "论", + "设使", + "设或", + "设若", + "诚如", + "诚然", + "该", + "说", + "说来", + "请", + "诸", + "诸位", + "诸如", + "谁", + "谁人", + "谁料", + "谁知", + "贼死", + "赖以", + "赶", + "起", + "起见", + "趁", + "趁着", + "越是", + "距", + "跟", + "较", + "较之", + "边", + "过", + "还", + "还是", + "还有", + "还要", + "这", + "这一来", + "这个", + "这么", + "这么些", + "这么样", + "这么点儿", + "这些", + "这会儿", + "这儿", + "这就是说", + "这时", + "这样", + "这次", + "这般", + "这边", + "这里", + "进而", + "连", + "连同", + "逐步", + "通过", + "遵循", + "遵照", + "那", + "那个", + "那么", + "那么些", + "那么样", + "那些", + "那会儿", + "那儿", + "那时", + "那样", + "那般", + "那边", + "那里", + "都", + "鄙人", + "鉴于", + "针对", + "阿", + "除", + "除了", + "除外", + "除开", + "除此之外", + "除非", + "随", + "随后", + "随时", + "随着", + "难道说", + "零", + "非", + "非但", + "非徒", + "非特", + "非独", + "靠", + "顺", + "顺着", + "首先", + "︿", + "!", + "#", + "$", + "%", + "&", + "(", + ")", + "*", + "+", + ",", + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + ":", + ";", + "<", + ">", + "?", + "@", + "[", + "]", + "{", + "|", + "}", + "~", + "¥", + ] +) diff --git a/tests/test_download.py b/tests/test_download.py index 7a8434d64..c19bb9cc9 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -121,34 +121,6 @@ def test_imports(self): assert True - def test_resiliparse_extract_text(self, html_string): - algorithm = ResiliparseExtractor() - stop_words = 
get_stop_list_dict() - result = algorithm.extract_text(html_string, stop_words["ENGLISH"]) - - expected = [ - "This is a sample paragraph. In it we write words. These are stopwords: because did than has near we almost while what still.", - "Let's keep this paragraph: either came does last new took taken making became from.", - ] - - assert result == expected - - def test_trafilatura_extract_text(self, html_string): - algorithm = TrafilaturaExtractor( - min_extracted_size=10, - min_duplcheck_size=10, - max_repetitions=1, - deduplicate=True, - ) - stop_words = get_stop_list_dict() - result = algorithm.extract_text(html_string, stop_words["ENGLISH"]) - - expected = [ - "Let's keep this paragraph: either came does last new took taken making became from.", - ] - - assert result == expected - @pytest.mark.skip( reason="This test is flaky due to calling out to an external service and should be fixed." ) @@ -510,3 +482,228 @@ def test_common_crawl_extractor_resiliparse(self): "Common Crawl test paragraph for resiliparse extractor." in result["text"] ) assert "language" in result + + +class TestExtractor: + def test_resiliparse_extract_text(self, html_string): + algorithm = ResiliparseExtractor() + stop_words = get_stop_list_dict() + result = algorithm.extract_text(html_string, stop_words["ENGLISH"], "ENGLISH") + + expected = [ + "This is a sample paragraph. In it we write words. These are stopwords: because did than has near we almost while what still.", + "Let's keep this paragraph: either came does last new took taken making became from.", + ] + + assert result == expected + + def test_trafilatura_extract_text(self, html_string): + algorithm = TrafilaturaExtractor( + min_extracted_size=10, + min_duplcheck_size=10, + max_repetitions=1, + deduplicate=True, + ) + stop_words = get_stop_list_dict() + result = algorithm.extract_text(html_string, stop_words["ENGLISH"], "ENGLISH") + + expected = [ + "Let's keep this paragraph: either came does last new took taken making became from.", + ] + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_thai_text(self, extraction_algorithm): + thai_html = """ + + ชื่อเรื่องของฉัน + + +
+ นี่คือตัวอย่างย่อหน้า ในนั้นเราเขียนคำต่างๆ + เหล่านี้เป็นคำหยุด: เพราะว่า ทำ กว่า มี ใกล้ เรา เกือบจะ ขณะที่ อะไร ยังคง + +

+ ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก +
เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "นี่คือตัวอย่างย่อหน้า ในนั้นเราเขียนคำต่างๆ\nเหล่านี้เป็นคำหยุด: เพราะว่า ทำ กว่า มี ใกล้ เรา เกือบจะ ขณะที่ อะไร ยังคง", + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก\nเรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "นี่คือตัวอย่างย่อหน้า ในนั้นเราเขียนคำต่างๆ เหล่านี้เป็นคำหยุด: เพราะว่า ทำ กว่า มี ใกล้ เรา เกือบจะ ขณะที่ อะไร ยังคง", + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก", + "เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก", + "เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก", + "เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text(thai_html, stop_words["THAI"], "THAI") + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_chinese_text(self, extraction_algorithm): + chinese_html = """ + + 我的标题 + + +
+ 这是一个示例段落。我们在其中写下单词。 + +

+ 本段落没有太多停用词。请将其删除。 +
让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。 +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "这是一个示例段落。我们在其中写下单词。", + "本段落没有太多停用词。请将其删除。\n让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "这是一个示例段落。我们在其中写下单词。", + "本段落没有太多停用词。请将其删除。", + "让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "这是一个示例段落。我们在其中写下单词。", + "本段落没有太多停用词。请将其删除。", + "让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text(chinese_html, stop_words["CHINESE"], "CHINESE") + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_japanese_text(self, extraction_algorithm): + japanese_html = """ + + 私のタイトル + + +
+ これはサンプルの段落です。ここに単語を書き込みます。 + +

+ この段落にはストップワードがあまりありません。削除してください。 +
この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。 +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "これはサンプルの段落です。ここに単語を書き込みます。", + "この段落にはストップワードがあまりありません。削除してください。\nこの段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "これはサンプルの段落です。ここに単語を書き込みます。", + "この段落にはストップワードがあまりありません。削除してください。", + "この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "この段落にはストップワードがあまりありません。削除してください。", + "この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + "この段落にはストップワードがあまりありません。削除してください。", + "この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text( + japanese_html, stop_words["JAPANESE"], "JAPANESE" + ) + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_korean_text(self, extraction_algorithm): + korean_html = """ + + 내 제목 + + +
+ 이것은 샘플 문단입니다. 여기에 단어를 적습니다. + 이것들은 불용어입니다: 왜냐하면, 했으므로, 보다, 가까이에, 우리, 거의, 동안, 무엇, 아직도. + +

+ 이 문단에는 불용어가 많지 않습니다. 제거하세요. +
이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다. +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "이것은 샘플 문단입니다. 여기에 단어를 적습니다.\n이것들은 불용어입니다: 왜냐하면, 했으므로, 보다, 가까이에, 우리, 거의, 동안, 무엇, 아직도.", + "이 문단에는 불용어가 많지 않습니다. 제거하세요.\n이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "이것은 샘플 문단입니다. 여기에 단어를 적습니다. 이것들은 불용어입니다: 왜냐하면, 했으므로, 보다, 가까이에, 우리, 거의, 동안, 무엇, 아직도.", + "이 문단에는 불용어가 많지 않습니다. 제거하세요.", + "이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "이 문단에는 불용어가 많지 않습니다. 제거하세요.", + "이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + "이 문단에는 불용어가 많지 않습니다. 제거하세요.", + "이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text(korean_html, stop_words["KOREAN"], "KOREAN") + + assert result == expected