diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index 61d746d4f..ec47578c9 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -80,7 +80,7 @@ By "extraction", we typically mean the process of converting a data format from
 * ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
 * ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
 
-You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
+ You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
 
 .. code-block:: python
 
@@ -133,13 +133,33 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
 Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
 
 **Note:** The JusTextExtractor, ResiliparseExtractor, and TrafilaturaExtractor classes each have their own unique parameters which are specific to their extraction algorithms. Please see the docstrings for each class for more details.
 
+You can set your own dictionary of stop words by language to be used when extracting text:
+
+.. code-block:: python
+
+    from nemo_curator.download import download_common_crawl
+
+    # Change the default stop list used
+    stop_lists = {"ENGLISH": frozenset(["the", "and", "is", "in", "for", "where", "when", "to", "at"])}
+
+    common_crawl = download_common_crawl(
+        "/extracted/output/folder",
+        "2020-50",
+        "2021-04",
+        output_type="jsonl",
+        stop_lists=stop_lists,
+    )
+
+This may be desirable to further customize your text extraction pipeline, or to enable text extraction support for languages not included by jusText and NeMo Curator.
+
 The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
 
 NeMo Curator's Common Crawl extraction process looks like this under the hood:
 
- 1. Decode the HTML within the record from binary to text.
- 2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
- 3. Finally, the extract the relevant text with `jusText `_, `Resiliparse `_, or `Trafilatura `_ from the HTML and write it out as a single string within the 'text' field of a json entry within a `.jsonl` file.
+1. Decode the HTML within the record from binary to text.
+2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
+3. Finally, extract the relevant text with `jusText `_, `Resiliparse `_, or `Trafilatura `_ from the HTML and write it out as a single string within the "text" field of a JSON entry within a ``.jsonl`` file.
+
 * ``download_wikipedia`` will download and extract the latest wikipedia dump. Files are downloaded using ``wget``. Wikipedia might download slower than the other datasets. This is because they limit the number of downloads that can occur per-ip address.
 
 ..
code-block:: python diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py index badc9840e..f23ad3a79 100644 --- a/nemo_curator/download/commoncrawl.py +++ b/nemo_curator/download/commoncrawl.py @@ -16,6 +16,7 @@ import os import subprocess import unicodedata +import warnings from abc import ABC, abstractmethod from copy import deepcopy from typing import Literal, Optional @@ -40,6 +41,8 @@ from nemo_curator.utils.download_utils import get_common_crawl_urls from nemo_curator.utils.file_utils import expand_outdir_and_mkdir +NON_SPACED_LANGUAGES = ["THAI", "CHINESE", "JAPANESE", "KOREAN"] + def decode_html(html_bytes): # Convert from bytes to text using utf-8 encoding @@ -76,7 +79,7 @@ def lang_detect(decoded_html): class HTMLExtractorAlgorithm(ABC): @abstractmethod - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): pass @@ -90,6 +93,7 @@ def __init__( max_link_density=0.2, max_heading_distance=200, no_headings=False, + is_boilerplate=None, logger=None, ): """ @@ -123,6 +127,9 @@ def __init__( max_link_density: Maximum allowed link density in the text. max_heading_distance: Maximum distance from a heading to consider text for extraction. no_headings: If True, text extraction will ignore headings. + is_boilerplate: If True, text extraction will ignore boilerplate content. + Default is True for space-separated languages and False for non-space-separated languages + (Thai, Chinese, Japanese, and Korean). logger: Optional logger instance for logging messages. """ @@ -133,9 +140,10 @@ def __init__( self.max_link_density = max_link_density self.max_heading_distance = max_heading_distance self.no_headings = no_headings + self.is_boilerplate = is_boilerplate self.logger = logger - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): # Segment the HTML into paragraphs try: # Form the DOM tree @@ -149,6 +157,7 @@ def extract_text(self, html, stop_words): if self.logger is not None: self.logger.info("Could not segment paragaphs in the document") return + paragraphs = handler.paragraphs # Context free classification @@ -175,7 +184,21 @@ def extract_text(self, html, stop_words): self.max_heading_distance, ) - return [p.text for p in paragraphs if not p.is_boilerplate] + if self.is_boilerplate is None: + if language in NON_SPACED_LANGUAGES: + warnings.warn("Disabling is_boilerplate check for jusText extraction.") + is_boilerplate = False + else: + is_boilerplate = True + + else: + is_boilerplate = self.is_boilerplate + + if is_boilerplate: + return [p.text for p in paragraphs if not p.is_boilerplate] + + else: + return [p.text for p in paragraphs] class ResiliparseExtractor(HTMLExtractorAlgorithm): @@ -212,26 +235,34 @@ def __init__( self.main_content = main_content self.alt_texts = alt_texts - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): text = extract_plain_text( html, main_content=self.main_content, alt_texts=self.alt_texts ) paragraphs = list(filter(None, text.split("\n"))) - result = [] - for paragraph in paragraphs: - words = paragraph.split() - length = len(words) - if length == 0: - continue - stopwords = [word for word in words if word in stop_words] - stopword_density = len(stopwords) / length - if stopword_density >= self.required_stopword_density: - result.append(paragraph) + if language in NON_SPACED_LANGUAGES: + warnings.warn( + "stopword_density is ignored for non-space-separated languages." 
+ ) + result = paragraphs + else: + result = [] + + for paragraph in paragraphs: + words = paragraph.split() + length = len(words) + + if length == 0: + continue + + stopwords = [word for word in words if word in stop_words] + stopword_density = len(stopwords) / length + + if stopword_density >= self.required_stopword_density: + result.append(paragraph) - if len(result) == 0: - return None return result @@ -300,7 +331,7 @@ def __init__( self.max_repetitions = max_repetitions self.extract_kwargs = extract_kwargs - def extract_text(self, html, stop_words): + def extract_text(self, html, stop_words, language): trafilatura_config = deepcopy(TRAFILATURA_DEFAULT_CONFIG) trafilatura_config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = str( self.min_extracted_size @@ -328,17 +359,29 @@ def extract_text(self, html, stop_words): if text is not None: paragraphs = list(filter(None, text.split("\n"))) - result = [] - for paragraph in paragraphs: - words = paragraph.split() - length = len(words) - if length == 0: - continue - stopwords = [word for word in words if word in stop_words] - stopword_density = len(stopwords) / length - if stopword_density >= self.required_stopword_density: - result.append(paragraph) + if language in NON_SPACED_LANGUAGES: + warnings.warn( + "stopword_density is ignored for non-space-separated languages." + ) + result = paragraphs + + else: + result = [] + + for paragraph in paragraphs: + words = paragraph.split() + length = len(words) + + if length == 0: + continue + + stopwords = [word for word in words if word in stop_words] + stopword_density = len(stopwords) / length + + if stopword_density >= self.required_stopword_density: + result.append(paragraph) + else: return None @@ -357,12 +400,35 @@ def get_stop_list_dict(languages=[]): "Norwegian_Nynorsk": "NORWEGIAN_N", "Waray_Waray": "WARAY_PHILIPPINES", } + + # List obtained from https://github.com/stopwords-iso/stopwords-ja + from .ja_stopwords import ja_stopwords + + # List obtained from https://github.com/stopwords-iso/stopwords-th + from .th_stopwords import th_stopwords + + # List obtained from https://github.com/stopwords-iso/stopwords-zh + from .zh_stopwords import zh_stopwords + + custom_stopwords = { + "THAI": th_stopwords, + "CHINESE": zh_stopwords, + "JAPANESE": ja_stopwords, + } + if len(languages) == 0: languages = justext.get_stoplists() - # Remove latin as it yields a lot of low quality documents - languages_no_latin = list(languages) - languages_no_latin.remove("Latin") - languages = frozenset(languages_no_latin) + + # Remove Latin as it yields a lot of low quality documents + languages = list(languages) + languages.remove("Latin") + + # Manually add Thai, Chinese, and Japanese + languages.append("THAI") + languages.append("CHINESE") + languages.append("JAPANESE") + + languages = frozenset(languages) stop_list_dict = {} for language in languages: @@ -370,12 +436,11 @@ def get_stop_list_dict(languages=[]): lang_key = lang_map[language] else: lang_key = language.upper() - stop_list_dict[lang_key] = justext.get_stoplist(language) - - # List obtained from https://github.com/stopwords-iso/stopwords-th - from .thai_stopwords import thai_stopwords - stop_list_dict["THAI"] = thai_stopwords + if lang_key in custom_stopwords: + stop_list_dict[lang_key] = custom_stopwords[lang_key] + else: + stop_list_dict[lang_key] = justext.get_stoplist(language) return stop_list_dict @@ -484,8 +549,12 @@ def iterate(self, file_path): class CommonCrawlWARCExtractor(DocumentExtractor): - def __init__(self, algorithm=JusTextExtractor()): - 
self._stop_lists = get_stop_list_dict() + def __init__(self, algorithm=JusTextExtractor(), stop_lists=None): + if stop_lists is not None: + self._stop_lists = stop_lists + else: + self._stop_lists = get_stop_list_dict() + self.algorithm = algorithm super().__init__() @@ -496,7 +565,7 @@ def extract(self, content): lang = lang_detect(html) text = None if lang in self._stop_lists: - text = self.algorithm.extract_text(html, self._stop_lists[lang]) + text = self.algorithm.extract_text(html, self._stop_lists[lang], lang) if text is not None: if len(text) > 0: text = "\n\n".join(text) @@ -512,6 +581,7 @@ def download_common_crawl( end_snapshot: str, output_type: Literal["jsonl", "parquet"] = "jsonl", algorithm=JusTextExtractor(), + stop_lists=None, news: bool = False, aws: bool = False, raw_download_dir: Optional[str] = None, @@ -536,6 +606,10 @@ def download_common_crawl( • This is not used for the output file, but is used to check if an extracted output already exists. algorithm: The text extraction algorithm instance to use for HTML processing. • This can be a JusTextExtractor (default), ResiliparseExtractor, or TrafilaturaExtractor object. + stop_lists: A dictionary stop lists, where the keys are languages (e.g., "ENGLISH") + and the values are Python frozensets denoting the list of stop words for that language. + If None, it defaults to jusText's stop lists: https://github.com/miso-belica/jusText/tree/main/justext/stoplists, + with added Thai, Chinese, and Japanese support. news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset. • This also means snapshot identifiers should follow the 'YYYY-MM' format. aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd; @@ -577,7 +651,7 @@ def download_common_crawl( expand_outdir_and_mkdir(raw_download_dir) downloader = CommonCrawlWARCDownloader(raw_download_dir, aws=aws) iterator = CommonCrawlWARCIterator() - extractor = CommonCrawlWARCExtractor(algorithm=algorithm) + extractor = CommonCrawlWARCExtractor(algorithm=algorithm, stop_lists=stop_lists) output_format = { "text": str, diff --git a/nemo_curator/download/ja_stopwords.py b/nemo_curator/download/ja_stopwords.py new file mode 100644 index 000000000..d5190d6ac --- /dev/null +++ b/nemo_curator/download/ja_stopwords.py @@ -0,0 +1,138 @@ +ja_stopwords = frozenset( + [ + "あそこ", + "あっ", + "あの", + "あのかた", + "あの人", + "あり", + "あります", + "ある", + "あれ", + "い", + "いう", + "います", + "いる", + "う", + "うち", + "え", + "お", + "および", + "おり", + "おります", + "か", + "かつて", + "から", + "が", + "き", + "ここ", + "こちら", + "こと", + "この", + "これ", + "これら", + "さ", + "さらに", + "し", + "しかし", + "する", + "ず", + "せ", + "せる", + "そこ", + "そして", + "その", + "その他", + "その後", + "それ", + "それぞれ", + "それで", + "た", + "ただし", + "たち", + "ため", + "たり", + "だ", + "だっ", + "だれ", + "つ", + "て", + "で", + "でき", + "できる", + "です", + "では", + "でも", + "と", + "という", + "といった", + "とき", + "ところ", + "として", + "とともに", + "とも", + "と共に", + "どこ", + "どの", + "な", + "ない", + "なお", + "なかっ", + "ながら", + "なく", + "なっ", + "など", + "なに", + "なら", + "なり", + "なる", + "なん", + "に", + "において", + "における", + "について", + "にて", + "によって", + "により", + "による", + "に対して", + "に対する", + "に関する", + "の", + "ので", + "のみ", + "は", + "ば", + "へ", + "ほか", + "ほとんど", + "ほど", + "ます", + "また", + "または", + "まで", + "も", + "もの", + "ものの", + "や", + "よう", + "より", + "ら", + "られ", + "られる", + "れ", + "れる", + "を", + "ん", + "何", + "及び", + "彼", + "彼女", + "我々", + "特に", + "私", + "私達", + "貴方", + "貴方方", + ] +) diff --git a/nemo_curator/download/thai_stopwords.py 
b/nemo_curator/download/th_stopwords.py similarity index 98% rename from nemo_curator/download/thai_stopwords.py rename to nemo_curator/download/th_stopwords.py index 0ef24737b..1680d1191 100644 --- a/nemo_curator/download/thai_stopwords.py +++ b/nemo_curator/download/th_stopwords.py @@ -1,4 +1,4 @@ -thai_stopwords = frozenset( +th_stopwords = frozenset( [ "กล่าว", "กว่า", diff --git a/nemo_curator/download/zh_stopwords.py b/nemo_curator/download/zh_stopwords.py new file mode 100644 index 000000000..05de06820 --- /dev/null +++ b/nemo_curator/download/zh_stopwords.py @@ -0,0 +1,798 @@ +zh_stopwords = frozenset( + [ + "、", + "。", + "〈", + "〉", + "《", + "》", + "一", + "一个", + "一些", + "一何", + "一切", + "一则", + "一方面", + "一旦", + "一来", + "一样", + "一种", + "一般", + "一转眼", + "七", + "万一", + "三", + "上", + "上下", + "下", + "不", + "不仅", + "不但", + "不光", + "不单", + "不只", + "不外乎", + "不如", + "不妨", + "不尽", + "不尽然", + "不得", + "不怕", + "不惟", + "不成", + "不拘", + "不料", + "不是", + "不比", + "不然", + "不特", + "不独", + "不管", + "不至于", + "不若", + "不论", + "不过", + "不问", + "与", + "与其", + "与其说", + "与否", + "与此同时", + "且", + "且不说", + "且说", + "两者", + "个", + "个别", + "中", + "临", + "为", + "为了", + "为什么", + "为何", + "为止", + "为此", + "为着", + "乃", + "乃至", + "乃至于", + "么", + "之", + "之一", + "之所以", + "之类", + "乌乎", + "乎", + "乘", + "九", + "也", + "也好", + "也罢", + "了", + "二", + "二来", + "于", + "于是", + "于是乎", + "云云", + "云尔", + "五", + "些", + "亦", + "人", + "人们", + "人家", + "什", + "什么", + "什么样", + "今", + "介于", + "仍", + "仍旧", + "从", + "从此", + "从而", + "他", + "他人", + "他们", + "他们们", + "以", + "以上", + "以为", + "以便", + "以免", + "以及", + "以故", + "以期", + "以来", + "以至", + "以至于", + "以致", + "们", + "任", + "任何", + "任凭", + "会", + "似的", + "但", + "但凡", + "但是", + "何", + "何以", + "何况", + "何处", + "何时", + "余外", + "作为", + "你", + "你们", + "使", + "使得", + "例如", + "依", + "依据", + "依照", + "便于", + "俺", + "俺们", + "倘", + "倘使", + "倘或", + "倘然", + "倘若", + "借", + "借傥然", + "假使", + "假如", + "假若", + "做", + "像", + "儿", + "先不先", + "光", + "光是", + "全体", + "全部", + "八", + "六", + "兮", + "共", + "关于", + "关于具体地说", + "其", + "其一", + "其中", + "其二", + "其他", + "其余", + "其它", + "其次", + "具体地说", + "具体说来", + "兼之", + "内", + "再", + "再其次", + "再则", + "再有", + "再者", + "再者说", + "再说", + "冒", + "冲", + "况且", + "几", + "几时", + "凡", + "凡是", + "凭", + "凭借", + "出于", + "出来", + "分", + "分别", + "则", + "则甚", + "别", + "别人", + "别处", + "别是", + "别的", + "别管", + "别说", + "到", + "前后", + "前此", + "前者", + "加之", + "加以", + "区", + "即", + "即令", + "即使", + "即便", + "即如", + "即或", + "即若", + "却", + "去", + "又", + "又及", + "及", + "及其", + "及至", + "反之", + "反而", + "反过来", + "反过来说", + "受到", + "另", + "另一方面", + "另外", + "另悉", + "只", + "只当", + "只怕", + "只是", + "只有", + "只消", + "只要", + "只限", + "叫", + "叮咚", + "可", + "可以", + "可是", + "可见", + "各", + "各个", + "各位", + "各种", + "各自", + "同", + "同时", + "后", + "后者", + "向", + "向使", + "向着", + "吓", + "吗", + "否则", + "吧", + "吧哒", + "含", + "吱", + "呀", + "呃", + "呕", + "呗", + "呜", + "呜呼", + "呢", + "呵", + "呵呵", + "呸", + "呼哧", + "咋", + "和", + "咚", + "咦", + "咧", + "咱", + "咱们", + "咳", + "哇", + "哈", + "哈哈", + "哉", + "哎", + "哎呀", + "哎哟", + "哗", + "哟", + "哦", + "哩", + "哪", + "哪个", + "哪些", + "哪儿", + "哪天", + "哪年", + "哪怕", + "哪样", + "哪边", + "哪里", + "哼", + "哼唷", + "唉", + "唯有", + "啊", + "啐", + "啥", + "啦", + "啪达", + "啷当", + "喂", + "喏", + "喔唷", + "喽", + "嗡", + "嗡嗡", + "嗬", + "嗯", + "嗳", + "嘎", + "嘎登", + "嘘", + "嘛", + "嘻", + "嘿", + "嘿嘿", + "四", + "因", + "因为", + "因了", + "因此", + "因着", + "因而", + "固然", + "在", + "在下", + "在于", + "地", + "基于", + "处在", + "多", + "多么", + "多少", + "大", + "大家", + "她", + "她们", + "好", + "如", + "如上", + "如上所述", + "如下", + "如何", + "如其", + "如同", + "如是", + 
"如果", + "如此", + "如若", + "始而", + "孰料", + "孰知", + "宁", + "宁可", + "宁愿", + "宁肯", + "它", + "它们", + "对", + "对于", + "对待", + "对方", + "对比", + "将", + "小", + "尔", + "尔后", + "尔尔", + "尚且", + "就", + "就是", + "就是了", + "就是说", + "就算", + "就要", + "尽", + "尽管", + "尽管如此", + "岂但", + "己", + "已", + "已矣", + "巴", + "巴巴", + "年", + "并", + "并且", + "庶乎", + "庶几", + "开外", + "开始", + "归", + "归齐", + "当", + "当地", + "当然", + "当着", + "彼", + "彼时", + "彼此", + "往", + "待", + "很", + "得", + "得了", + "怎", + "怎么", + "怎么办", + "怎么样", + "怎奈", + "怎样", + "总之", + "总的来看", + "总的来说", + "总的说来", + "总而言之", + "恰恰相反", + "您", + "惟其", + "慢说", + "我", + "我们", + "或", + "或则", + "或是", + "或曰", + "或者", + "截至", + "所", + "所以", + "所在", + "所幸", + "所有", + "才", + "才能", + "打", + "打从", + "把", + "抑或", + "拿", + "按", + "按照", + "换句话说", + "换言之", + "据", + "据此", + "接着", + "故", + "故此", + "故而", + "旁人", + "无", + "无宁", + "无论", + "既", + "既往", + "既是", + "既然", + "日", + "时", + "时候", + "是", + "是以", + "是的", + "更", + "曾", + "替", + "替代", + "最", + "月", + "有", + "有些", + "有关", + "有及", + "有时", + "有的", + "望", + "朝", + "朝着", + "本", + "本人", + "本地", + "本着", + "本身", + "来", + "来着", + "来自", + "来说", + "极了", + "果然", + "果真", + "某", + "某个", + "某些", + "某某", + "根据", + "欤", + "正值", + "正如", + "正巧", + "正是", + "此", + "此地", + "此处", + "此外", + "此时", + "此次", + "此间", + "毋宁", + "每", + "每当", + "比", + "比及", + "比如", + "比方", + "没奈何", + "沿", + "沿着", + "漫说", + "点", + "焉", + "然则", + "然后", + "然而", + "照", + "照着", + "犹且", + "犹自", + "甚且", + "甚么", + "甚或", + "甚而", + "甚至", + "甚至于", + "用", + "用来", + "由", + "由于", + "由是", + "由此", + "由此可见", + "的", + "的确", + "的话", + "直到", + "相对而言", + "省得", + "看", + "眨眼", + "着", + "着呢", + "矣", + "矣乎", + "矣哉", + "离", + "秒", + "称", + "竟而", + "第", + "等", + "等到", + "等等", + "简言之", + "管", + "类如", + "紧接着", + "纵", + "纵令", + "纵使", + "纵然", + "经", + "经过", + "结果", + "给", + "继之", + "继后", + "继而", + "综上所述", + "罢了", + "者", + "而", + "而且", + "而况", + "而后", + "而外", + "而已", + "而是", + "而言", + "能", + "能否", + "腾", + "自", + "自个儿", + "自从", + "自各儿", + "自后", + "自家", + "自己", + "自打", + "自身", + "至", + "至于", + "至今", + "至若", + "致", + "般的", + "若", + "若夫", + "若是", + "若果", + "若非", + "莫不然", + "莫如", + "莫若", + "虽", + "虽则", + "虽然", + "虽说", + "被", + "要", + "要不", + "要不是", + "要不然", + "要么", + "要是", + "譬喻", + "譬如", + "让", + "许多", + "论", + "设使", + "设或", + "设若", + "诚如", + "诚然", + "该", + "说", + "说来", + "请", + "诸", + "诸位", + "诸如", + "谁", + "谁人", + "谁料", + "谁知", + "贼死", + "赖以", + "赶", + "起", + "起见", + "趁", + "趁着", + "越是", + "距", + "跟", + "较", + "较之", + "边", + "过", + "还", + "还是", + "还有", + "还要", + "这", + "这一来", + "这个", + "这么", + "这么些", + "这么样", + "这么点儿", + "这些", + "这会儿", + "这儿", + "这就是说", + "这时", + "这样", + "这次", + "这般", + "这边", + "这里", + "进而", + "连", + "连同", + "逐步", + "通过", + "遵循", + "遵照", + "那", + "那个", + "那么", + "那么些", + "那么样", + "那些", + "那会儿", + "那儿", + "那时", + "那样", + "那般", + "那边", + "那里", + "都", + "鄙人", + "鉴于", + "针对", + "阿", + "除", + "除了", + "除外", + "除开", + "除此之外", + "除非", + "随", + "随后", + "随时", + "随着", + "难道说", + "零", + "非", + "非但", + "非徒", + "非特", + "非独", + "靠", + "顺", + "顺着", + "首先", + "︿", + "!", + "#", + "$", + "%", + "&", + "(", + ")", + "*", + "+", + ",", + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + ":", + ";", + "<", + ">", + "?", + "@", + "[", + "]", + "{", + "|", + "}", + "~", + "¥", + ] +) diff --git a/tests/test_download.py b/tests/test_download.py index 7a8434d64..c19bb9cc9 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -121,34 +121,6 @@ def test_imports(self): assert True - def test_resiliparse_extract_text(self, html_string): - algorithm = ResiliparseExtractor() - stop_words = 
get_stop_list_dict() - result = algorithm.extract_text(html_string, stop_words["ENGLISH"]) - - expected = [ - "This is a sample paragraph. In it we write words. These are stopwords: because did than has near we almost while what still.", - "Let's keep this paragraph: either came does last new took taken making became from.", - ] - - assert result == expected - - def test_trafilatura_extract_text(self, html_string): - algorithm = TrafilaturaExtractor( - min_extracted_size=10, - min_duplcheck_size=10, - max_repetitions=1, - deduplicate=True, - ) - stop_words = get_stop_list_dict() - result = algorithm.extract_text(html_string, stop_words["ENGLISH"]) - - expected = [ - "Let's keep this paragraph: either came does last new took taken making became from.", - ] - - assert result == expected - @pytest.mark.skip( reason="This test is flaky due to calling out to an external service and should be fixed." ) @@ -510,3 +482,228 @@ def test_common_crawl_extractor_resiliparse(self): "Common Crawl test paragraph for resiliparse extractor." in result["text"] ) assert "language" in result + + +class TestExtractor: + def test_resiliparse_extract_text(self, html_string): + algorithm = ResiliparseExtractor() + stop_words = get_stop_list_dict() + result = algorithm.extract_text(html_string, stop_words["ENGLISH"], "ENGLISH") + + expected = [ + "This is a sample paragraph. In it we write words. These are stopwords: because did than has near we almost while what still.", + "Let's keep this paragraph: either came does last new took taken making became from.", + ] + + assert result == expected + + def test_trafilatura_extract_text(self, html_string): + algorithm = TrafilaturaExtractor( + min_extracted_size=10, + min_duplcheck_size=10, + max_repetitions=1, + deduplicate=True, + ) + stop_words = get_stop_list_dict() + result = algorithm.extract_text(html_string, stop_words["ENGLISH"], "ENGLISH") + + expected = [ + "Let's keep this paragraph: either came does last new took taken making became from.", + ] + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_thai_text(self, extraction_algorithm): + thai_html = """ + + ชื่อเรื่องของฉัน + + +
+ นี่คือตัวอย่างย่อหน้า ในนั้นเราเขียนคำต่างๆ + เหล่านี้เป็นคำหยุด: เพราะว่า ทำ กว่า มี ใกล้ เรา เกือบจะ ขณะที่ อะไร ยังคง + +

+ ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก +
เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "นี่คือตัวอย่างย่อหน้า ในนั้นเราเขียนคำต่างๆ\nเหล่านี้เป็นคำหยุด: เพราะว่า ทำ กว่า มี ใกล้ เรา เกือบจะ ขณะที่ อะไร ยังคง", + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก\nเรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "นี่คือตัวอย่างย่อหน้า ในนั้นเราเขียนคำต่างๆ เหล่านี้เป็นคำหยุด: เพราะว่า ทำ กว่า มี ใกล้ เรา เกือบจะ ขณะที่ อะไร ยังคง", + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก", + "เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก", + "เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + "ย่อหน้านี้ไม่มีคำหยุดมากนัก ลบออก", + "เรามาเก็บย่อหน้าไว้ดังนี้: ไม่ว่าจะมาทำอะไรใหม่ ๆ ก็เกิดขึ้น เกิดขึ้นจาก", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text(thai_html, stop_words["THAI"], "THAI") + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_chinese_text(self, extraction_algorithm): + chinese_html = """ + + 我的标题 + + +
+ 这是一个示例段落。我们在其中写下单词。 + +

+ 本段落没有太多停用词。请将其删除。 +
让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。 +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "这是一个示例段落。我们在其中写下单词。", + "本段落没有太多停用词。请将其删除。\n让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "这是一个示例段落。我们在其中写下单词。", + "本段落没有太多停用词。请将其删除。", + "让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "这是一个示例段落。我们在其中写下单词。", + "本段落没有太多停用词。请将其删除。", + "让我们保留这一段:要么来了,要么最后来了,要么新来了,要么采取了行动。", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text(chinese_html, stop_words["CHINESE"], "CHINESE") + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_japanese_text(self, extraction_algorithm): + japanese_html = """ + + 私のタイトル + + +
+ これはサンプルの段落です。ここに単語を書き込みます。 + +

+ この段落にはストップワードがあまりありません。削除してください。 +
この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。 +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "これはサンプルの段落です。ここに単語を書き込みます。", + "この段落にはストップワードがあまりありません。削除してください。\nこの段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "これはサンプルの段落です。ここに単語を書き込みます。", + "この段落にはストップワードがあまりありません。削除してください。", + "この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "この段落にはストップワードがあまりありません。削除してください。", + "この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + "この段落にはストップワードがあまりありません。削除してください。", + "この段落を維持しましょう: どちらかが来て、最後に新しいものを取って、作成し、なったのです。", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text( + japanese_html, stop_words["JAPANESE"], "JAPANESE" + ) + + assert result == expected + + @pytest.mark.parametrize( + "extraction_algorithm", ["justext", "resiliparse", "trafilatura"] + ) + def test_extract_korean_text(self, extraction_algorithm): + korean_html = """ + + 내 제목 + + +
+ 이것은 샘플 문단입니다. 여기에 단어를 적습니다. + 이것들은 불용어입니다: 왜냐하면, 했으므로, 보다, 가까이에, 우리, 거의, 동안, 무엇, 아직도. + +

+ 이 문단에는 불용어가 많지 않습니다. 제거하세요. +
이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다. +

+ +
+ + """ + + if extraction_algorithm == "justext": + algorithm = JusTextExtractor() + expected = [ + "이것은 샘플 문단입니다. 여기에 단어를 적습니다.\n이것들은 불용어입니다: 왜냐하면, 했으므로, 보다, 가까이에, 우리, 거의, 동안, 무엇, 아직도.", + "이 문단에는 불용어가 많지 않습니다. 제거하세요.\n이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + ] + elif extraction_algorithm == "resiliparse": + algorithm = ResiliparseExtractor() + expected = [ + "이것은 샘플 문단입니다. 여기에 단어를 적습니다. 이것들은 불용어입니다: 왜냐하면, 했으므로, 보다, 가까이에, 우리, 거의, 동안, 무엇, 아직도.", + "이 문단에는 불용어가 많지 않습니다. 제거하세요.", + "이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + ] + elif extraction_algorithm == "trafilatura": + algorithm = TrafilaturaExtractor() + expected = [ + "이 문단에는 불용어가 많지 않습니다. 제거하세요.", + "이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + "이 문단에는 불용어가 많지 않습니다. 제거하세요.", + "이 문단을 유지해 보겠습니다: 왔거나 마지막이거나 새로운 것이거나 가져갔거나 만들어지거나 되었거나에서 왔습니다.", + ] + + stop_words = get_stop_list_dict() + result = algorithm.extract_text(korean_html, stop_words["KOREAN"], "KOREAN") + + assert result == expected