diff --git a/.github/workflows/auto_push_pypi.yml b/.github/workflows/auto_push_pypi.yml new file mode 100644 index 0000000..9ac1b98 --- /dev/null +++ b/.github/workflows/auto_push_pypi.yml @@ -0,0 +1,74 @@ +name: Push latex to image to pypi + +on: + push: + # branches: [ main ] + # paths: + # - 'lineless_table_rec/**' + # tags: + # - v* + +jobs: + UnitTesting: + runs-on: ubuntu-latest + steps: + - name: Pull latest code + uses: actions/checkout@v3 + + - name: Set up Python 3.7 + uses: actions/setup-python@v4 + with: + python-version: '3.7' + architecture: 'x64' + + - name: Display Python version + run: python -c "import sys; print(sys.version)" + + - name: Unit testings + run: | + pip install -r requirements.txt + pip install pytest + + wget https://github.com/RapidAI/TableStructureRec/releases/download/v0.0.0/lineless_table_rec_models.zip + unzip lineless_table_rec_models.zip + mv lineless_table_rec_models/*.onnx lineless_table_rec/models/ + + pytest tests/test_lore.py + + GenerateWHL_PushPyPi: + needs: UnitTesting + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python 3.7 + uses: actions/setup-python@v4 + with: + python-version: '3.7' + architecture: 'x64' + + - name: Run setup.py + run: | + pip install -r requirements.txt + python -m pip install --upgrade pip + pip install wheel get_pypi_latest_version + + wget https://github.com/RapidAI/TableStructureRec/releases/download/v0.0.0/lineless_table_rec_models.zip + unzip lineless_table_rec_models.zip + mv lineless_table_rec_models/*.onnx lineless_table_rec/models/ + + python setup_lineless.py bdist_wheel ${{ github.event.head_commit.message }} + + # - name: Publish distribution 📦 to Test PyPI + # uses: pypa/gh-action-pypi-publish@v1.5.0 + # with: + # password: ${{ secrets.TEST_PYPI_API_TOKEN }} + # repository_url: https://test.pypi.org/legacy/ + # packages_dir: dist/ + + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@v1.5.0 + with: + password: ${{ 
secrets.PYPI_API_TOKEN }} + packages_dir: dist/ diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..4c21ff5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,159 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +.pytest_cache + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +# *.manifest +# *.spec +*.res + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +#idea +.vs +.vscode +.idea +/images +/models + +#models +*.onnx + +*.ttf +*.ttc + +long1.jpg + +*.bin +*.mapping +*.xml + +*.pdiparams +*.pdiparams.info +*.pdmodel + +.DS_Store +*.npy \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..5c227d6 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: +- repo: https://gitee.com/SWHL/autoflake + rev: v2.1.1 + hooks: + - id: autoflake + args: + [ + "--recursive", + "--in-place", + "--remove-all-unused-imports", + "--remove-unused-variable", + "--ignore-init-module-imports", + ] + files: \.py$ +- repo: https://gitee.com/SWHL/black + rev: 23.1.0 + hooks: + - id: black + files: \.py$ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1357436 --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ +
+
+

LaTeX To Image

+
+ + + SemVer2.0 + + GitHub + +
class CropByProject:
    """Crop an image to its content using projection profiles.

    The image is binarized with an inverted threshold, dilated once, and the
    first/last column and row that still contain foreground define the crop box.
    """

    def __init__(self, threshold=128):
        # Grey level handed to ``cv2.threshold`` when binarizing the input.
        self.threshold = threshold

    def __call__(self, origin_img, margin=(0, 0, 0, 0)):
        """Return the region of ``origin_img`` that contains foreground pixels.

        Args:
            origin_img (ndarray): Image to crop.
            margin (tuple): Extra pixels to keep on the
                ``(left, top, right, bottom)`` sides; clamped to the image.

        Returns:
            ndarray: Cropped slice of ``origin_img``. The input is returned
            unchanged when it contains no foreground at all.
        """
        # Inverted binarization: pixels above the threshold become 0, the
        # rest become 255 (the foreground value searched for below).
        _, img = cv2.threshold(origin_img, self.threshold, 255, cv2.THRESH_BINARY_INV)

        # Grow the strokes so that they merge into blocks.
        closed = cv2.dilate(img, None, iterations=1)

        # A blank image has nothing to project; cropping it would only fail.
        if not np.any(closed == 255):
            return origin_img

        # Horizontal extent (projection onto the x axis).
        x0, x1 = self.get_project_loc(closed, direction="width")

        # Vertical extent (projection onto the y axis).
        y0, y1 = self.get_project_loc(closed, direction="height")

        h, w = img.shape[:2]
        x0 = max(x0 - margin[0], 0)
        y0 = max(y0 - margin[1], 0)
        x1 = min(x1 + margin[2], w)
        y1 = min(y1 + margin[3], h)

        # NOTE(review): x1/y1 are inclusive indices but are used as exclusive
        # slice ends, so the last projected row/column is dropped. Kept as-is
        # because it changes the output size - confirm whether it is intended.
        return origin_img[y0:y1, x0:x1]

    @staticmethod
    def get_project_loc(img, direction):
        """Return the first and last index that contain foreground pixels.

        Args:
            img (ndarray): Binarized image whose foreground value is 255.
            direction (str): ``"width"`` for column indices, ``"height"`` for
                row indices.

        Raises:
            ValueError: If ``direction`` is not supported, or if the image
                contains no foreground pixel.

        Returns:
            tuple: ``(first, last)`` indices, both inclusive.
        """
        if direction == "width":
            axis = 0
        elif direction == "height":
            axis = 1
        else:
            raise ValueError(f"direction {direction} is not supported!")

        loc_sum = np.sum(img == 255, axis=axis)
        loc_range = np.argwhere(loc_sum > 0)
        if loc_range.size == 0:
            # Previously surfaced as an opaque IndexError on ``loc_range[0]``.
            raise ValueError("image contains no foreground pixels to project.")
        return int(loc_range[0][0]), int(loc_range[-1][0])


if __name__ == "__main__":
    import sys

    # Demo: crop an image, split it into a top and a bottom half and crop
    # each half again. The image path is taken from the command line.
    cropper = CropByProject()

    img_path = sys.argv[1] if len(sys.argv) > 1 else "res.png"
    img = cv2.imread(img_path)
    if img is None:
        raise SystemExit(f"Cannot read image: {img_path}")

    img = cropper(img)
    h, w = img.shape[:2]

    img_half = img[: h // 2, :]
    img_half2 = img[h // 2 :, :]

    cv2.imwrite("crop_im1.png", cropper(img_half))
    cv2.imwrite("crop_im2.png", cropper(img_half2))


class LaTeXToImg:
    """Render a LaTeX formula to an image and crop it to the formula."""

    def __init__(
        self,
    ):
        # The renderer lives in the sibling ``render_latex`` module.
        from .render_latex import RenderLaTeX

        self.cropper = CropByProject()
        self.latex = RenderLaTeX()

    def __call__(self, math: str) -> np.ndarray:
        """Return the cropped image of ``math``.

        Raises:
            RuntimeError: If the renderer produced no image (it signals a
                failed render by returning ``None``).
        """
        img = self.latex(math)
        if img is None:
            # Fail with a clear message instead of an opaque OpenCV error.
            raise RuntimeError(f"Failed to render LaTeX formula: {math!r}")
        return self.cropper(img)
class RenderLaTeX:
    """Render a LaTeX math expression to an image array.

    The expression is placed into a minimal document, compiled with
    ``xelatex`` and rasterized with the ``convert`` command; both programs are
    invoked as external processes.
    """

    def __init__(self, dpi=200):
        # Density passed to ``convert`` when rasterizing the PDF.
        self.dpi = dpi
        # Document template; ``%s`` receives the formula (inline math mode).
        self.BASE = r"""\documentclass[12pt]{article}\usepackage{fontspec,unicode-math}\thispagestyle{empty}\setmathfont{Latin Modern Math}\begin{document}$%s$\end{document}"""

    def __call__(self, math: str):
        """Render ``math`` and return the image, or ``None`` on failure.

        Failures while compiling or converting are printed as a traceback
        and reported by returning ``None``. The temporary files are removed in
        both cases.
        """
        work_dir, tex_file = self.generate_tmp(math)
        try:
            pdf_file = self.render_by_xelatex(work_dir, tex_file)
            return self.convert_pdf_to_png(pdf_file)
        except Exception:
            # Deliberate best effort: report the problem, signal it with None.
            traceback.print_exc()
            return None
        finally:
            self.clear_files(tex_file)

    def generate_tmp(self, math):
        """Write the document for ``math`` to a new temporary ``.tex`` file.

        Returns:
            tuple: ``(work_dir, tex_file)`` - the temp directory and the path
            of the file created inside it.
        """
        workdir = tempfile.gettempdir()
        fd, tex_file = tempfile.mkstemp(".tex", "eq", workdir, True)
        # Explicit UTF-8 so non-ASCII formulas do not depend on the locale.
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(self.BASE % (math,))
        return workdir, tex_file

    def render_by_xelatex(self, work_dir, in_file) -> Path:
        """Compile ``in_file`` with xelatex and return the resulting PDF path.

        Raises:
            LatexError: If no PDF was produced or xelatex did not report
                exactly one output page.
        """
        # Argument list (not a shell string) so paths with spaces stay intact.
        cmd = [
            "xelatex",
            "-interaction",
            "errorstopmode",
            "-file-line-error",
            "-output-directory",
            str(work_dir),
            str(in_file),
        ]
        sout, _ = self.run_cmd(cmd)

        pdf_file: Path = Path(in_file).with_suffix(".pdf")
        # xelatex may break long output lines in the middle of the file name
        # (the previous pattern hard-coded one such break position), so the
        # line breaks are removed before searching for the page report.
        flat_output = (sout or "").replace("\n", "")
        expression = re.escape(pdf_file.name) + r" \((\d+) pages?"
        if pdf_file.is_file() and self.is_success(
            text=flat_output,
            expression=expression,
        ):
            return pdf_file
        raise LatexError("xelatex meets error.")

    def convert_pdf_to_png(self, pdf_file):
        """Rasterize ``pdf_file`` with ``convert`` and return it as an array.

        Raises:
            LatexError: If ``convert`` exits with a non-zero status.
        """
        png_file: Path = Path(pdf_file).with_suffix(".png")
        cmd = [
            "convert",
            "-background",
            "white",
            "-flatten",
            "-density",
            str(self.dpi),
            "-colorspace",
            "gray",
            str(pdf_file),
            "-quality",
            "90",
            str(png_file),
        ]
        _, return_code = self.run_cmd(cmd)
        if return_code != 0:
            cmd_text = " ".join(cmd)
            raise LatexError(f"PDF to png error\n{cmd_text}\n{pdf_file}")
        # Close the file handle before the PNG is deleted by the cleanup.
        with Image.open(png_file) as png:
            return np.array(png)

    @staticmethod
    def run_cmd(shell_cmd: Union[str, list]):
        """Run a command and return ``(stdout, return_code)``.

        Args:
            shell_cmd: Either a command line string (split with
                ``shlex.split``) or an already split argument list.
        """
        if isinstance(shell_cmd, str):
            args = shlex.split(shell_cmd)
        else:
            args = [str(arg) for arg in shell_cmd]

        with subprocess.Popen(
            args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        ) as p:
            sout, _ = p.communicate()
        return sout, p.returncode

    @staticmethod
    def is_success(text, expression=None):
        """Tell whether ``text`` reports exactly one page.

        Args:
            text (str): Program output to search.
            expression (str): Regular expression whose first group captures
                the page count.

        Returns:
            bool: ``True`` only if the first match captured the number 1.
        """
        if not text or not expression:
            return False
        try:
            match = re.search(expression, text)
        except (re.error, TypeError):
            return False
        if match is None:
            return False

        captured = match.group(1) if match.re.groups else match.group(0)
        try:
            return int(captured) == 1
        except (TypeError, ValueError):
            # Optional group did not participate, or it is not a number.
            return False

    @staticmethod
    def clear_files(in_file: Union[str, Path]) -> None:
        """Delete every file next to ``in_file`` that shares its stem."""
        in_path = Path(in_file)
        for file_path in in_path.parent.glob(f"{in_path.stem}*"):
            try:
                file_path.unlink()
            except OSError:
                # Cleanup runs in a ``finally`` block: a leftover temp file
                # must not replace the render result or the original error.
                pass


class LatexError(Exception):
    """Raised when compiling the formula or converting the PDF fails."""
def latex(tex, output_file, dpi=300, bgcolor="#FFFFFF", fgcolor="#000000"):
    """Render ``tex`` to ``output_file`` with :class:`LatexConverter`.

    Args:
        tex (str): LaTeX source placed inside an ``eqnarray*`` environment.
        output_file (str): Name of the image to create.
        dpi (int): Resolution used when rasterizing the PDF.
        bgcolor (str): Page colour as ``#RRGGBB``.
        fgcolor (str): Text colour as ``#RRGGBB``.
    """
    converter = LatexConverter(
        tex, output_file, dpi=dpi, bgcolor=bgcolor, fgcolor=fgcolor
    )
    converter.convert()


def latex_by_mpl(tex, output_file, dpi=300, bgcolor="white", fgcolor="black"):
    """Render ``tex`` to ``output_file`` through matplotlib's usetex mode.

    The text is drawn once on a scratch figure to measure it, then drawn again
    on a figure sized 10% larger than the measured box and saved. Note that
    this sets global matplotlib ``rcParams``.
    """
    tex = "$ " + tex + " $"
    mpl.rcParams["text.usetex"] = True
    # Plain raw string, not an f-string: braces must not be doubled.
    mpl.rcParams["text.latex.preamble"] = r"\usepackage{amsmath}"
    plt.rcParams.update({"mathtext.fontset": "cm"})

    # First pass: measure the rendered text on a scratch figure.
    probe = plt.figure(figsize=(10, 10), dpi=100)
    try:
        t = probe.text(
            0,
            0,
            tex,
            horizontalalignment="left",
            verticalalignment="bottom",
            fontsize=30,
        )
        r = probe.canvas.get_renderer()
        bbox = t.get_tightbbox(r)
        w, h = (bbox.width / r.dpi, bbox.height / r.dpi)
    finally:
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close(probe)

    # Second pass: draw on a figure that just fits the text and save it.
    fig = plt.figure(figsize=(1.1 * w, 1.1 * h), dpi=dpi)
    try:
        fig.text(
            0,
            0,
            tex,
            fontsize=30,
            verticalalignment="bottom",
            horizontalalignment="left",
            bbox={"facecolor": bgcolor, "edgecolor": bgcolor},
            color=fgcolor,
        )
        fig.savefig(output_file, transparent=False)
    finally:
        plt.close(fig)


class LatexExpection(Exception):
    """Raised when pdflatex rejects the expression."""


# Correctly spelled alias; the misspelled name stays for existing callers.
LatexException = LatexExpection


class CropException(Exception):
    """Raised when pdfcrop fails."""


class ConversionException(Exception):
    """Reserved for PDF-to-image conversion failures."""


class LatexConverter:
    """Convert a LaTeX expression to an image via pdflatex and pdfcrop."""

    def __init__(self, tex, output_file, bgcolor, fgcolor, dpi):
        # Results are moved here, the directory the converter was created in.
        self.cwd = Path.cwd()
        self.output = output_file
        self.dpi = dpi
        self.bgcolor = self._translate_color(bgcolor)
        self.fgcolor = self._translate_color(fgcolor)
        self.file_base, self.file_extension = os.path.splitext(output_file)
        self.tex = tex

    def convert(self):
        """Run the whole pipeline inside a temporary working directory."""
        previous_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as tmp_dir:
            os.chdir(tmp_dir)
            try:
                self.write_latex_file()
                self.tex_to_pdf()
                self.pdf_to_image()
            finally:
                # Leave the temp directory before it is deleted; otherwise the
                # process is stranded in a directory that no longer exists.
                os.chdir(previous_dir)

    def write_latex_file(self):
        """Write the LaTeX document to ``<file_base>.tex`` in the cwd."""
        parts = [
            "\\documentclass{article}",
            "\\thispagestyle{empty}",
            "\\usepackage{amsmath,amssymb,amsfonts,amsthm}",
            "\\usepackage{xcolor}",
            "\\definecolor{fgcolor}{RGB}{%s, %s, %s}" % self.fgcolor,
            "\\definecolor{bgcolor}{RGB}{%s, %s, %s}" % self.bgcolor,
            "\\begin{document}",
            "\\color{fgcolor}",
            "\\pagecolor{bgcolor}",
            "\\begin{eqnarray*}",
            self.tex,
            "\\end{eqnarray*}",
            "\\end{document}",
        ]
        with open(self.file_base + ".tex", "w", encoding="utf-8") as f:
            f.write("".join(parts))

    def tex_to_pdf(self):
        """Compile the document and crop the PDF to its content.

        Raises:
            LatexExpection: If pdflatex exits with a non-zero status.
            CropException: If pdfcrop exits with a non-zero status.
        """
        rc = subprocess.call(
            ["pdflatex", "-halt-on-error", self.file_base + ".tex"],
            stderr=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
        )
        if rc != 0:
            raise LatexExpection("Invalid Latex Expression")

        rc = subprocess.call(
            ["pdfcrop", self.file_base + ".pdf"],
            stderr=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
        )
        if rc != 0:
            raise CropException("Cannot crop pdf file")

        # Replace the uncropped PDF with the cropped one.
        pdf = str(self.file_base) + ".pdf"
        shutil.move(self.file_base + "-crop.pdf", str(Path.cwd() / pdf))

    def pdf_to_image(self):
        """Rasterize the first PDF page into the output image.

        The PDF itself is always moved next to the output; when rasterizing
        fails it is the only result and a notice is printed.
        """
        pdf = str(self.file_base) + ".pdf"
        image_name = str(self.file_base) + self.file_extension
        try:
            pages = convert_from_path(pdf, dpi=self.dpi)
            # Let Pillow derive the format from the file name; the upper-cased
            # extension is not always a valid format name (e.g. "JPG").
            pages[0].save(str(self.cwd / image_name))
        except Exception:
            print(
                'Unable to convert image (is "poppler" installed?). Falling back on PDF instead.'
            )
        finally:
            shutil.move(pdf, str(self.cwd / pdf))

    def _translate_color(self, color):
        """Convert ``#RRGGBB`` into an ``(red, green, blue)`` integer tuple.

        Raises:
            ValueError: If ``color`` is not exactly ``#`` plus six hex digits.
        """
        # Explicit check instead of ``assert`` (which ``python -O`` removes);
        # ``fullmatch`` also rejects trailing characters.
        if not isinstance(color, str) or not re.fullmatch(r"#[0-9A-Fa-f]{6}", color):
            raise ValueError(f"color must look like '#RRGGBB', got {color!r}")
        red, green, blue = int(color[1:3], 16), int(color[3:5], 16), int(color[5:7], 16)
        return red, green, blue


if __name__ == "__main__":
    converter = LatexConverter(
        r"\frac{\partial}{\partial z}",
        "test.png",
        dpi=100,
        bgcolor="#ffffff",
        fgcolor="#000000",
    )
    converter.convert()
}" + +img1, img2 = render(formula1, formula1) + +img1_np = Image.fromarray(img1) +img2_np = Image.fromarray(img2) + +diff = ImageChops.difference(img1_np, img2_np) + +diff.show() + +np.testing.assert_allclose(img1, img2, rtol=1e-3, atol=1e-5) +np.testing.assert_array_equal(img1, img2) + +print("ok") diff --git a/tests/test_template.py b/tests/test_template.py new file mode 100644 index 0000000..b4d6577 --- /dev/null +++ b/tests/test_template.py @@ -0,0 +1,5 @@ +from string import Template + +s = Template("$who 在 $do") +ts = s.substitute(who="张三", do="赏花") +print(ts)