Skip to content

Commit a75499d

Browse files
authored
feat: local inference (#125)
Splits partition_pdf into two paths: one used for local inference when url is None, and another for inference via the API when url is a string.
1 parent 17045ae commit a75499d

File tree

14 files changed

+325
-35
lines changed

14 files changed

+325
-35
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.3.5-dev6
1+
## 0.3.5
22

3+
* Add support for local inference
34
* Add new pattern to recognize plain text dash bullets
45
* Add test for bullet patterns
56
* Fix for `partition_html` that allows for processing `div` tags that have both text and child

Makefile

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ install-base: install-base-pip-packages install-nltk-models
1717

1818
## install: installs all test, dev, and experimental requirements
1919
.PHONY: install
20-
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface
20+
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference
2121

2222
.PHONY: install-ci
23-
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface
23+
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface install-unstructured-inference
2424

2525
.PHONY: install-base-pip-packages
2626
install-base-pip-packages:
@@ -49,6 +49,18 @@ install-dev:
4949
install-build:
5050
pip install -r requirements/build.txt
5151

52+
.PHONY: install-unstructured-inference
53+
install-unstructured-inference:
54+
pip install -r requirements/local-inference.txt
55+
56+
.PHONY: install-detectron2
57+
install-detectron2:
58+
pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
59+
60+
## install-local-inference: installs requirements for local inference
61+
.PHONY: install-local-inference
62+
install-local-inference: install install-unstructured-inference install-detectron2
63+
5264
## pip-compile: compiles all base/dev/test requirements
5365
.PHONY: pip-compile
5466
pip-compile:
@@ -61,6 +73,7 @@ pip-compile:
6173
pip-compile requirements/dev.in
6274
pip-compile requirements/test.in
6375
pip-compile requirements/build.in
76+
pip-compile requirements/local-inference.in
6477
# NOTE(robinson) - docs/requirements.txt is where the GitHub action for building
6578
# sphinx docs looks for additional requirements
6679
cp requirements/build.txt docs/requirements.txt

docs/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#
2-
# This file is autogenerated by pip-compile with python 3.8
3-
# To update, run:
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
44
#
55
# pip-compile requirements/build.in
66
#

requirements/base.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#
2-
# This file is autogenerated by pip-compile with python 3.8
3-
# To update, run:
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
44
#
55
# pip-compile --output-file=requirements/base.txt
66
#

requirements/build.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#
2-
# This file is autogenerated by pip-compile with python 3.8
3-
# To update, run:
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
44
#
55
# pip-compile requirements/build.in
66
#

requirements/dev.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#
2-
# This file is autogenerated by pip-compile with python 3.8
3-
# To update, run:
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
44
#
55
# pip-compile requirements/dev.in
66
#

requirements/huggingface.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#
2-
# This file is autogenerated by pip-compile with python 3.8
3-
# To update, run:
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
44
#
55
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
66
#

requirements/local-inference.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
unstructured-inference>=0.2.1

requirements/local-inference.txt

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
#
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
4+
#
5+
# pip-compile requirements/local-inference.in
6+
#
7+
antlr4-python3-runtime==4.9.3
8+
# via omegaconf
9+
anyio==3.6.2
10+
# via starlette
11+
certifi==2022.12.7
12+
# via requests
13+
cffi==1.15.1
14+
# via cryptography
15+
charset-normalizer==2.1.1
16+
# via
17+
# pdfminer-six
18+
# requests
19+
click==8.1.3
20+
# via uvicorn
21+
contourpy==1.0.6
22+
# via matplotlib
23+
cryptography==39.0.0
24+
# via pdfminer-six
25+
cycler==0.11.0
26+
# via matplotlib
27+
effdet==0.3.0
28+
# via layoutparser
29+
fastapi==0.88.0
30+
# via unstructured-inference
31+
filelock==3.9.0
32+
# via huggingface-hub
33+
fonttools==4.38.0
34+
# via matplotlib
35+
h11==0.14.0
36+
# via uvicorn
37+
huggingface-hub==0.11.1
38+
# via
39+
# timm
40+
# unstructured-inference
41+
idna==3.4
42+
# via
43+
# anyio
44+
# requests
45+
iopath==0.1.10
46+
# via layoutparser
47+
kiwisolver==1.4.4
48+
# via matplotlib
49+
layoutparser[layoutmodels,tesseract]==0.3.4
50+
# via unstructured-inference
51+
matplotlib==3.6.2
52+
# via pycocotools
53+
numpy==1.24.1
54+
# via
55+
# contourpy
56+
# layoutparser
57+
# matplotlib
58+
# opencv-python
59+
# pandas
60+
# pycocotools
61+
# scipy
62+
# torchvision
63+
omegaconf==2.3.0
64+
# via effdet
65+
opencv-python==4.7.0.68
66+
# via layoutparser
67+
packaging==22.0
68+
# via
69+
# huggingface-hub
70+
# matplotlib
71+
# pytesseract
72+
pandas==1.5.2
73+
# via layoutparser
74+
pdf2image==1.16.2
75+
# via layoutparser
76+
pdfminer-six==20221105
77+
# via pdfplumber
78+
pdfplumber==0.7.6
79+
# via layoutparser
80+
pillow==9.4.0
81+
# via
82+
# layoutparser
83+
# matplotlib
84+
# pdf2image
85+
# pdfplumber
86+
# pytesseract
87+
# torchvision
88+
portalocker==2.6.0
89+
# via iopath
90+
pycocotools==2.0.6
91+
# via effdet
92+
pycparser==2.21
93+
# via cffi
94+
pydantic==1.10.4
95+
# via fastapi
96+
pyparsing==3.0.9
97+
# via matplotlib
98+
pytesseract==0.3.10
99+
# via layoutparser
100+
python-dateutil==2.8.2
101+
# via
102+
# matplotlib
103+
# pandas
104+
python-multipart==0.0.5
105+
# via unstructured-inference
106+
pytz==2022.7
107+
# via pandas
108+
pyyaml==6.0
109+
# via
110+
# huggingface-hub
111+
# layoutparser
112+
# omegaconf
113+
# timm
114+
requests==2.28.1
115+
# via
116+
# huggingface-hub
117+
# torchvision
118+
scipy==1.10.0
119+
# via layoutparser
120+
six==1.16.0
121+
# via
122+
# python-dateutil
123+
# python-multipart
124+
sniffio==1.3.0
125+
# via anyio
126+
starlette==0.22.0
127+
# via fastapi
128+
timm==0.6.12
129+
# via effdet
130+
torch==1.13.1
131+
# via
132+
# effdet
133+
# layoutparser
134+
# timm
135+
# torchvision
136+
torchvision==0.14.1
137+
# via
138+
# effdet
139+
# layoutparser
140+
# timm
141+
tqdm==4.64.1
142+
# via
143+
# huggingface-hub
144+
# iopath
145+
typing-extensions==4.4.0
146+
# via
147+
# huggingface-hub
148+
# iopath
149+
# pydantic
150+
# starlette
151+
# torch
152+
# torchvision
153+
unstructured-inference==0.2.1
154+
# via -r requirements/local-inference.in
155+
urllib3==1.26.13
156+
# via requests
157+
uvicorn==0.20.0
158+
# via unstructured-inference
159+
wand==0.6.10
160+
# via pdfplumber

requirements/test.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#
2-
# This file is autogenerated by pip-compile with python 3.8
3-
# To update, run:
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
44
#
55
# pip-compile requirements/test.in
66
#

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,5 +67,6 @@
6767
"torch",
6868
"transformers",
6969
],
70+
"local-inference": ["unstructured-inference>=0.2.1"],
7071
},
7172
)

test_unstructured/partition/test_pdf.py

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import pytest
2-
32
import requests
3+
from unittest import mock
44

55
import unstructured.partition.pdf as pdf
6+
import unstructured_inference.inference.layout as layout
67

78

89
class MockResponse:
@@ -38,40 +39,97 @@ def mock_successful_post(url, **kwargs):
3839
return MockResponse(status_code=200, response=response)
3940

4041

41-
def test_partition_pdf(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
42+
class MockPageLayout(layout.PageLayout):
43+
def __init__(self, number: int):
44+
pass
45+
46+
@property
47+
def elements(self):
48+
return [
49+
layout.LayoutElement(
50+
type="Title",
51+
coordinates=[(0, 0), (2, 2)],
52+
text="Charlie Brown and the Great Pumpkin",
53+
)
54+
]
55+
56+
57+
class MockDocumentLayout(layout.DocumentLayout):
58+
@property
59+
def pages(self):
60+
return [
61+
MockPageLayout(
62+
number=0,
63+
)
64+
]
65+
66+
67+
def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
4268
monkeypatch.setattr(requests, "post", mock_successful_post)
4369
monkeypatch.setattr(requests, "get", mock_healthy_get)
4470

45-
partition_pdf_response = pdf.partition_pdf(filename)
71+
partition_pdf_response = pdf._partition_pdf_via_api(filename)
4672
assert partition_pdf_response[0]["type"] == "Title"
4773
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
4874

4975

50-
def test_partition_pdf_raises_with_no_filename(
51-
monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
52-
):
76+
@pytest.mark.parametrize(
77+
"filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")]
78+
)
79+
def test_partition_pdf_local(monkeypatch, filename, file):
80+
monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout())
81+
monkeypatch.setattr(
82+
layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
83+
)
84+
85+
partition_pdf_response = pdf._partition_pdf_via_local(filename, file)
86+
assert partition_pdf_response[0].type == "Title"
87+
assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"
88+
89+
90+
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
91+
monkeypatch.setattr(requests, "post", mock_successful_post)
92+
monkeypatch.setattr(requests, "get", mock_healthy_get)
93+
94+
with pytest.raises(FileNotFoundError):
95+
pdf._partition_pdf_via_api(filename=None, file=None)
96+
97+
98+
def test_partition_pdf_local_raises_with_no_filename(monkeypatch):
5399
monkeypatch.setattr(requests, "post", mock_successful_post)
54100
monkeypatch.setattr(requests, "get", mock_healthy_get)
55101

56102
with pytest.raises(FileNotFoundError):
57-
pdf.partition_pdf(filename=None, file=None)
103+
pdf._partition_pdf_via_api(filename=None, file=None)
58104

59105

60-
def test_partition_pdf_raises_with_failed_healthcheck(
106+
def test_partition_pdf_api_raises_with_failed_healthcheck(
61107
monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
62108
):
63109
monkeypatch.setattr(requests, "post", mock_successful_post)
64110
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
65111

66112
with pytest.raises(ValueError):
67-
pdf.partition_pdf(filename=filename)
113+
pdf._partition_pdf_via_api(filename=filename)
68114

69115

70-
def test_partition_pdf_raises_with_failed_api_call(
116+
def test_partition_pdf_api_raises_with_failed_api_call(
71117
monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
72118
):
73119
monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
74120
monkeypatch.setattr(requests, "get", mock_healthy_get)
75121

76122
with pytest.raises(ValueError):
77-
pdf.partition_pdf(filename=filename)
123+
pdf._partition_pdf_via_api(filename=filename)
124+
125+
126+
@pytest.mark.parametrize(
127+
"url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
128+
)
129+
def test_partition_pdf(url, api_called, local_called):
130+
with mock.patch(
131+
"unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock()
132+
), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()):
133+
pdf.partition_pdf(filename="fake.pdf", url=url)
134+
assert pdf._partition_pdf_via_api.called == api_called
135+
assert pdf._partition_pdf_via_local.called == local_called

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.3.5-dev6" # pragma: no cover
1+
__version__ = "0.3.5" # pragma: no cover

0 commit comments

Comments
 (0)