-
Notifications
You must be signed in to change notification settings - Fork 0
/
to_plain_text.py
84 lines (65 loc) · 2.22 KB
/
to_plain_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import sys

import pypandoc
import PyPDF2
from pptx import Presentation
def todo():
    """Placeholder result for functionality that has not been written yet.

    Returns:
        The string "Not yet implemented.", which is also printed by
        print_return_error.
    """
    message = "Not yet implemented."
    return print_return_error(message)
def print_return_error(err):
    """Print ``err`` to standard output and hand the same object back.

    Returning the value lets callers report a problem and return it in a
    single statement: ``return print_return_error(...)``.

    Args:
        err: The error value (typically a message string) to report.

    Returns:
        ``err`` itself, unchanged.
    """
    print(err)
    return err
def with_pandoc(filepath):
    """Convert the document at ``filepath`` to plain text through pypandoc.

    Args:
        filepath: Path of the document to convert.

    Returns:
        The text produced by ``pypandoc.convert_file(filepath, 'plain')``.
        If the conversion raises any exception, the error message is printed
        and that message string is returned instead (no exception escapes).
    """
    try:
        converted = pypandoc.convert_file(filepath, 'plain')
    except Exception:
        # Best-effort conversion: every failure is reported as a message
        # rather than propagated to the caller.
        return print_return_error(f"Error converting {filepath} to plain text.")
    return converted
def from_docx(filepath):
    """Extract plain text from a .docx file by delegating to with_pandoc.

    Args:
        filepath: Path of the .docx document.

    Returns:
        Whatever with_pandoc returns for this path: the converted text, or
        its error message string if the conversion failed.
    """
    text = with_pandoc(filepath)
    return text
def from_doc(filepath):
    """Extract plain text from a .doc file by delegating to with_pandoc.

    Args:
        filepath: Path of the .doc document.

    Returns:
        Whatever with_pandoc returns for this path: the converted text, or
        its error message string if the conversion failed.
    """
    # NOTE(review): pandoc's list of input formats may not include the legacy
    # binary .doc format -- confirm this path can actually succeed.
    text = with_pandoc(filepath)
    return text
def from_pptx(filepath):
    """Extract the text of every slide in a PowerPoint (.pptx) file.

    Only shapes that report ``has_text_frame`` are read. Each paragraph
    becomes one output line: the text of its runs is concatenated, so a
    formatting change in the middle of a sentence (which starts a new run)
    no longer breaks the sentence across several lines. Paragraphs without
    any runs contribute nothing.

    Args:
        filepath: Path of the .pptx file, passed to ``Presentation``.

    Returns:
        A single string. Every paragraph line ends with a newline, and the
        text of consecutive slides is separated by one extra newline.

    Note:
        Exceptions raised while opening or reading the presentation are not
        caught here; they propagate to the caller.
    """
    presentation = Presentation(filepath)
    slides_text = []
    for slide in presentation.slides:
        lines = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                runs = paragraph.runs
                if runs:
                    # One line per paragraph, not per run.
                    lines.append("".join(run.text for run in runs) + "\n")
        slides_text.append("".join(lines))
    return "\n".join(slides_text)
def from_pdf(filepath):
    """Extract the text of every page of a PDF file.

    The reader class and the per-page extraction method are looked up by
    their current PyPDF2 names first (``PdfReader`` / ``extract_text``) and
    fall back to the legacy names (``PdfFileReader`` / ``extractText``).
    The legacy names raise in PyPDF2 3.x, which -- combined with the broad
    ``except`` below -- would otherwise make every PDF silently come back as
    an error message.

    Args:
        filepath: Path of the PDF file.

    Returns:
        The concatenated page text with leading/trailing whitespace
        stripped. If anything fails (file cannot be opened, PDF cannot be
        parsed, ...), the error message is printed and that message string
        is returned instead; no exception escapes.
    """
    try:
        with open(filepath, 'rb') as file:
            reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
            reader = reader_cls(file)
            page_texts = []
            for page in reader.pages:
                # Prefer the new method name; only touch the legacy one
                # when the new one does not exist.
                extract = getattr(page, "extract_text", None) or page.extractText
                page_texts.append(extract() or "")
            return "".join(page_texts).strip()
    except Exception:
        # Best-effort conversion: every failure is reported as a message
        # rather than propagated to the caller.
        return print_return_error(f"Error converting {filepath} to plain text.")
def from_plain_text(filepath):
    """Read a text file and return its content.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, and a leading UTF-8 byte-order mark (as written by some
    Windows editors) is dropped instead of leaking into the text.

    Args:
        filepath: Path of the text file.

    Returns:
        The file content. If the file is empty, the error message is printed
        and that message string is returned instead.

    Note:
        Errors from opening or decoding the file are not caught here; they
        propagate to the caller.
    """
    with open(filepath, "r", encoding="utf-8-sig") as f:
        output = f.read()
    # read() always returns a str, so emptiness is the only failure signal.
    if not output:
        return print_return_error(f"Error gathering text from {filepath}.")
    return output
def get_content(filepath):
    """Return the plain-text content of a file, picking a converter by extension.

    The extension is taken from the final path component with
    ``os.path.splitext`` and matched case-insensitively, so ``REPORT.PDF``
    is handled like ``report.pdf``, dots in directory names are ignored, and
    a file that is merely *named* ``txt`` (no dot) is not mistaken for a
    text file.

    Supported extensions: txt, pdf, docx, doc, pptx.

    Args:
        filepath: Path of the file to convert. It is printed before
            conversion starts.

    Returns:
        Whatever the selected converter returns. For an unsupported (or
        missing) extension, the error message is printed and that message
        string is returned instead.
    """
    print(filepath)

    # splitext yields e.g. ".pdf"; strip the leading dot. The original
    # spelling is kept for the error message, the lower-cased one for lookup.
    ext = os.path.splitext(filepath)[1][1:]

    converters = {
        "txt": from_plain_text,
        "pdf": from_pdf,
        "docx": from_docx,
        "doc": from_doc,
        "pptx": from_pptx,
    }
    converter = converters.get(ext.lower())
    if converter is None:
        return print_return_error(f"Extension {ext} is not yet supported.")
    return converter(filepath)