-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstreamlit_app.py
192 lines (158 loc) · 7.2 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import streamlit as st
import pdfplumber
import pandas as pd
from numpy import nan
from io import BytesIO
from datetime import datetime, timezone, timedelta
def extract_data(pdf_path) -> list:
"""
Extracts raw data from pdf, outputs a list
"""
all_data = []
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
if len(table) > 0:
for row in table:
cleaned_row = [cell.strip() if cell else '' for cell in row]
all_data.append(cleaned_row)
return all_data
def process_data(all_data: list) -> pd.DataFrame:
"""
Agarra los datos crudos, pone las columnas correctas, parsea bien los nombres
y datos para luego ser correctamente casteados a float, luego llama a la fun
`checkear_y_asignar` para mover los datos desfazados y limpiar las filas
"""
df = pd.DataFrame(all_data, columns=["temp1", "temp2", "temp3", "temp4"])
df.columns = df.iloc[0]
df = df.drop(0).reset_index(drop=True)
df.columns = [name.replace("\n", " ") for name in df.columns]
df2 = df.copy()
df2["PRECIO PACK"] = df2["PRECIO PACK"].astype(str).str.replace("$", "").str.replace("\n", "").str.replace(".", "").str.replace(",", ".")
df2.loc[df2["PRECIO PACK"] == "", "PRECIO PACK"] = nan
df2["PRECIO PACK"] = pd.to_numeric(df2["PRECIO PACK"], errors='coerce')
df2["PRECIO UNITARIO"] = df2["PRECIO UNITARIO"].astype(str).str.replace("$", "").str.replace("\n", "").str.replace(".", "").str.replace(",", ".")
df2.loc[df2["PRECIO UNITARIO"] == "", "PRECIO UNITARIO"] = nan
df2["PRECIO UNITARIO"] = pd.to_numeric(df2["PRECIO UNITARIO"], errors='coerce')
df2 = df2[~df2.iloc[:, 0].str.contains('ARTÍCULO', na=False)].reset_index(drop=True)
df2_1 = df2.copy()
df2_1 = checkear_y_asignar(df2_1)
print(df2["PRECIO UNITARIO"].head())
return df2_1
def checkear_y_asignar(df: pd.DataFrame) -> pd.DataFrame:
"""
Busca en la columna articulo si la fila esta vacia o no y agarra el valor
de esa fila en la columna PRECIO PACK para pasarlo a la fila de arriba
donde deberia estar
luego llama la funcion `more_processing` para limpiar la fila vacia
"""
for index in df.index.to_list():
if (pd.isna(df.at[index, 'ARTÍCULO']) or df.at[index, 'ARTÍCULO'] == ''):
if pd.isna(df.at[index-1, 'PRECIO PACK']) or df.at[index-1, 'PRECIO PACK'] == '':
df.at[index-1, 'PRECIO PACK'] = df.at[index, 'PRECIO PACK']
# drop empty since they values were transfered
df_f = more_processing(df)
return df_f
def more_processing(df_t1: pd.DataFrame) -> pd.DataFrame:
"""
Limpia la fila con ARTICULO vacio ya que su valor fue extraido y ordenado
en la funcion `checkear_y_asignar`
"""
df_t2 = df_t1.copy()
df_t2 = df_t2.drop(df_t2[df_t2["ARTÍCULO"] == ""].index)
df_final = df_t2.reset_index(drop=True)
return df_final
def add_categories(df_final: pd.DataFrame) -> pd.DataFrame:
"""
Checkea que en la fila las columnas PRECIO PACK O CANT X PACK sean o nulas o vacias
de ser asi, agarra el articulo, si no esta vacia agrega ese articulo previamente
guardado y lo agrega como fila en la nueva columna Categoria
Una vez encontradas todas las categorias borra las filas que las contenian
en ultima instancia pone bien los data types a las columnas
"""
dff = df_final.copy()
# Inicializar la columna
dff['Categoria'] = None
# guardar la categoria que encontramos sola
current_category = None
# Iterar las filas y asignar la categoria que encontremos sola, de lo contrario agregarla a la nueva columna
for index, row in dff.iterrows():
if (pd.isnull(row['PRECIO PACK']) or row['PRECIO PACK'] == "") and (pd.isnull(row['CANT. X PACK']) or row['CANT. X PACK'] == ""):
current_category = row['ARTÍCULO']
else:
dff.at[index, 'Categoria'] = current_category ## using df.at since only requires 1 specific row to assing
## since df.iloc is more suitable for grouping rows
# Dropear las columnas de las categorias ya encontradas
dff = dff.dropna(subset=['PRECIO PACK', 'CANT. X PACK'])
# Reordenamos
dff = dff[['Categoria', 'ARTÍCULO', 'PRECIO PACK', 'CANT. X PACK', 'PRECIO UNITARIO']]
# Reseteamos index
dff2 = dff.reset_index(drop=True)
# Make sure of types
dff2['PRECIO PACK'] = pd.to_numeric(dff2['PRECIO PACK'])
dff2['PRECIO UNITARIO'] = pd.to_numeric(dff2['PRECIO UNITARIO'])
#dff2['CANT. X PACK'] = dff2['CANT. X PACK'].astype(int)
dff2['CANT. X PACK'] = pd.to_numeric(dff2['CANT. X PACK'])
dff2.head()
return dff2
def save_file(df_final: pd.DataFrame) -> BytesIO:
"""
Writes the data in memory instead of saving in locally
slighly styles the Excel adding spacing to columns
"""
output = BytesIO()
# writes to the bytes object
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
df_final.to_excel(writer, index=False, sheet_name='Bebidas')
workbook = writer.book
worksheet = writer.sheets['Bebidas']
for i, col in enumerate(df_final.columns):
max_len = max(df_final[col].astype(str).map(len).max(), len(col)) + 2
worksheet.set_column(i, i, max_len)
header_format = workbook.add_format({
'bold': True,
'text_wrap': True,
'valign': 'top',
'fg_color': '#D7E4BC',
'border': 1
})
for i, col in enumerate(df_final.columns):
worksheet.write(0, i, col, header_format)
# reset the pointer
output.seek(0)
return output
def main() -> None:
st.title("PDF a Excel")
if 'uploaded_file' not in st.session_state:
st.session_state.uploaded_file = None
uploaded_file = st.file_uploader("Elegi un PDF", type="pdf")
if uploaded_file is not None:
st.session_state.uploaded_file = uploaded_file
st.write("PDF cargado con exito!")
if st.session_state.uploaded_file is not None:
if st.button("Que empiece la fiesta"):
pdf_name = uploaded_file.name.strip(".pdf")
all_data = extract_data(uploaded_file)
df_t1 = process_data(all_data)
df_final= add_categories(df_t1) # con nueva columna
output_file = save_file(df_final)
# set timezone to ARG
AR_hour = -3
AR_offset = timedelta(hours=AR_hour)
AR_timezone = timezone(AR_offset)
now = datetime.now()
AR_now = now.astimezone(AR_timezone)
AR_now_final = AR_now.strftime("%d-%m-%Y_%Hh%Mm%Ss")
# format file
file_name = f"{pdf_name}-{AR_now_final}.xlsx"
st.write("PDF procesado con exito!")
st.write(df_final)
st.download_button(
label="Descargar Excel",
data=output_file,
file_name=file_name,
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
if __name__ == "__main__":
main()