Skip to content

Commit

Permalink
ENH semi-automatic parser (#141)
Browse files Browse the repository at this point in the history
* feat&fix(jsonParser): refactoring

* feat(jsonParser): PDF to CSV

* feat(data): CdL
  • Loading branch information
Gigi-G committed Nov 4, 2023
1 parent f13a586 commit 2e14586
Show file tree
Hide file tree
Showing 26 changed files with 9,733 additions and 310 deletions.
34 changes: 34 additions & 0 deletions jsonParser/semi-automatic/converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import aspose.pdf as pdf
import os
import click
from glob import glob

@click.command()
@click.option('--input', '-i', help='Input path with PDF files', required=True)
def main(input:str):
    """Convert every PDF file found directly inside a folder to CSV.

    Each ``<name>.pdf`` matched by the glob is passed to
    ``convert_PDF_to_CSV`` and written next to it as ``<name>.csv``.

    Args:
        input: Path of the folder holding the PDF files. The name mirrors
            the ``--input`` CLI option, which is why it shadows the builtin.

    Raises:
        FileNotFoundError: If ``input`` is not an existing folder.
    """
    if not os.path.isdir(input):
        raise FileNotFoundError(f"'{input}' is not a folder or does not exist")

    for file_pdf in glob(f"{input}/*.pdf"):
        # Swap only the trailing extension. str.replace(".pdf", ".csv") would
        # also rewrite any ".pdf" occurring earlier in the path (for example
        # inside a directory name) and send the output to the wrong place.
        output_file = os.path.splitext(file_pdf)[0] + ".csv"
        print(f"Converting {file_pdf} to {output_file}...")
        convert_PDF_to_CSV(file_pdf, output_file)

def convert_PDF_to_CSV(infile:str, outfile:str):
    """Render the PDF document at ``infile`` as a CSV file at ``outfile``.

    The conversion goes through the Excel exporter of the ``aspose.pdf``
    module (imported as ``pdf``) with its output format switched to CSV.

    Args:
        infile: Path of the PDF document to load.
        outfile: Path the CSV output is saved to.
    """
    # Excel save options, configured to emit Comma-Separated Values
    csv_options = pdf.ExcelSaveOptions()
    csv_options.format = pdf.ExcelSaveOptions.ExcelFormat.CSV

    # Load the source PDF and save it using the CSV options
    source_document = pdf.Document(infile)
    source_document.save(outfile, csv_options)

    print("Rendering process completed")

# Run the click command only when this file is executed as a script,
# so importing the module does not trigger a conversion.
if __name__ == "__main__":
    main()
66 changes: 52 additions & 14 deletions jsonParser/semi-automatic/parse_election_results.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,39 @@
import csv
import json
import click
import os
import csv
from typing import List
from glob import glob

@click.command()
@click.option('--input', '-i', help='Input CSV file or path with CSV files', required=True)
@click.option('--output', '-o', help='Output JSON file or folder', required=True)
def main(input:str, output:str):
    """Convert one CSV file, or every CSV file of a folder, to JSON.

    When ``input`` is a folder, its ``*.csv`` files are used; if there are
    none, the ``*.csv`` files one level below it are used instead. In that
    mode ``output`` is treated as a folder and each ``<name>.csv`` becomes
    ``<output>/<name>.json``. Otherwise ``input`` and ``output`` are taken as
    a single CSV file and a single JSON file.

    Args:
        input: CSV file, or folder containing CSV files.
        output: JSON file (single-file mode) or destination folder.

    Raises:
        FileNotFoundError: If ``input`` does not exist, or if it is a folder
            in which no CSV file is found.
    """
    check_file_exists(input)

    if os.path.isdir(input):
        input_files = glob(input + "/*.csv")
        if not input_files:
            # Nothing at the top level: look one directory deeper.
            input_files = glob(input + "/*/*.csv")
        if not input_files:
            raise FileNotFoundError(f"No CSV files found in '{input}'")

        # Make sure the destination folder exists before any file is written.
        os.makedirs(output, exist_ok=True)

        # basename/splitext work with any path separator and touch only the
        # trailing extension, unlike split("/") and replace(".csv", ".json").
        # NOTE(review): with the one-level-deeper fallback, CSV files that
        # share a name in different subfolders map to the same JSON path.
        output_files = [
            os.path.join(output, os.path.splitext(os.path.basename(csv_path))[0] + ".json")
            for csv_path in input_files
        ]
    else:
        input_files = [input]
        output_files = [output]

    # Distinct loop names keep the ``input``/``output`` parameters intact.
    for csv_path, json_path in zip(input_files, output_files):
        print(f"Creating JSON file '{json_path}' from CSV file '{csv_path}'...")
        create_json_file(csv_path, json_path)
        print("#"*50)

def create_json_file(input:str, output:str) -> None:
# Read the CSV file
with open(input, 'r', encoding='utf-8-sig') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=';')
csv_reader = csv.reader(csv_file, delimiter=',')
rows_list = create_list(csv_reader)

# Create the dictionary that will contain the data
Expand Down Expand Up @@ -44,13 +66,12 @@ def main(input:str, output:str):
# Save the JSON data to a file
with open(output, 'w', encoding='utf-8') as json_file:
json_file.write(json_output)

def check_file_exists(file_path:str) -> None:
    """Ensure that ``file_path`` points to an existing file or folder.

    Args:
        file_path: Path to validate.

    Raises:
        FileNotFoundError: If the path is neither an existing file nor an
            existing directory.
    """
    # Single combined check: a path is accepted when it is a file or a folder.
    if not (os.path.isfile(file_path) or os.path.isdir(file_path)):
        raise FileNotFoundError(f"File '{file_path}' not found")

def create_list(csv_reader:List[str]) -> List[str]:

"""Create a list of rows from the CSV file
Args:
Expand All @@ -60,10 +81,24 @@ def create_list(csv_reader:List[str]) -> List[str]:
list: The list of rows of the CSV file
"""
rows_list = []
for row in csv_reader:
# Replace \xa0 with a space in the entire row
row = [cell.replace('\xa0', ' ') for cell in row]
rows_list.append(row)
for line in csv_reader:
for i in range(len(line)):
line[i] = line[i].replace("\"", "").replace("\n", "").replace("\xa0", " ").replace("Evaluation Only. Created with Aspose.PDF. Copyright 2002-2023 Aspose Pty Ltd.", "")
if len(line) == 1 and (line[0] == "" or line[0] == "VOTI DI LISTA" or line[0] == "BIENNIO 2023/2025" or "ELEZIONI RAPPRESENTANTI" in line[0]):
continue
if line[0] == "VOTI DI LISTA" or line[0] == "BIENNIO 2023/2025" or "ELEZIONI RAPPRESENTANTI" in line[0]:
continue
line = [x.strip() for x in line if x.strip() != "" and "aequo" not in x.strip()]
if len(line) == 0:
continue
if "DIPARTIMENTO" in line[0]:
l = " ".join(line)
line = [l.split("-")[0].strip()]
if len(line) > 1:
if line[1] in line[0]:
line[0] = line[0].replace(line[1], "")
line.append(line[1])
rows_list.append(line)
return rows_list

def get_name_and_seats(rows_list:list, data:dict) -> list:
Expand All @@ -77,8 +112,10 @@ def get_name_and_seats(rows_list:list, data:dict) -> list:
list: The list of rows of the CSV file
"""
row = rows_list[0]
print(row)
data["dipartimento"] = str(row[0])
row = rows_list[1]
print(row)
data["seggi_da_assegnare"] = row[1]
rows_list = rows_list[4:]
return rows_list
Expand All @@ -100,6 +137,7 @@ def get_list_information(rows_list: list, data: dict) -> list:
if row[0].strip() == "TOTALE":
data["liste"].append({"totale": int(row[1].strip())})
break
print(row)
lista = {
"nome": str(row[0].strip()),
"seggi": {
Expand Down Expand Up @@ -142,7 +180,7 @@ def get_votation_information(rows_list:list, data:dict) -> list:
elif row[0].strip() == "VOTANTI":
data["votanti"] = {
"totali": int(row[1].strip()),
"percentuale": float(row[4].strip().replace(",", ".")),
"percentuale": float(row[3].strip().replace(",", ".")),
"seggio_n_telematico": int(row[-1].strip())
}
elif row[0].strip() == "TOTALE ELETTORI AVENTI DIRITTO":
Expand Down Expand Up @@ -176,10 +214,10 @@ def get_candidates_information(rows_list:list, data:dict) -> None:
"lista": list_name,
"voti": {
"totali": int(row[1].strip()),
"seggio_telematico": int(row[-4].strip())
"seggio_telematico": int(row[-1].strip())
}
}
if "ELETTO" in row[3].strip():
if "ELETTO" in row:
data["eletti"].append(candidate)
else:
data["non_eletti"].append(candidate)
Expand Down
119 changes: 119 additions & 0 deletions src/data/2023-2025/Corso di Laurea/Beni_culturali.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
{
"schede": {
"bianche": {
"totali": 5,
"seggio_n_telematico": 5
},
"nulle": {
"totali": 0
},
"contestate": {
"totali": 0
}
},
"liste": [
{
"nome": "Beni Culturali Disum",
"seggi": {
"seggi_pieni": "9",
"resti": "0,0",
"seggi_ai_resti": "0",
"seggi_totali": "9"
},
"voti": {
"totali": "127",
"seggio_telematico": "127"
}
},
{
"totale": 127
}
],
"eletti": [
{
"nominativo": "SANFILIPPO Lorenzo",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 34,
"seggio_telematico": 34
}
},
{
"nominativo": "PETRALIA Salvatore Giuseppe",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 34,
"seggio_telematico": 34
}
},
{
"nominativo": "CUSUMANO Francesca Maria",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 13,
"seggio_telematico": 13
}
},
{
"nominativo": "CINO Anastasia",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 2,
"seggio_telematico": 2
}
},
{
"nominativo": "RUGGIERI Edith Maria Gae",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 5,
"seggio_telematico": 5
}
},
{
"nominativo": "DI STEFANO Agnese Pia",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 2,
"seggio_telematico": 2
}
},
{
"nominativo": "ASSENNATO Concetta Ambra",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 8,
"seggio_telematico": 8
}
},
{
"nominativo": "LO PIERO William",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 16,
"seggio_telematico": 16
}
},
{
"nominativo": "BENTIVEGNA Riccardo",
"lista": "Beni Culturali Disum",
"voti": {
"totali": 9,
"seggio_telematico": 9
}
}
],
"non_eletti": [],
"dipartimento": "BENI CULTURALI-L-1-Laurea Triennale (D.M.270/2004)",
"seggi_da_assegnare": "9",
"quoziente": 14.111,
"votanti": {
"totali": 132,
"percentuale": 21.09,
"seggio_n_telematico": 132
},
"elettori": {
"totali": 626,
"seggio_n_telematico": 626
}
}
Loading

0 comments on commit 2e14586

Please sign in to comment.