-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathetl.py
More file actions
117 lines (91 loc) · 2.89 KB
/
etl.py
File metadata and controls
117 lines (91 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import csv
import json
import os
import argparse
def create_parser():
    """
    Build the command-line interface of the ETL script.
    :return: argparse.ArgumentParser expecting one positional 'output_file'
    """
    parser = argparse.ArgumentParser(
        description="ETL for all region on production and consumption",
    )
    # Single positional argument: the path the resulting JSON is written to.
    parser.add_argument(
        'output_file',
        help='Result json file will be written in this file',
    )
    return parser
def extract_data_source(file_path, delimiter='\t', encoding=None):
    """
    Read a delimited text file completely into memory.

    The file is opened with newline='' as the csv module requires, so that
    line endings embedded in quoted cells are handed to the parser untouched.

    :param file_path: path of the delimited (csv-like) file to read
    :param delimiter: one-character column separator; defaults to a tab
        character, which is what this function always used
    :param encoding: text encoding passed to open(); None keeps the platform
        default, which is what this function always used
    :return: list First dim is row, second one is column (every cell is a str)
    """
    print("Extracting {}".format(os.path.basename(file_path)))
    with open(file_path, newline='', encoding=encoding) as csvfile:
        # Materialise the rows while the file is still open: the reader is lazy.
        return list(csv.reader(csvfile, delimiter=delimiter))
def transform(data_prod, data_conso):
    """
    Merge the production and consumption tables into one record per year.

    Structure :
    [
        {
            "year": "1990",
            "regions": [
                {
                    "name": "ALSACE",
                    "prod": 3008.0,
                    "cons": 4323.0,
                    "diff": -1315.0,
                    "variation": 0
                },
                ...
            ]
        },
        ...
    ]
    "diff" is prod - cons. "variation" is the change of "diff" compared with
    the previous year column (0 for the first year).

    :param data_prod: production rows (list of rows, each a list of cells).
        Cells are looked up by row and column position, not by region name,
        so the layout must match data_conso.
    :param data_conso: consumption rows. The first row is the header whose
        cells from column 4 onwards are the year labels; column 0 of the
        other rows is the region name.
    :return: list of per-year dicts as described above; an empty list when
        data_conso has no rows or its header has no year column
    :raises ValueError: if a value cell cannot be converted to float
    :raises IndexError: if data_prod lacks a row or cell that data_conso has
    """
    print("Transforming data")
    # region's name is stored into 'col_region' column
    col_region = 0
    # beginning of value linked to a date
    col_begin_date = 4

    # Nothing to transform: without a header row there are no years at all.
    if not data_conso:
        return []

    # The header row and the last two rows are not regions.
    # NOTE(review): the two trailing rows are presumably totals/footer lines -
    # confirm against the data files.
    region_rows = data_conso[1:-2]

    years = []
    for offset, year in enumerate(data_conso[0][col_begin_date:]):
        col_data = col_begin_date + offset
        regions = []
        # start=1 because row 0 of both tables is the header.
        for row_data, region in enumerate(region_rows, start=1):
            cons = float(region[col_data])
            prod = float(data_prod[row_data][col_data])
            regions.append({
                "name": region[col_region],
                "prod": prod,
                "cons": cons,
                "diff": prod - cons,
            })
        years.append({
            "year": year,
            "regions": regions,
        })

    # A header without any year column yields no record to annotate.
    if not years:
        return years

    # The first year has no predecessor, so its variation is 0 by definition.
    for region in years[0]["regions"]:
        region["variation"] = 0
    # Pair every year with the one before it; regions share the same order
    # in every year because they all come from region_rows.
    for previous, current in zip(years, years[1:]):
        for before, region in zip(previous["regions"], current["regions"]):
            region["variation"] = region["diff"] - before["diff"]
    return years
def load(output_file, content):
    """
    Writes everything in the output_file
    :param output_file: file_path; created, or overwritten if it exists
    :param content: written as json (must be serialisable by json.dumps)
    :return: None
    :raises TypeError: if content is not JSON serialisable; the output file
        is left untouched in that case
    """
    print("Loading data into {}".format(output_file))
    # Serialise before opening the file: opening in "w" mode truncates it, so
    # a serialisation error must happen first to keep a previous result intact.
    payload = json.dumps(content)
    # json.dumps escapes non-ASCII characters by default, so the payload is
    # pure ASCII and the explicit encoding does not change the bytes written.
    with open(output_file, "w", encoding="utf-8") as output:
        # write(), not writelines(): the payload is one string, not a list of lines.
        output.write(payload)
if __name__ == '__main__':
    # Parse the command line first so a missing argument aborts before any I/O.
    cli_args = create_parser().parse_args()
    print(cli_args)

    # Extract: both source tables are read from paths relative to the
    # current working directory, consumption first.
    consumption_rows = extract_data_source('datasource/conso_totale_petrole.csv')
    production_rows = extract_data_source('datasource/prod_totale_petrole.csv')

    # Transform, then load the result into the file named on the command line.
    load(cli_args.output_file, transform(production_rows, consumption_rows))