-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_filings.py
73 lines (50 loc) · 1.85 KB
/
get_filings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
import json
from tqdm.auto import tqdm
from multiprocessing import Pool
import glob
import os
# Gather every path directly under the submissions/ directory and report
# how many were found before any processing starts.
all_submissions = glob.glob('submissions/*')
submission_count = len(all_submissions)
print("Number of files: ", submission_count)
# Top-level keys copied out of each submission JSON object.
columns = ['cik', 'entityType', 'sic', 'category']


def get_filings(submission):
    """Read one submission JSON file and return the fields named in ``columns``.

    Args:
        submission: Path of the JSON file to read.

    Returns:
        A dict with one entry per name in ``columns``. A key that is absent
        from the JSON object maps to ``None``. If the file cannot be opened,
        does not contain valid JSON, or its top-level value is not an object,
        every field is set to ``0`` so that a single bad file does not abort
        the whole run.
    """
    try:
        with open(submission, 'rb') as f:
            sub = json.load(f)
        return {key: sub.get(key) for key in columns}
    except (OSError, ValueError, AttributeError):
        # OSError: unreadable/missing path or a directory.
        # ValueError: malformed JSON or undecodable bytes.
        # AttributeError: top-level JSON value has no .get (list, number, ...).
        # KeyboardInterrupt/SystemExit are deliberately NOT caught here so a
        # worker process can still be interrupted.
        return {key: 0 for key in columns}
# if sub.get('filings'):
# filings = sub.get('filings')
# temp = pd.DataFrame(filings['recent'])
# if filings.get('files'):
# add_files_names = [f['name'] for f in filings['files']]
# for name in add_files_names:
# with open(f'subs/{name}', 'rb') as f:
# add_files = json.load(f)
# temp = pd.concat([temp, pd.DataFrame(add_files)])
# for col in columns:
# try:
# temp[col] = sub.get(col)
# except:
# continue
# temp.drop(
# columns=['acceptanceDateTime', 'act', 'fileNumber', 'filmNumber',
# 'items', 'size', 'isXBRL', 'isInlineXBRL', 'primaryDocDescription'],
# inplace=True
# )
# return temp
# else:
# return None
# except:
# return None
# Entry point: extract the selected fields from the submission files in
# parallel, drop duplicate rows and write the result to SIC_df.csv.
#
# The __main__ guard is required by multiprocessing on platforms that start
# workers by re-importing this module (spawn): without it each worker would
# try to create its own pool.
if __name__ == "__main__":
    print("<< START >>\n")
    with Pool(processes=os.cpu_count()) as p:
        # NOTE(review): only the first 100 submissions are processed --
        # confirm whether this cap is intentional or a debugging leftover.
        output = p.map(get_filings, all_submissions[:100])
        # join() raises ValueError on a pool that is still running, so the
        # pool must be closed first. map() has already returned all results.
        p.close()
        p.join()
    # filings_df = pd.concat(output)
    # filings_df = filings_df[filings_df["form"]=="10-K"]
    # One row per submission dict; identical rows are collapsed.
    filings_df = pd.DataFrame(output).drop_duplicates()
    print(filings_df.shape)
    filings_df.to_csv('SIC_df.csv', index=False)