-
Notifications
You must be signed in to change notification settings - Fork 1
/
aggregate.rb
111 lines (87 loc) · 2.71 KB
/
aggregate.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
libdir = File.expand_path(File.join(File.dirname(__FILE__), 'lib/'))
srcdir = File.expand_path(File.join(File.dirname(__FILE__), 'src/'))
$LOAD_PATH.unshift(srcdir) unless $LOAD_PATH.include?(srcdir)
$LOAD_PATH.unshift(libdir) unless $LOAD_PATH.include?(libdir)
require 'csv'
require 'json'
require 'yaml'
require 'confstruct'
require 'debugger'
# Load the project settings: config.yaml is looked up in the directory that
# contains this script, parsed as YAML, and wrapped in a Confstruct
# configuration object.
config_path = File.expand_path('config.yaml', File.dirname(__FILE__))
config = Confstruct::Configuration.new(YAML.load_file(config_path))
# Command-line arguments:
#   ARGV[0] - name of the collection to process (required)
#   ARGV[1] - optional category; when present, `category` is non-nil and the
#             rest of the script restricts the aggregate to that category
collection, category = ARGV

# Without a collection there is nothing to process. `abort` prints the usage
# hint on stderr and exits with a non-zero status so that calling scripts can
# detect the failure.
if collection.nil?
  abort "Please pass a collection name to process. Available collections are in the config.yaml file."
end

# Tell the user that this run is limited to a single category.
puts "Generating aggregate for #{category} articles only." if category
# Collect every article JSON file of the requested collection; the articles
# directory is resolved relative to this script. The list is sorted so the row
# order of the generated CSV files is deterministic (Dir.glob only returns
# sorted results by default on Ruby >= 3.0).
gvarticles = Dir.glob(File.join(File.dirname(__FILE__), "articles/#{collection}/*.json")).sort

# Date stamp (year_month_day) used as the prefix of both output file names.
time = Time.now.strftime('%Y_%m_%d')

# Optional "-<category>" suffix appended to the output file names when a
# category filter is active.
cat_name = category.nil? ? "" : "-#{category}"

# NOTE: these output paths are relative to the current working directory,
# whereas the articles above are located relative to this script.
output_all = File.open("assets/#{time}_global_voices_#{collection}_all#{cat_name}.csv", "wb")
output_just_names = File.open("assets/#{time}_global_voices_#{collection}_names#{cat_name}.csv", "wb")

# Header row of each CSV file.
output_all.write("id,pubdate,byline,gender_by_pronoun,gender_by_byline,wc,pc\n")
output_just_names.write("name,likely_byline_gender\n")

# Lines that have already been written; used to skip exact duplicates.
cache = {}
# Returns true when +article+ belongs in the aggregate: always when no
# +category+ filter is given, otherwise only when the article's "categories"
# entry contains +category+. An article without "categories" never matches.
# NOTE(review): assumes "categories" is an array of category names - confirm
# against the article JSON.
def article_selected?(article, category)
  category.nil? || Array(article["categories"]).include?(category)
end

# Builds the two CSV lines for +article+ (a parsed article Hash).
#
# Returns a two-element array:
#   [0] the full row: id, pub_date, byline, pronoun gender result,
#       byline gender result, word count, paragraph count
#   [1] the names row: byline, byline gender result
#
# CSV.generate_line quotes a field only when it contains a comma, quote or
# line break, so such bylines no longer corrupt the column layout while plain
# values are written exactly as before. Both lines end with "\n".
def aggregate_rows(article)
  byline = article["byline"]
  byline_gender = article["metrics"]["byline_gender"]["result"]
  # word count: number of entries in the article's token list
  word_count = article["decompositions"]["tokens"].length
  # paragraph count: number of newline-separated chunks in the body
  paragraph_count = article["body"].split("\n").length

  full_row = [
    article["id"],
    article["pub_date"],
    byline,
    article["metrics"]["pronouns"]["result"],
    byline_gender,
    word_count,
    paragraph_count
  ]
  [CSV.generate_line(full_row), CSV.generate_line([byline, byline_gender])]
end

# Writes +line+ to +out+ unless that exact line is already recorded in +seen+,
# and records it afterwards.
def write_unique(out, line, seen)
  return if seen[line]

  out.write(line)
  seen[line] = true
end

# Aggregate every article that has a byline (and matches the category filter,
# if any) into the two CSV files, skipping exact duplicate lines.
begin
  gvarticles.each do |path|
    # File.read closes the file itself, so no handle stays open per article.
    article = JSON.parse(File.read(path))
    next unless article["byline"] && article_selected?(article, category)

    line_all, line_names = aggregate_rows(article)
    write_unique(output_all, line_all, cache)
    write_unique(output_just_names, line_names, cache)
  end
ensure
  # Closing flushes pending output; doing it in `ensure` keeps the rows written
  # so far even when an article fails to parse.
  [output_all, output_just_names].each(&:close)
end