Skip to content

Commit 45f6506

Browse files
committed
Tests pass on new database schema
1 parent 4a3a409 commit 45f6506

File tree

7 files changed

+57
-28
lines changed

7 files changed

+57
-28
lines changed

INSTALL.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Then, if you want to restore from a database backup:
3131
psql -U scraper -d taaraxtak -h localhost < [your-database-dump].sql
3232
```
3333

34+
Alternatively, you can initialize a fresh database. See Python setup, below.
3435

3536
### Python
3637

@@ -48,7 +49,7 @@ pip3 install -e .
4849

4950
3. Copy `config.example.py` to `config.py` and enter your Postgres credentials. See Config section below for info on
5051
logging options.
51-
4. Run `python3 create_tables.py` to set up the database. (Alternatively, restore the database from a recent DB dump).
52+
4. Run `python3 create_tables.py` to set up the database. (Alternatively, restore the database from a recent DB dump. See above.)
5253

5354
Now you can run `python3 collect.py` to start collecting data.
5455

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ For local development, start the collection server with
2222
python3 collect.py
2323
```
2424

25+
You can also run the data collection as separate one-time jobs:
26+
27+
- `python3 run.py w3techs`
28+
- `python3 run.py ooni`
29+
30+
This makes the data collection suitable for running from a system cron job, rather than as a standalone continuous process.
31+
2532
Then check out your Grafana instance (by default, https://localhost:3000).
2633

2734
To deploy a production environment, see [DEPLOY.md](DEPLOY.md)

create_tables.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,5 @@
3131
#
3232
# run
3333
#
34-
# w3techs()
35-
# ooni()
34+
w3techs()
35+
ooni()

src/w3techs/collect.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def collect(postgres_config: dict):
132132
# Compute gini coefficients
133133
for market in included_markets:
134134
logging.info(f'Computing gini for {market}')
135-
pop_weighted_gini = utils.population_weighted_gini(cur, market, pd.Timestamp(datetime.now()))
135+
pop_weighted_gini = utils.population_weighted_gini(cur, 'all', market, pd.Timestamp(datetime.now()))
136136
pop_weighted_gini.write_to_db(cur, conn)
137137

138138
logging.debug('W3Techs complete.')

src/w3techs/types.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def __init__(self,
3131
name: str,
3232
url: Optional[str],
3333
jurisdiction_alpha2: Optional[shared_types.Alpha2],
34+
measurement_scope: str,
3435
market: str,
3536
marketshare: float,
3637
time: pd.Timestamp):
@@ -50,6 +51,13 @@ def __init__(self,
5051
# we'll just store the str version
5152
self.jurisdiction_alpha2 = str(jurisdiction_alpha2)
5253

54+
assert(
55+
(measurement_scope == 'all') or
56+
(measurement_scope == '10k') or
57+
(measurement_scope == '1k')
58+
)
59+
self.measurement_scope = measurement_scope
60+
5361
assert(shared_utils.is_nonempty_str(market))
5462
self.market = market
5563

@@ -68,6 +76,7 @@ def create_table(
6876
name VARCHAR NOT NULL,
6977
url VARCHAR,
7078
jurisdiction_alpha2 CHAR(2),
79+
measurement_scope VARCHAR NOT NULL,
7180
market VARCHAR NOT NULL,
7281
marketshare NUMERIC NOT NULL,
7382
time TIMESTAMPTZ NOT NULL DEFAULT now()
@@ -85,12 +94,13 @@ def write_to_db(
8594
cur.execute(
8695
"""
8796
INSERT INTO provider_marketshare
88-
(name, url, jurisdiction_alpha2, market, marketshare, time)
97+
(name, url, jurisdiction_alpha2, measurement_scope, market, marketshare, time)
8998
VALUES
90-
(%s, %s, %s, %s, %s, %s)
99+
(%s, %s, %s, %s, %s, %s, %s)
91100
""", (self.name,
92101
self.url,
93102
self.jurisdiction_alpha2,
103+
self.measurement_scope,
94104
self.market,
95105
self.marketshare,
96106
self.time))
@@ -99,7 +109,7 @@ def write_to_db(
99109
return
100110

101111
def __str__(self):
102-
return f'{self.name} {self.url} {self.jurisdiction_alpha2} {self.market} {self.marketshare} {self.time}'
112+
return f'{self.name} {self.url} {self.jurisdiction_alpha2} {self.measurement_scope} {self.market} {self.marketshare} {self.time}'
103113

104114
def __repr__(self):
105115
return self.__str__()
@@ -114,12 +124,20 @@ class PopWeightedGini ():
114124
TODO - Check for SQL injection attacks.
115125
'''
116126
def __init__(self,
127+
measurement_scope: str,
117128
market: str,
118129
gini: float,
119130
time: pd.Timestamp):
120131
assert(shared_utils.is_nonempty_str(market))
121132
self.market = market
122133

134+
assert(
135+
(measurement_scope == 'all') or
136+
(measurement_scope == '10k') or
137+
(measurement_scope == '1k')
138+
)
139+
self.measurement_scope = measurement_scope
140+
123141
assert(is_float_0_1(float(gini)))
124142
self.gini = gini
125143

@@ -132,6 +150,7 @@ def create_table(
132150
conn: connection):
133151
cmd = '''
134152
CREATE TABLE pop_weighted_gini (
153+
measurement_scope VARCHAR NOT NULL,
135154
market VARCHAR NOT NULL,
136155
gini NUMERIC NOT NULL,
137156
time TIMESTAMPTZ NOT NULL DEFAULT now()
@@ -149,16 +168,16 @@ def write_to_db(
149168
cur.execute(
150169
"""
151170
INSERT INTO pop_weighted_gini
152-
(market, gini, time)
171+
(measurement_scope, market, gini, time)
153172
VALUES
154-
(%s, %s, %s)
155-
""", (self.market, self.gini, self.time))
173+
(%s, %s, %s, %s)
174+
""", (self.measurement_scope, self.market, self.gini, self.time))
156175
if commit:
157176
return conn.commit()
158177
return
159178

160179
def __str__(self):
161-
return f'{self.market} {self.gini} {self.time}'
180+
return f'{self.measurement_scope} {self.market} {self.gini} {self.time}'
162181

163182
def __repr__(self):
164183
return self.__str__()
@@ -171,10 +190,10 @@ def create_tables(cur: cursor, conn: connection):
171190

172191
# dummy data - just a demo
173192
ProviderMarketshare(
174-
'name', None, shared_types.Alpha2('CA'), 'ssl-certificate', 0.5, pd.Timestamp('2021-04-20')
193+
'name', None, shared_types.Alpha2('CA'), 'all', 'ssl-certificate', 0.5, pd.Timestamp('2021-04-20')
175194
).create_table(cur, conn)
176195

177196
# dummy data - just a demo
178197
PopWeightedGini(
179-
'ssl-certificate', 0.9, pd.Timestamp('2021-04-20')
198+
'all', 'ssl-certificate', 0.9, pd.Timestamp('2021-04-20')
180199
).create_table(cur, conn)

src/w3techs/utils.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def extract_from_row(market: str, time: pd.Timestamp, df_row: pd.Series) -> Prov
130130
# Once data is in this type, it *should* be trustworthy.
131131
# See /design-notes.md for more detail on this pattern.
132132
return ProviderMarketshare(
133-
str(name), str(url), juris, market, float(marketshare), time
133+
str(name), str(url), juris, 'all', market, float(marketshare), time
134134
)
135135

136136

@@ -144,25 +144,26 @@ def extract_from_row(market: str, time: pd.Timestamp, df_row: pd.Series) -> Prov
144144

145145
def to_df(db_rows) -> pd.DataFrame:
146146
# TODO put this in TYPES somehow?
147-
db_rows = pd.DataFrame(db_rows, columns=['name', 'url', 'jurisdiction_alpha2', 'market', 'marketshare', 'time'])
147+
db_rows = pd.DataFrame(db_rows, columns=['name', 'url', 'jurisdiction_alpha2', 'measurement_scope', 'market', 'marketshare', 'time'])
148148
return db_rows
149149

150150

151-
def fetch_rows(cur: cursor, market: str, date: pd.Timestamp) -> pd.DataFrame:
151+
def fetch_rows(cur: cursor, measurement_scope: str, market: str, date: pd.Timestamp) -> pd.DataFrame:
152152
# TODO why this window? a magic number.
153153
cur.execute(f'''
154154
SELECT * from provider_marketshare
155-
WHERE market = '{market}'
155+
WHERE measurement_scope = '{measurement_scope}'
156+
AND market = '{market}'
156157
AND time BETWEEN timestamp '{date}' - interval '24 hour' AND '{date}'
157158
''')
158159
return to_df(cur.fetchall())
159160

160161

161-
def fetch_by_jurisdiction(cur: cursor, market: str, date: pd.Timestamp) -> pd.DataFrame:
162+
def fetch_by_jurisdiction(cur: cursor, measurement_scope: str, market: str, date: pd.Timestamp) -> pd.DataFrame:
162163
'''
163164
Get a DataFrame mapping alpha2 codes to (mean) marketshares on a given date.
164165
'''
165-
rows = fetch_rows(cur, market, date)
166+
rows = fetch_rows(cur, measurement_scope, market, date)
166167
rows['marketshare'] = rows['marketshare'].astype(float)
167168
# in case we have the same name and jurisdiction repeated, we take the median marketshare
168169
rows = rows.groupby(['name', 'jurisdiction_alpha2']).median()
@@ -206,8 +207,8 @@ def weighted_gini(marketshares: pd.Series, population_shares: pd.Series) -> floa
206207
return gini(vs)
207208

208209

209-
def population_weighted_gini(cur: cursor, market: str, time: pd.Timestamp) -> Optional[PopWeightedGini]:
210-
by_juris = fetch_by_jurisdiction(cur, market, time)
210+
def population_weighted_gini(cur: cursor, measurement_scope: str, market: str, time: pd.Timestamp) -> Optional[PopWeightedGini]:
211+
by_juris = fetch_by_jurisdiction(cur, measurement_scope, market, time)
211212
# if there are no values, None
212213
if len(by_juris) == 0:
213214
return None
@@ -228,4 +229,4 @@ def population_weighted_gini(cur: cursor, market: str, time: pd.Timestamp) -> Op
228229
merged['marketshare'],
229230
merged[relevant_year],
230231
)
231-
return PopWeightedGini(market, g, time)
232+
return PopWeightedGini(measurement_scope, market, g, time)

test/w3techs-test.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def test_provider_marketshare_type(postgresdb):
6868
cur, conn = postgresdb
6969

7070
ex_ms = types.ProviderMarketshare(
71-
'Foo', None, shared_types.Alpha2('NL'), 'ssl-certificate', 0.5, pd.Timestamp('2021-04-20')
71+
'Foo', None, shared_types.Alpha2('NL'), 'all', 'ssl-certificate', 0.5, pd.Timestamp('2021-04-20')
7272
)
7373
ex_ms.write_to_db(cur, conn)
7474

@@ -81,14 +81,14 @@ def test_pop_weighted_gini_type(postgresdb):
8181
cur, conn = postgresdb
8282

8383
ex_g = types.PopWeightedGini(
84-
'ssl-certificate', 0.9, pd.Timestamp('2021-04-20')
84+
'all', 'ssl-certificate', 0.9, pd.Timestamp('2021-04-20')
8585
)
8686

8787
ex_g.write_to_db(cur, conn)
8888

8989
cur.execute('SELECT * FROM pop_weighted_gini')
9090
item = cur.fetchone()
91-
assert(item[0] == 'ssl-certificate')
91+
assert(item[1] == 'ssl-certificate')
9292

9393
#
9494
# utils tests
@@ -128,6 +128,7 @@ def test_compute_pop_weighted_gini(postgresdb):
128128
cur, conn = postgresdb
129129
res = utils.population_weighted_gini(
130130
cur,
131+
'all',
131132
'fake-market',
132133
pd.Timestamp('2021-01-20'),
133134
)
@@ -136,17 +137,17 @@ def test_compute_pop_weighted_gini(postgresdb):
136137
# add a provider marketshare
137138
# tiny netherlands has 50% of the world's market
138139
types.ProviderMarketshare(
139-
'Foo', None, shared_types.Alpha2('NL'), 'ssl-certificate',
140+
'Foo', None, shared_types.Alpha2('NL'), 'all', 'ssl-certificate',
140141
0.5, pd.Timestamp('2021-04-20')
141142
).write_to_db(cur, conn)
142143
# US has the rest
143144
types.ProviderMarketshare(
144-
'Foo', None, shared_types.Alpha2('US'), 'ssl-certificate',
145+
'Foo', None, shared_types.Alpha2('US'), 'all', 'ssl-certificate',
145146
0.5, pd.Timestamp('2021-04-20')
146147
).write_to_db(cur, conn)
147148

148149
res = utils.population_weighted_gini(
149-
cur, 'ssl-certificate', pd.Timestamp('2021-04-20')
150+
cur, 'all', 'ssl-certificate', pd.Timestamp('2021-04-20')
150151
)
151152
# should result in a gini of 0.99
152153
assert(round(res.gini, 2) == 0.99)

0 commit comments

Comments
 (0)