Skip to content

Commit 4589e62

Browse files
committed
[u r] Add type and domain fields to TDRSourceSpec (#6426)
1 parent e358cdf commit 4589e62

File tree

15 files changed

+95
-42
lines changed

15 files changed

+95
-42
lines changed

UPGRADING.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ have too many entries in this file.
2323
#6426 Clean-up and generalize TDR source specs
2424
==============================================
2525

26-
The "snapshot/" string has been removed from TDR source specs.
26+
The "snapshot/" string has been removed from TDR source specs, and the ``type``
27+
and ``domain`` fields have been added.
2728
Update the ``mksrc`` function in ``environment.py`` for each of your personal
2829
deployments. As always, use the sandbox deployment's ``environment.py`` as a
2930
model when upgrading personal deployments.

deployments/anvilbox/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def mksrc(google_project,
4040
prefix = common_prefix(subgraphs)
4141
source = None if flags & pop else ':'.join([
4242
'tdr',
43+
'bigquery',
44+
'gcp',
4345
google_project,
4446
snapshot,
4547
prefix + '/0'

deployments/anvildev/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
2727
assert flags <= ma | pop
2828
source = None if flags & pop else ':'.join([
2929
'tdr',
30+
'bigquery',
31+
'gcp',
3032
google_project,
3133
snapshot,
3234
'/' + str(partition_prefix_length(subgraphs))

deployments/anvilprod/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
2727
assert flags <= ma | pop
2828
source = None if flags & pop else ':'.join([
2929
'tdr',
30+
'bigquery',
31+
'gcp',
3032
google_project,
3133
snapshot,
3234
'/' + str(partition_prefix_length(subgraphs))

deployments/dev/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
2727
assert flags <= ma | pop
2828
source = None if flags & pop else ':'.join([
2929
'tdr',
30+
'bigquery',
31+
'gcp',
3032
google_project,
3133
snapshot,
3234
'/' + str(partition_prefix_length(subgraphs))

deployments/hammerbox/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def mksrc(google_project,
4040
prefix = common_prefix(subgraphs)
4141
source = None if flags & pop else ':'.join([
4242
'tdr',
43+
'bigquery',
44+
'gcp',
4345
google_project,
4446
snapshot,
4547
prefix + '/0'

deployments/prod/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
2929
assert flags <= ma | pop
3030
source = None if flags & pop else ':'.join([
3131
'tdr',
32+
'bigquery',
33+
'gcp',
3234
google_project,
3335
snapshot,
3436
'/' + str(partition_prefix_length(subgraphs))

deployments/sandbox/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def mksrc(google_project,
4040
prefix = common_prefix(subgraphs)
4141
source = None if flags & pop else ':'.join([
4242
'tdr',
43+
'bigquery',
44+
'gcp',
4345
google_project,
4446
snapshot,
4547
prefix + '/0'

deployments/tempdev/environment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
2727
assert flags <= ma | pop
2828
source = None if flags & pop else ':'.join([
2929
'tdr',
30+
'bigquery',
31+
'gcp',
3032
google_project,
3133
snapshot,
3234
'/' + str(partition_prefix_length(subgraphs))

src/azul/terra.py

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections.abc import (
66
Sequence,
77
)
8+
import enum
89
import json
910
import logging
1011
from time import (
@@ -52,6 +53,7 @@
5253
cache,
5354
config,
5455
mutable_furl,
56+
reject,
5557
require,
5658
)
5759
from azul.auth import (
@@ -91,37 +93,67 @@
9193
log = logging.getLogger(__name__)
9294

9395

96+
class SourceType(enum.Enum):
97+
bigquery = 'bigquery'
98+
parquet = 'parquet'
99+
100+
101+
class SourceDomain(enum.Enum):
102+
gcp = 'gcp'
103+
azure = 'azure'
104+
105+
94106
@attrs.frozen(kw_only=True)
95107
class TDRSourceSpec(SourceSpec):
108+
type: SourceType
109+
domain: SourceDomain
96110
subdomain: str
97111
name: str
98112

99113
@classmethod
100114
def parse(cls, spec: str) -> 'TDRSourceSpec':
101115
"""
102116
Construct an instance from its string representation, using the syntax
103-
'tdr:{subdomain}:{name}:{prefix}' ending with an optional
117+
'tdr:{type}:{domain}:{subdomain}:{name}:{prefix}' ending with an optional
104118
'/{partition_prefix_length}'.
105119
106-
>>> s = TDRSourceSpec.parse('tdr:foo:bar:/0')
120+
>>> s = TDRSourceSpec.parse('tdr:bigquery:gcp:foo:bar:/0')
107121
>>> s # doctest: +NORMALIZE_WHITESPACE
108122
TDRSourceSpec(prefix=Prefix(common='', partition=0),
123+
type=<SourceType.bigquery: 'bigquery'>,
124+
domain=<SourceDomain.gcp: 'gcp'>,
109125
subdomain='foo',
110126
name='bar')
111127
112128
>>> str(s)
113-
'tdr:foo:bar:/0'
129+
'tdr:bigquery:gcp:foo:bar:/0'
130+
131+
>>> TDRSourceSpec.parse('tdr:spam:gcp:foo:bar:/0')
132+
Traceback (most recent call last):
133+
...
134+
ValueError: 'spam' is not a valid SourceType
135+
136+
>>> TDRSourceSpec.parse('tdr:bigquery:eggs:foo:bar:/0')
137+
Traceback (most recent call last):
138+
...
139+
ValueError: 'eggs' is not a valid SourceDomain
114140
115-
>>> TDRSourceSpec.parse('tdr:foo:bar:n32/0')
141+
>>> TDRSourceSpec.parse('tdr:bigquery:gcp:foo:bar:n32/0')
116142
Traceback (most recent call last):
117143
...
118144
azul.uuids.InvalidUUIDPrefixError: 'n32' is not a valid UUID prefix.
119145
"""
120146
rest, prefix = cls._parse(spec)
121147
# BigQuery (and by extension the TDR) does not allow : or / in dataset names
122-
service, subdomain, name = rest.split(':')
148+
service, type, domain, subdomain, name = rest.split(':')
123149
assert service == 'tdr', service
150+
type = SourceType(type)
151+
reject(type == SourceType.parquet, 'Parquet sources are not yet supported')
152+
domain = SourceDomain(domain)
153+
reject(domain == SourceDomain.azure, 'Azure sources are not yet supported')
124154
self = cls(prefix=prefix,
155+
type=SourceType(type),
156+
domain=SourceDomain(domain),
125157
subdomain=subdomain,
126158
name=name)
127159
assert spec == str(self), spec
@@ -131,20 +163,22 @@ def __str__(self) -> str:
131163
"""
132164
The inverse of :meth:`parse`.
133165
134-
>>> s = 'tdr:foo:bar:/0'
166+
>>> s = 'tdr:bigquery:gcp:foo:bar:/0'
135167
>>> s == str(TDRSourceSpec.parse(s))
136168
True
137169
138-
>>> s = 'tdr:foo:bar:22/0'
170+
>>> s = 'tdr:bigquery:gcp:foo:bar:22/0'
139171
>>> s == str(TDRSourceSpec.parse(s))
140172
True
141173
142-
>>> s = 'tdr:foo:bar:22/2'
174+
>>> s = 'tdr:bigquery:gcp:foo:bar:22/2'
143175
>>> s == str(TDRSourceSpec.parse(s))
144176
True
145177
"""
146178
return ':'.join([
147179
'tdr',
180+
self.type.value,
181+
self.domain.value,
148182
self.subdomain,
149183
self.name,
150184
str(self.prefix)
@@ -157,18 +191,20 @@ def contains(self, other: 'SourceSpec') -> bool:
157191
"""
158192
>>> p = TDRSourceSpec.parse
159193
160-
>>> p('tdr:foo:bar:/0').contains(p('tdr:foo:bar:/0'))
194+
>>> p('tdr:bigquery:gcp:foo:bar:/0').contains(p('tdr:bigquery:gcp:foo:bar:/0'))
161195
True
162196
163-
>>> p('tdr:foo:bar:/0').contains(p('tdr:bar:bar:/0'))
197+
>>> p('tdr:bigquery:gcp:foo:bar:/0').contains(p('tdr:bigquery:gcp:bar:bar:/0'))
164198
False
165199
166-
>>> p('tdr:foo:bar:/0').contains(p('tdr:foo:baz:/0'))
200+
>>> p('tdr:bigquery:gcp:foo:bar:/0').contains(p('tdr:bigquery:gcp:foo:baz:/0'))
167201
False
168202
"""
169203
return (
170204
isinstance(other, TDRSourceSpec)
171205
and super().contains(other)
206+
and self.type == other.type
207+
and self.domain == other.domain
172208
and self.subdomain == other.subdomain
173209
and self.name == other.name
174210
)

0 commit comments

Comments
 (0)