Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow for more flexible microdata inputs #202

Merged
merged 19 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
shell: bash -l {0}
working-directory: ./
run: |
pytest --cov=./ --cov-report=xml
pytest -m 'not requires_pufcsv and not requires_tmdcsv' --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
if: matrix.os == 'ubuntu-latest' && contains(github.repository, 'PSLmodels/Tax-Brain')
uses: codecov/codecov-action@v4
Expand Down
10 changes: 10 additions & 0 deletions RELEASES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Tax-Brain Release History


## 2024-06-10 Release 2.7.1

Last Merged Pull Request: [#202](https://github.com/PSLmodels/Tax-Brain/pull/202)

Changes in this release:

* Use of the [Tax Micro Data (TMD)](https://github.com/PSLmodels/tax-microdata-benchmarking) file: [#202](https://github.com/PSLmodels/Tax-Brain/pull/202)


## 2024-04-25 Release 2.7.0

Last Merged Pull Request: [#196](https://github.com/PSLmodels/Tax-Brain/pull/196)
Expand Down
42 changes: 36 additions & 6 deletions cs-config/cs_config/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
postprocess,
nth_year_results,
retrieve_puf,
retrieve_tmd,
)
from .outputs import create_layout, aggregate_plot
from taxbrain import TaxBrain, report
Expand All @@ -25,6 +26,9 @@
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/tmd.20210720.csv.gz"
)

CUR_PATH = os.path.abspath(os.path.dirname(__file__))

Expand Down Expand Up @@ -108,7 +112,6 @@ def run_model(meta_params_dict, adjustment):
behavior_mods = cs2tc.convert_behavior_adjustment(adjustment["behavior"])
user_mods = {"policy": policy_mods, "behavior": behavior_mods}
start_year = int(meta_params.year)
use_cps = meta_params.data_source == "CPS"
if meta_params.data_source == "PUF":
puf_df = retrieve_puf(
PUF_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
Expand All @@ -117,22 +120,45 @@ def run_model(meta_params_dict, adjustment):
if not isinstance(puf_df, pd.DataFrame):
raise TypeError("'puf_df' must be a Pandas DataFrame.")
fuzz = True
use_cps = False
sampling_frac = 0.05
sampling_seed = 2222
full_sample = puf_df
data_start_year = taxcalc.Records.PUFCSV_YEAR
weights = taxcalc.Records.PUF_WEIGHTS_FILENAME
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
if meta_params.data_source == "CPS":
elif meta_params.data_source == "TMD":
tmd_df = retrieve_tmd(
TMD_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
if tmd_df is not None:
if not isinstance(tmd_df, pd.DataFrame):
raise TypeError("'tmd_df' must be a Pandas DataFrame.")
fuzz = True
sampling_frac = 0.05
sampling_seed = 2222
full_sample = tmd_df
data_start_year = taxcalc.Records.TMDCSV_YEAR
weights = taxcalc.Records.TMD_WEIGHTS_FILENAME
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "CPS":
fuzz = False
use_cps = True
input_path = os.path.join(TCDIR, "cps.csv.gz")
# full_sample = read_egg_csv(cpspath) # pragma: no cover
sampling_frac = 0.03
sampling_seed = 180
full_sample = pd.read_csv(input_path)
data_start_year = taxcalc.Records.CPSCSV_YEAR
weights = taxcalc.Records.CPS_WEIGHTS_FILENAME
else:
raise ValueError(
f"Data source '{meta_params.data_source}' is not supported."
)

if meta_params.use_full_sample:
sample = full_sample
Expand All @@ -146,8 +172,12 @@ def run_model(meta_params_dict, adjustment):
tb = TaxBrain(
start_year,
end_year,
microdata=sample,
use_cps=use_cps,
microdata={
"data": sample,
"start_year": data_start_year,
"growfactors": None,
"weights": weights,
},
reform=policy_mods,
behavior=behavior_mods,
)
Expand Down
43 changes: 42 additions & 1 deletion cs-config/cs_config/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)

TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/tmd.20210720.csv.gz"
)


def random_seed(user_mods, year):
"""
Expand Down Expand Up @@ -376,7 +380,7 @@ def retrieve_puf(
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
"""
Function for retrieving the PUF from the OSPC S3 bucket
Function for retrieving the PUF from the S3 bucket
"""
s3_reader_installed = S3FileSystem is not None
has_credentials = (
Expand Down Expand Up @@ -405,3 +409,40 @@ def retrieve_puf(
f"s3_reader_installed={s3_reader_installed})"
)
return None


def retrieve_tmd(
    tmd_s3_file_location=TMD_S3_FILE_LOCATION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
    """
    Function for retrieving the TMD from the S3 bucket

    Sources are tried in this order:

    1. ``tmd_s3_file_location`` on S3, when a location is given, both
       credentials are not None, and ``S3FileSystem`` is available.
    2. ``tmd.csv.gz`` in the current working directory.
    3. ``tmd.csv`` in the current working directory.

    Parameters
    ----------
    tmd_s3_file_location: location of the TMD file on S3
    aws_access_key_id: access key used to open the S3 file
    aws_secret_access_key: secret key used to open the S3 file

    Returns
    -------
    Pandas DataFrame with the TMD data, or None (after issuing a warning)
    when none of the sources above is available.
    """
    s3_reader_installed = S3FileSystem is not None
    has_credentials = (
        aws_access_key_id is not None and aws_secret_access_key is not None
    )
    if tmd_s3_file_location and has_credentials and s3_reader_installed:
        print("Reading tmd from S3 bucket.", tmd_s3_file_location)
        # Use the credentials passed to this function (not the module-level
        # constants) so that caller-supplied keys are actually honored.
        fs = S3FileSystem(
            key=aws_access_key_id,
            secret=aws_secret_access_key,
        )
        with fs.open(tmd_s3_file_location) as f:
            # read_csv is called with its defaults, so the first row of the
            # file is used as the column header.
            # NOTE(review): no compression argument is passed here, unlike
            # the local tmd.csv.gz branch below -- confirm that a gzipped
            # S3 object is decompressed as expected.
            tmd_df = pd.read_csv(f)
        return tmd_df
    elif Path("tmd.csv.gz").exists():
        print("Reading tmd from tmd.csv.gz.")
        return pd.read_csv("tmd.csv.gz", compression="gzip")
    elif Path("tmd.csv").exists():
        print("Reading tmd from tmd.csv.")
        return pd.read_csv("tmd.csv")
    else:
        warnings.warn(
            f"TMD file not available (tmd_location={tmd_s3_file_location}, "
            f"has_credentials={has_credentials}, "
            f"s3_reader_installed={s3_reader_installed})"
        )
        return None
7 changes: 7 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[pytest]
testpaths =
taxbrain
cs-config/cs_config/tests
markers =
requires_pufcsv
requires_tmdcsv
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

with open("README.md", "r") as f:
long_description = f.read()
version = "2.7.0"
version = "2.7.1"
setuptools.setup(
name="taxbrain",
version=version,
Expand Down
16 changes: 4 additions & 12 deletions taxbrain/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def cli_core(
startyear,
endyear,
data,
usecps,
reform,
behavior,
assump,
Expand Down Expand Up @@ -91,12 +90,14 @@ def cli_core(
start_year=startyear,
end_year=endyear,
microdata=data,
use_cps=usecps,
reform=reform,
behavior=behavior,
assump=assump,
base_policy=baseline,
verbose=True,
corp_revenue=None,
corp_incidence_assumptions=None,
verbose=False,
stacked=False,
)
tb.run()

Expand Down Expand Up @@ -156,15 +157,6 @@ def cli_main():
),
default=None,
)
parser.add_argument(
"--usecps",
help=(
"If this argument is present, the CPS file included in "
"Tax-Calculator will be used for the analysis."
),
default=False,
action="store_true",
),
parser.add_argument(
"--reform",
help=("--reform should be a path to a JSON file."),
Expand Down
Loading
Loading