various updates (#111)

* Package updates (#109) * ActivitySim 0.9.2; Pandas 1.0 * Freeze ortools package below 7.5 * Add info about zero-person households (#106) This addresses the documentation needs arising from #104 * Fixes (#110) * ActivitySim 0.9.2; Pandas 1.0 * Freeze ortools package below 7.5 * fix issue #103 * fix issue #102 * Update setup.py (#112) Co-authored-by: Blake <[email protected]> Co-authored-by: Greg Macfarlane <[email protected]>
ActivitySim · Feb 21, 2020 · 9b29db4 · 9b29db4
1 parent 347e22e
commit 9b29db4
Show file tree

Hide file tree

Showing 29 changed files with 77 additions and 266 deletions.
diff --git a/docs/application_configuration.rst b/docs/application_configuration.rst
@@ -81,7 +81,7 @@ Seed sample
 
 As mentioned in the previous section, the seed sample is typically obtained from the ACS PUMS. One of the main requirements for the seed sample is that it should be representative of the modeling region. In the case of ACS PUMS, this can be ensured by selecting PUMAs representing the modeling region both demographically and geographically. PUMA boundaries may not perfectly line up against the modeling region boundaries and overlaps are possible. Each sub-seed geography must be assigned to a Seed geography, and each Seed geography must be assigned to a Meta geography.
 
-The seed sample must contain all of the specified control variables, as well as any variables that are needed for the travel model but not specified as controls. For population groups that use completely separate, non-overlapping controls, such as residential population and group-quarter population, separate seed samples are prepared. PopulationSim can be set up and run separately for each population segment using the same geographic system. The outputs from each run can be combined into a unified synthetic population as a post processing step.
+The seed sample must contain all of the specified control variables, as well as any variables that are needed for the travel model but not specified as controls. For population groups that use completely separate, non-overlapping controls, such as residential population and group-quarter population, separate seed samples are prepared. In the ACS PUMS datasets, it is possible to have zero-person households in the raw data table (``NP = 0``); these records must be filtered from the seed data. PopulationSim can be set up and run separately for each population segment using the same geographic system. The outputs from each run can be combined into a unified synthetic population as a post processing step.
 
 Finally, the seed sample must include an initial weight field. The PopulationSim algorithm is designed to assign weights as close to the initial weight as possible to minimize the changes in distribution of uncontrolled variables. All the fields in the seed sample should be appropriately recoded to specify controls (see more details in the next section). Household-level population variables must be computed in advance (e.g., number of workers in each household) and monetary variables must be inflation adjusted to be consistent with the year of the control data (e.g., Household Income). The ACS PUMS data contain 3 or 5 years of household records, where each record's income is reported in the year in which it was collected. The ACS PUMS data include the rolling reference factor for the year and the inflation adjustment factor; these must be used to code each household's income to a common income year.
 

diff --git a/example_calm/run_populationsim.py b/example_calm/run_populationsim.py
@@ -12,7 +12,7 @@
 from activitysim.core.config import handle_standard_args
 from activitysim.core.tracing import print_elapsed_time
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 from populationsim import lp
 from populationsim import multi_integerizer
 

diff --git a/example_calm_repop/run_populationsim.py b/example_calm_repop/run_populationsim.py
@@ -12,7 +12,7 @@
 from activitysim.core.config import handle_standard_args
 from activitysim.core.tracing import print_elapsed_time
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 from populationsim import lp
 from populationsim import multi_integerizer
 

diff --git a/example_survey_weighting/run_populationsim.py b/example_survey_weighting/run_populationsim.py
@@ -12,7 +12,7 @@
 from activitysim.core.config import handle_standard_args
 from activitysim.core.tracing import print_elapsed_time
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 from populationsim import lp
 from populationsim import multi_integerizer
 

diff --git a/example_test/run_populationsim.py b/example_test/run_populationsim.py
@@ -11,7 +11,7 @@
 from activitysim.core.config import handle_standard_args
 
 from populationsim import steps
-from populationsim.util import setting
+from activitysim.core.config import setting
 from populationsim import lp
 from populationsim import multi_integerizer
 

diff --git a/populationsim/balancer.py b/populationsim/balancer.py
@@ -10,7 +10,7 @@
 
 import pandas as pd
 
-from .util import setting
+from activitysim.core.config import setting
 
 
 logger = logging.getLogger(__name__)

diff --git a/populationsim/integerizer.py b/populationsim/integerizer.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 import pandas as pd
-from .util import setting
+from activitysim.core.config import setting
 
 from .lp import get_single_integerizer
 from .lp import STATUS_SUCCESS

diff --git a/populationsim/lp.py b/populationsim/lp.py
@@ -4,7 +4,7 @@
 
 import logging
 
-from .util import setting
+from activitysim.core.config import setting
 from . import lp_cvx
 from . import lp_ortools
 

diff --git a/populationsim/lp_cvx.py b/populationsim/lp_cvx.py
@@ -5,7 +5,7 @@
 import logging
 
 import numpy as np
-from .util import setting
+from activitysim.core.config import setting
 
 logger = logging.getLogger(__name__)
 

diff --git a/populationsim/multi_integerizer.py b/populationsim/multi_integerizer.py
@@ -12,7 +12,7 @@
 import pandas as pd
 
 
-from .util import setting
+from activitysim.core.config import setting
 
 from .lp import get_simul_integerizer
 from .lp import STATUS_SUCCESS

diff --git a/populationsim/simul_balancer.py b/populationsim/simul_balancer.py
@@ -11,7 +11,7 @@
 
 import pandas as pd
 
-from .util import setting
+from activitysim.core.config import setting
 
 logger = logging.getLogger(__name__)
 

diff --git a/populationsim/steps/__init__.py b/populationsim/steps/__init__.py
@@ -1,7 +1,10 @@
-from __future__ import absolute_import
 # PopulationSim
 # See full license in LICENSE.txt.
 
+from __future__ import absolute_import
+
+from activitysim.core import inject as _inject
+
 from . import input_pre_processor
 from . import setup_data_structures
 from . import initial_seed_balancing
@@ -11,7 +14,14 @@
 from . import sub_balancing
 from . import expand_households
 from . import summarize
-from . import write_tables
 from . import write_synthetic_population
-
 from . import repop_balancing
+
+from activitysim.core.steps.output import write_data_dictionary
+from activitysim.core.steps.output import write_tables
+
+
+@_inject.injectable(cache=True)
+def preload_injectables():
+    _inject.add_step('write_data_dictionary', write_data_dictionary)
+    _inject.add_step('write_tables', write_tables)
diff --git a/populationsim/steps/expand_households.py b/populationsim/steps/expand_households.py
@@ -11,7 +11,7 @@
 from activitysim.core import pipeline
 from activitysim.core import inject
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 from .helper import get_control_table
 from .helper import get_weight_table
 

diff --git a/populationsim/steps/final_seed_balancing.py b/populationsim/steps/final_seed_balancing.py
@@ -8,7 +8,7 @@
 
 from activitysim.core import inject
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 
 from ..balancer import do_balancing
 from .helper import get_control_table

diff --git a/populationsim/steps/initial_seed_balancing.py b/populationsim/steps/initial_seed_balancing.py
@@ -8,7 +8,7 @@
 from activitysim.core import inject
 from activitysim.core import pipeline
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 
 from ..balancer import do_balancing
 

diff --git a/populationsim/steps/input_pre_processor.py b/populationsim/steps/input_pre_processor.py
@@ -7,12 +7,11 @@
 import pandas as pd
 import numpy as np
 
-from activitysim.core import inject
-from activitysim.core import pipeline
-
-from populationsim.util import data_dir_from_settings
-from populationsim.util import setting
-
+from activitysim.core import (
+    inject,
+    config,
+    input
+)
 
 logger = logging.getLogger(__name__)
 
@@ -30,16 +29,16 @@ def input_pre_processor():
     unless an alternate table_list name is specified as a model step argument 'table_list'.
     (This allows alternate/additional input files to be read for repop)
 
-    In the case of repop, this step is being run after an initial populationsim run has
-    completed, in which case the input_table_list may specify replacement tables.
+    In the case of repop, this step is being run after an initial run has completed,
+    in which case the input_table_list may specify replacement tables.
     (e.g. lowest geography controls that will replace the previous low controls dataframe.)
 
     See input_table_list in settings.yaml in the example folder for a working example
 
     +--------------+----------------------------------------------------------+
     | key          | description                                              |
     +==============+==========================================================+
-    | tablename    |  ame of pipeline table in which to store dataframe       |
+    | tablename    | name of pipeline table in which to store dataframe       |
     +--------------+----------------------------------------------------------+
     | filename     | name of csv file to read (in data_dir)                   |
     +--------------+----------------------------------------------------------+
@@ -54,85 +53,18 @@ def input_pre_processor():
 
     # alternate table list name may have been provided as a model argument
     table_list_name = inject.get_step_arg('table_list', default='input_table_list')
-    table_list = setting(table_list_name)
-    assert table_list is not None, "table list '%s' not in settings." % table_list_name
+    table_list = config.setting(table_list_name)
 
-    data_dir = data_dir_from_settings()
+    assert table_list is not None, "no table list '%s' found in settings." % table_list_name
+
+    logger.info('Using table list: %s' % table_list)
 
     for table_info in table_list:
 
-        tablename = table_info['tablename']
-
-        logger.info("input_pre_processor processing %s" % tablename)
-
-        # read the csv file
-        data_filename = table_info.get('filename', None)
-        data_file_path = os.path.join(data_dir, data_filename)
-        if not os.path.exists(data_file_path):
-            raise RuntimeError("input_pre_processor %s - input file not found: %s"
-                               % (tablename, data_file_path, ))
-
-        logger.info("Reading csv file %s" % data_file_path)
-        df = read_csv_with_fallback_encoding(data_file_path)
-
-        logger.info("input file columns: %s" % df.columns.values)
-
-        drop_columns = table_info.get('drop_columns', None)
-        if drop_columns:
-            for c in drop_columns:
-                logger.info("dropping column '%s'" % c)
-                del df[c]
-
-        # rename columns
-        column_map = table_info.get('column_map', None)
-        if column_map:
-            df.rename(columns=column_map, inplace=True)
-
-        # set index
-        index_col = table_info.get('index_col', None)
-        if index_col is not None:
-            if index_col in df.columns:
-                assert not df.duplicated(index_col).any()
-                df.set_index(index_col, inplace=True)
-            else:
-                df.index.names = [index_col]
-
-        # read expression file
-        # expression_filename = table_info.get('expression_filename', None)
-        # if expression_filename:
-        #     assert False
-        #     expression_file_path = os.path.join(configs_dir, expression_filename)
-        #     if not os.path.exists(expression_file_path):
-        #         raise RuntimeError("input_pre_processor %s - expression file not found: %s"
-        #                            % (table, expression_file_path, ))
-        #     spec = assign.read_assignment_spec(expression_file_path)
-        #
-        #     df_alias = table_info.get('df_alias', table)
-        #
-        #     locals_d = {}
-        #
-        #     results, trace_results, trace_assigned_locals \
-        #         = assign.assign_variables(spec, df, locals_d, df_alias=df_alias)
-        #     # for column in results.columns:
-        #     #     orca.add_column(table, column, results[column])
-        #
-        #     df = pd.concat([df, results], axis=1)
-
-        logger.info("adding table %s" % tablename)
+        tablename = table_info.get('tablename')
+        df = input.read_from_table_info(table_info)
+        logger.info('registering table %s' % tablename)
 
         # add (or replace) pipeline table
         repop = inject.get_step_arg('repop', default=False)
         inject.add_table(tablename, df, replace=repop)
-
-
-def read_csv_with_fallback_encoding(filepath):
-    """read a CSV to a pandas DataFrame using default utf-8 encoding,
-    but try alternate Windows-compatible cp1252 if unicode fails
-
-    """
-    try:
-        return pd.read_csv(filepath, comment='#')
-    except UnicodeDecodeError:
-        logger.warning(
-            "Reading %s with default utf-8 encoding failed, trying cp1252 instead", filepath)
-        return pd.read_csv(filepath, comment='#', encoding='cp1252')
diff --git a/populationsim/steps/integerize_final_seed_weights.py b/populationsim/steps/integerize_final_seed_weights.py
@@ -13,7 +13,7 @@
 from .helper import get_control_table
 from .helper import weight_table_name
 from .helper import get_weight_table
-from populationsim.util import setting
+from activitysim.core.config import setting
 
 logger = logging.getLogger(__name__)
 

diff --git a/populationsim/steps/repop_balancing.py b/populationsim/steps/repop_balancing.py
@@ -7,7 +7,7 @@
 
 from activitysim.core import inject
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 
 from .helper import get_control_table
 from .helper import weight_table_name

diff --git a/populationsim/steps/setup_data_structures.py b/populationsim/steps/setup_data_structures.py
@@ -12,21 +12,22 @@
 
 from activitysim.core import inject
 from activitysim.core import pipeline
+from activitysim.core import config
 
 from ..assign import assign_variable
 from .helper import control_table_name
 from .helper import get_control_table
 from .helper import get_control_data_table
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 
 logger = logging.getLogger(__name__)
 
 
-def read_control_spec(data_filename, configs_dir):
+def read_control_spec(data_filename):
 
     # read the csv file
-    data_file_path = os.path.join(configs_dir, data_filename)
+    data_file_path = config.config_file_path(data_filename)
     if not os.path.exists(data_file_path):
         raise RuntimeError(
             "initial_seed_balancing - control file not found: %s" % (data_file_path,))
@@ -269,7 +270,7 @@ def filter_households(households_df, persons_df, crosswalk_df):
 
 
 @inject.step()
-def setup_data_structures(settings, configs_dir, households, persons):
+def setup_data_structures(settings, households, persons):
     """
     Setup geographic correspondence (crosswalk), control sets, and incidence tables.
 
@@ -289,7 +290,6 @@ def setup_data_structures(settings, configs_dir, households, persons):
     ----------
     settings: dict
         contents of settings.yaml as dict
-    configs_dir: str
     households: pipeline table
     persons: pipeline table
 
@@ -314,7 +314,7 @@ def setup_data_structures(settings, configs_dir, households, persons):
     crosswalk_df = build_crosswalk_table()
     inject.add_table('crosswalk', crosswalk_df)
 
-    control_spec = read_control_spec(setting('control_file_name', 'controls.csv'), configs_dir)
+    control_spec = read_control_spec(setting('control_file_name', 'controls.csv'))
     inject.add_table('control_spec', control_spec)
 
     geographies = settings['geographies']
@@ -346,7 +346,7 @@ def setup_data_structures(settings, configs_dir, households, persons):
 
 
 @inject.step()
-def repop_setup_data_structures(configs_dir, households, persons):
+def repop_setup_data_structures(households, persons):
     """
     Setup geographic correspondence (crosswalk), control sets, and incidence tables for repop run.
 
@@ -360,7 +360,6 @@ def repop_setup_data_structures(configs_dir, households, persons):
 
     Parameters
     ----------
-    configs_dir : str
     households: pipeline table
     persons: pipeline table
 
@@ -379,7 +378,7 @@ def repop_setup_data_structures(configs_dir, households, persons):
 
     # replace control_spec
     control_file_name = setting('repop_control_file_name', 'repop_controls.csv')
-    control_spec = read_control_spec(control_file_name, configs_dir)
+    control_spec = read_control_spec(control_file_name)
 
     # repop control spec should only specify controls for lowest level geography
     assert control_spec.geography.unique() == [low_geography]

diff --git a/populationsim/steps/sub_balancing.py b/populationsim/steps/sub_balancing.py
@@ -12,7 +12,7 @@
 from activitysim.core import inject
 from activitysim.core import pipeline
 
-from populationsim.util import setting
+from activitysim.core.config import setting
 
 from .helper import get_control_table
 from .helper import weight_table_name