Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create utilities module which is necessary to fix #104 #115

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
11 changes: 8 additions & 3 deletions cenpy/moe/replicate_table_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ def read_replicate_file(fname):
table = table.drop(["TBLID", "NAME", "ORDER", "CME", "TITLE"], axis=1)
table = table.pivot(index="GEOID", columns="variable")
table.columns.names = ["categories", "variables"]
# Standardize the names of the columns because the ACB's 2014 tables have
# lowercase titles while others are uppercase.
table = table.rename(columns = {"estimate":"ESTIMATE",
"moe": "MOE",
"se": "SE"})
return table


Expand Down Expand Up @@ -472,9 +477,9 @@ def apply_func(func, data, params={}):
Pandas 81 column dataframe, where the first column is the estimates and
the remaining columns are the replicates.
"""
estimates = func(data.estimate, **params)
estimates = func(data.ESTIMATE, **params)
# subset just the replicates
replicates = data.drop(["estimate", "moe", "SE"], axis=1, level=0)
replicates = data.drop(["ESTIMATE", "MOE", "SE"], axis=1, level=0)
# clean out unused column names
replicates.columns = replicates.columns.remove_unused_levels()
# apply the user function to each replicate
Expand All @@ -488,7 +493,7 @@ def apply_func(func, data, params={}):
]
rep_results = pd.concat(rep_results, axis=1, keys=replicates.columns.levels[0])
# cleanup
rep_results["estimate"] = estimates
rep_results["ESTIMATE"] = estimates
rep_results = rep_results.replace([np.inf, -np.inf], 0) # per census documentation
return rep_results

Expand Down
135 changes: 5 additions & 130 deletions cenpy/products.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from .utilities import _replace_missing
from .utilities import _fuzzy_match
from .utilities import _coerce
from .utilities import _can_int
from .remote import APIConnection
from .explorer import fips_table as _ft
from shapely import geometry
from fuzzywuzzy import fuzz
from warnings import warn
import geopandas
import pandas
Expand All @@ -16,8 +19,6 @@

__all__ = ["Decennial2010", "ACS"]

_ACS_MISSING = (-999999999, -888888888, -666666666, -555555555, -333333333, -222222222)


class _Product(object):
"""The fundamental building block to make pre-configured Census Products, like ACS or Decennial2010."""
Expand Down Expand Up @@ -852,7 +853,7 @@ def tables(self):
result = stems.drop("GEO", axis=0, errors="ignore")
self._stems = result
# keep around the main tables only if they're not crosstabs (ending in alphanumeric)
self._tables = result.loc[[ix for ix in result.index if _can_int(ix[-1])]]
self._tables = result.loc[[ix for ix in result.index if can_int(ix[-1])]]
return self._tables

@property
Expand Down Expand Up @@ -882,129 +883,3 @@ def crosstab_tables(self):
]
return self._crosstabs


#############
# UTILITIES #
#############


def _fuzzy_match(matchtarget, matchlist, return_table=False):
    """
    Conduct a fuzzy match with matchtarget, within the list of possible match candidates in matchlist.

    Parameters
    ----------
    matchtarget : str
        a string to be matched to a set of possible candidates
    matchlist : list of str
        a list (or iterable) containing strings we are interested in matching
    return_table : bool
        whether to return the full table of scored candidates, or to return only the single
        best match. If False (the default), only the best match is returned.

    Returns
    -------
    the best-matching row of the score table (a pandas Series with "target" and
    "score" fields), or, when return_table is True, a tuple of that row and the
    full score table sorted by score.

    Notes
    -----
    consult the docstring for Product.check_match for more information on how the actual matching
    algorithm works.
    """
    split = matchtarget.split(",")
    if len(split) == 2:
        target, state = split
    elif len(split) == 1:
        target = split[0]
    else:
        # Bugfix: format with `matchtarget`, not `target` — `target` is unbound
        # in this branch, so the original raised a NameError instead of the
        # intended AssertionError.
        raise AssertionError(
            "Uncertain place identifier {}. The place identifier should "
            'look something like "placename, state" or, for larger areas, '
            "like Combined Statistical Areas or Metropolitan Statistical Areas,"
            "placename1-placename2, state1-state2-state3".format(matchtarget)
        )

    # score every candidate against the lowercased, stripped placename
    table = pandas.DataFrame({"target": matchlist})
    table["score"] = table.target.apply(
        lambda x: fuzz.partial_ratio(target.strip().lower(), x.lower())
    )
    if len(split) == 1:
        # no state qualifier given: break ties if needed and return the winner
        if (table.score == table.score.max()).sum() > 1:
            ixmax, rowmax = _break_ties(matchtarget, table)
        else:
            ixmax = table.score.idxmax()
            rowmax = table.loc[ixmax]
        if return_table:
            return rowmax, table.sort_values("score")
        return rowmax

    # a state qualifier was given: restrict candidates to that state first
    in_state = table.target.str.lower().str.endswith(state.strip().lower())

    assert any(in_state), (
        "State {} is not found from place {}. "
        "Should be a standard Census abbreviation, like"
        " CA, AZ, NC, or PR".format(state, matchtarget)
    )
    table = table[in_state]
    if (table.score == table.score.max()).sum() > 1:
        ixmax, rowmax = _break_ties(matchtarget, table)
    else:
        ixmax = table.score.idxmax()
        rowmax = table.loc[ixmax]
    if return_table:
        return rowmax, table.sort_values("score")
    return rowmax


def _coerce(column, kind):
"""
Converty type of column to kind, or keep column unchanged
if that conversion fails.
"""
try:
return column.astype(kind)
except ValueError:
return column


def _replace_missing(column, missings=_ACS_MISSING):
    """
    Replace ACS missing-data sentinel values with numpy.nan.

    Parameters
    ----------
    column : pandas Series or DataFrame
        data possibly containing sentinel values. Note: modified in place.
    missings : iterable
        sentinel values to replace; defaults to the standard ACS sentinels
        in the module-level _ACS_MISSING tuple.

    Returns
    -------
    the column with sentinel values replaced by numpy.nan.
    """
    # Bugfix: iterate over the ``missings`` argument rather than always using
    # the module-level _ACS_MISSING, so callers can supply their own sentinels.
    for val in missings:
        column.replace(val, numpy.nan, inplace=True)
    return column


def _break_ties(matchtarget, table):
    """
    Resolve ties from the fuzzy matcher by re-scoring the tied candidates
    with a full-string ratio, which favors exact matches over substrings.
    """
    pieces = matchtarget.split(",")
    # only the placename portion (before the comma, if any) gets re-scored
    if len(pieces) == 2:
        target, state = pieces
    else:
        target = pieces[0]
    # NOTE: this adds a "score2" column to ``table`` in place, so the caller's
    # score table carries it afterwards as well.
    table["score2"] = table.target.apply(
        lambda candidate: fuzz.ratio(target.strip().lower(), candidate.lower())
    )
    among_winners = table[table.score == table.score.max()]
    double_winners = among_winners[among_winners.score2 == among_winners.score2.max()]
    ixmax = double_winners.score2.idxmax()
    ixmax_row = double_winners.loc[ixmax]
    if double_winners.shape[0] > 1:
        # still ambiguous after the second scoring pass: warn and pick one
        warn(
            "Cannot disambiguate placename {}. Picking the shortest, best "
            "matched placename, {}, from {}".format(
                matchtarget, ixmax_row.target, ", ".join(double_winners.target.tolist())
            )
        )
    return ixmax, ixmax_row


def _can_int(char):
"""check if a character can be turned into an integer"""
try:
int(char)
return True
except ValueError:
return False
3 changes: 2 additions & 1 deletion cenpy/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from . import tiger as tig
import math
from six import iteritems, PY3
from .utilities import _coerce

if PY3:
unicode = str
Expand Down Expand Up @@ -220,7 +221,7 @@ def query(self, cols=None, geo_unit="", geo_filter={}, apikey="", **kwargs):
df = pd.DataFrame().from_records(json_content[1:], columns=json_content[0])
assert all([col in df.columns for col in cols])
if convert_numeric:
df = df.infer_objects()
df[cols] = _coerce(df[cols], int)
if index is not "":
df.index = df[index]
return df
Expand Down
55 changes: 55 additions & 0 deletions cenpy/tests/test_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import unittest
import pandas
import numpy
from cenpy.utilities import _coerce as coerce
from cenpy.utilities import _replace_missing as replace_missing

class TestUtilities(unittest.TestCase):
    """Unit tests for the cenpy.utilities type-coercion and missing-value helpers."""

    def test_coerce(self):
        """Coercion casts castable columns, leaves others, and rejects non-pandas input."""
        # Make sure coerce works on Series and doesn't change them
        ser_orig = pandas.Series([3,4,5])
        ser_floats = coerce(ser_orig, cast_to = numpy.float64)
        # NOTE(review): .equals() is dtype-sensitive, so the int64 original and
        # the float64 result compare unequal — confirm that "doesn't change
        # them" above means the original Series is left unmutated.
        self.assertFalse(ser_orig.equals(ser_floats))

        # Make sure coerce changes what columns it can and doesn't alter
        # original data
        df_orig = pandas.DataFrame({"ints": [1,2,3],
                                    "floats": [0.1, 3.79, 14.9],
                                    "strings": ["fst", "sec", "thd"]})
        df_floats = coerce(df_orig, cast_to = numpy.float64)
        # Correct types of columns after coercion:
        float_dtypes = pandas.Series(["float64", "float64", "object"],
                                     index = ["ints", "floats", "strings"])
        # Make sure that the coerced dtypes are as expected
        self.assertFalse(df_orig.equals(df_floats))
        self.assertTrue(float_dtypes.equals(df_floats.dtypes))

        # Cast castable columns into strings -
        # Confusingly enough, pandas calls them "objects"
        df_objects = coerce(df_orig, cast_to = str)
        object_dtypes = pandas.Series(["object", "object", "object"],
                                      index = ["ints", "floats", "strings"])
        self.assertTrue(object_dtypes.equals(df_objects.dtypes))

        # Make sure an error gets raised if a non-Series/DataFrame object is used
        arr = numpy.zeros((2,2))
        # NOTE(review): coerce is called without cast_to here; if cast_to has no
        # default in cenpy.utilities, the TypeError would come from the missing
        # argument rather than the type check — verify against the signature.
        self.assertRaises(TypeError, coerce, arr)


    def test_replace_missing(self):
        """ACS sentinel values are replaced by NaN; non-pandas input is rejected."""
        # -888888888, -555555555, -333333333 are ACS missing-data sentinels
        df_orig = pandas.DataFrame({"ints": [-888888888,2,3],
                                    "floats": [-555555555, 3.79, -333333333]})
        df_replaced = replace_missing(df_orig)
        # Correct output after replacing missing values
        df_correct = pandas.DataFrame({"ints": [numpy.nan,2,3],
                                       "floats": [numpy.nan, 3.79, numpy.nan]})
        self.assertTrue(df_replaced.equals(df_correct))

        # Make sure an error is raised if non-Series/DataFrame types are used
        arr = numpy.zeros((2,2))
        self.assertRaises(TypeError, replace_missing, arr)


if __name__ == "__main__":
    unittest.main()
4 changes: 2 additions & 2 deletions cenpy/tiger.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ def query(self, **kwargs):
"""
layer_result = kwargs.pop("layer", None)
if isinstance(layer_result, str):
from .products import _fuzzy_match
from .utilities import _fuzzy_match as fuzzy_match

layer_result = _fuzzy_match(
layer_result = fuzzy_match(
layer_result, [f.__repr__() for f in self.layers]
).index
if layer_result is None:
Expand Down
Loading