Source code for itssutils.itssdata

"""
This module contains two classes: a :class:`RawITSSData` class for reading in raw ITSS
data, processing it, and plotting time series of basic counts; and an
:class:`ITSSMetrics` class for computing metrics for various groupings of the raw ITSS
data.

"""

import pandas as pd
import pickle

from .loader.load_raw import load_data, load_multiple_years
from .viz import timeseries, scatterplot, zhist, barplot, ratioplot
from .metrics import metrics, zscores, names


class RawITSSData(object):
    """Human-readable wrappers around raw ITSS data manipulations

    Attributes:
        raw_data_df (pd.DataFrame): raw dataframe. It is only set once
            :meth:`load_single_year` or :meth:`load_multiple_years` has
            been called.
    """

    def load_single_year(self, year, filename, fast=True, save=False):
        """Load a single year of raw data

        Args:
            year (int): The year of interest
            filename (str): The filename containing raw ITSS data
            fast (bool): Whether to load from pre-processed pickle file
            save (bool): Whether to save to a pickle file

        Returns:
            None

        Example:
            >>> rid.load_single_year(2016, '2016_ITSS_Data.txt')
        """
        self.raw_data_df = load_data(year, filename, fast=fast, save=save)

    def load_multiple_years(self, year_file_list, fast=True, save=False):
        """Load multiple years worth of raw data into a single object

        Args:
            year_file_list (list): List of tuples of the format (year, filename)
            fast (bool): Forwarded unchanged to the loader
            save (bool): Forwarded unchanged to the loader

        Returns:
            None

        Example:
            >>> yf_list = [(2012, '2012_ITSS_Data.txt'), (2013, '2013_ITSS_Data.txt')]
            >>> rid.load_multiple_years(yf_list)
        """
        self.raw_data_df = load_multiple_years(year_file_list, fast=fast, save=save)

    def get_collected_data(self):
        """Get the categories of data collected and processed

        Returns:
            pd.Series: the ``dtypes`` of the raw dataframe, i.e. one entry
            per column name giving that column's dtype.
        """
        return self.raw_data_df.dtypes

    def get_agencies(self):
        """Return a list of all reporting agencies

        Returns:
            list: the unique values of the ``AgencyName`` column
        """
        return self.raw_data_df.AgencyName.unique().tolist()

    def get_raw_dataframe(self):
        """Return the underlying dataframe"""
        return self.raw_data_df

    def _select_stops(self, agency=None, filter_cols=None, filter_values=True,
                      group=None):
        """Select (and optionally group) the stops that a timeseries should count

        Args:
            agency (str): Keep only rows whose ``AgencyName`` equals this value
            filter_cols (str or list): Column(s) to filter by
            filter_values (str or int or list): Accepted value(s); the same
                set of values is applied to every column in ``filter_cols``
            group (str or list of str): Column(s) to group by

        Returns:
            The ``DateOfStop`` column of the selected rows, indexed by
            ``StopDateTime``, or a pandas groupby object over ``group`` when
            a grouping was requested.
        """
        stops = self.raw_data_df.set_index('StopDateTime')

        # Select only a single agency if given
        if agency:
            stops = stops[stops.AgencyName == agency]

        # Filter all the stops by a given column/value(s) pair
        if filter_cols:
            if not isinstance(filter_values, list):
                filter_values = [filter_values]
            # A bare column name must be wrapped, otherwise the loop below
            # would iterate over the characters of the string.
            if not isinstance(filter_cols, (list, tuple)):
                filter_cols = [filter_cols]
            for col in filter_cols:
                stops = stops[stops[col].isin(filter_values)]

        # Group by a given category if chosen
        if group:
            if not isinstance(group, list):
                group = [group]
            return stops[group].groupby(group)
        return stops.DateOfStop

    def plot_timeseries(self, frequency='1W', agency=None, filter_cols=None,
                        filter_values=True, group=None, title='All Agencies',
                        savename=None, savecsv=None):
        """Plot a time series of the counts of raw traffic stop data.

        Args:
            frequency (str): the pandas-style sampling frequency; default 1W
            agency (str): The agency to filter by; default None
            filter_cols (str or list): The column(s) to filter by; default None
            filter_values (str or int or list): The selected value(s) to filter
                by within the filter column
            group (str or list of str): The column(s) to group by; default None
            title (str): Plot title. NOTE: an agency-specific title
                (``'<agency> (<frequency>)'``) is only generated when an
                agency is given AND ``title`` is passed as ``None`` or ``''``;
                with the default value the title stays 'All Agencies'.
            savename (str or path): Path to save figure
            savecsv (str or path): Path to save csv of data used to create figure

        Returns:
            None

        Examples:
            >>> # Find the daily number of stops by the Chicago Police
            >>> rid.plot_timeseries(frequency='1D', agency='Chicago Police')

            >>> # Find the weekly number of citations issued across all departments
            >>> rid.plot_timeseries(filter_cols='ResultOfStop', filter_values='Citation')

            >>> # Find the monthly number of stops by race
            >>> rid.plot_timeseries(frequency='1M', group='DriverRace')
        """
        ylabel = 'Stop Count'
        grouped = bool(group)

        if agency and not title:
            title = agency + ' (' + frequency + ')'

        ts = self._select_stops(agency=agency, filter_cols=filter_cols,
                                filter_values=filter_values, group=group)

        timeseries.raw_timeseries(ts, frequency, title, ylabel, grouped=grouped,
                                  savename=savename, savecsv=savecsv)
class ITSSMetrics(object):
    """Class to wrap ITSS metrics dataframe

    Attributes:
        raw_df (pd.DataFrame): The dataframe of raw ITSS data
        metrics (pd.DataFrame): The dataframe of calculated metrics
        grouping (str or list of str): The grouping of calculated metrics
    """

    def __init__(self, itss_data=None):
        """Constructor requires a RawITSSData to initialize, or none if loading
        from a saved file

        Args:
            itss_data (optional, :class:`RawITSSData`)
        """
        if itss_data is not None:
            self.raw_df = itss_data.raw_data_df
        else:
            self.raw_df = None
        self.grouping = None
        self.metrics = None

    def calculate_metrics(self, grouping, population_csv=None):
        """Calculate the metrics, grouping by different items

        The result is stored on the instance (``self.metrics``) together with
        the grouping that produced it (``self.grouping``).

        Args:
            grouping (str or list of str): Columns by which to group the data
            population_csv (str or path): Filename of population demographic csv

        Returns:
            None

        Examples:
            >>> # Calculate the metrics for each racial group across all traffic stops
            >>> met.calculate_metrics('DriverRace')

            >>> # Calculate yearly metrics by driver sex for each agency
            >>> met.calculate_metrics(['AgencyName', 'Year', 'DriverSex'])
        """
        self.metrics = metrics.metrics_by_group(self.raw_df, grouping,
                                                population_csv=population_csv)
        self.grouping = grouping

    def get_grouping(self):
        """Return the grouping used to calculate the metrics"""
        return self.grouping

    def get_metrics_df(self):
        """Return the raw metrics dataframe"""
        return self.metrics

    def get_metrics(self):
        """Return all the calculated metrics

        Returns:
            dict: maps each column name of the metrics dataframe to the
            description returned by ``names.MetricNames().get_description``.
        """
        name_map = names.MetricNames()
        return {met: name_map.get_description(met)
                for met in self.metrics.columns.tolist()}

    def _grouping_levels(self):
        """Return the stored grouping as a list of level names

        ``calculate_metrics`` accepts a single column name as well as a list,
        so a bare string is wrapped rather than being treated as a sequence
        of characters.
        """
        if isinstance(self.grouping, str):
            return [self.grouping]
        return list(self.grouping)

    def _set_level_last(self, name):
        """Return the metrics with index level ``name`` moved last, sorted by index

        Args:
            name (str): One of the grouping columns

        Returns:
            pd.DataFrame: re-ordered, index-sorted copy of the metrics

        Raises:
            ValueError: if ``name`` is not part of the current grouping
        """
        levels = self._grouping_levels()
        if name not in levels:
            raise ValueError(
                "'{}' is not part of the current grouping {}".format(name, levels))

        # With a single grouping column there is nothing to re-order, and
        # DataFrame.reorder_levels refuses a non-hierarchical index.
        if len(levels) == 1:
            return self.metrics.sort_index()

        order = list(range(len(levels)))
        order.append(order.pop(levels.index(name)))
        return self.metrics.reorder_levels(order).sort_index()

    def plot_scatter(self, y_index, x_index, metric, size, population_col=None,
                     logscale=False, limits=None, scale_factor=None,
                     z_threshold=5, z_opacity='binary', as_ratio=False,
                     title=None, savename=None, savecsv=False):
        """Scatter plot of all agencies

        Args:
            y_index (str or tuple): the top-level index to use for the y-axis data
                (i.e. all levels except agency name)
            x_index (str or tuple): the top-level index to use for the x-axis data
            metric (str): the name of the calculated rate to plot, e.g. SearchRate
            size (str): the name of the metric to use to size the points,
                e.g. SearchCount
            population_col (str): column name forwarded to the plotting routine
                as ``population_col``, e.g. StopCount
            logscale (bool): Plot on a loglog scale
            limits (list or tuple): the limits on the x and y axes
            scale_factor (float): Scaling factor for size of points
            z_threshold (float): Cutoff threshold to consider something
                "statistically significant"
            z_opacity (str): Type of shading to use ('binary', 'gradient', 'filter')
            as_ratio (bool): Make a ratio plot
            title (str): Title of the plot
            savename (str or path): Where to save the figure
            savecsv (str or path): Where to save a csv of data used to make the figure

        Returns:
            Whatever ``scatterplot.make_scatterplot`` returns (presumably the
            plot axes).

        Examples:
            >>> # Compare search rates for black and white drivers
            >>> met.plot_scatter('Black', 'White', 'SearchRate', 'SearchCount',
            ...                  population_col='StopCount')
        """
        sdf = self._set_level_last('AgencyName')
        return scatterplot.make_scatterplot(
            sdf, x_index, y_index, metric, size,
            population_col=population_col, logscaling=logscale, limits=limits,
            scale_factor=scale_factor, z_threshold=z_threshold,
            z_opacity=z_opacity, as_ratio=as_ratio, title=title,
            savename=savename, savecsv=savecsv)

    def plot_zhist(self, target_item, reference_item, event_col, total_obs_col,
                   title=None):
        """Z-score histogram for a given event/observation count pairing,
        e.g. SearchCount/StopCount

        Must have included 'AgencyName' in grouping and grouping must be at
        least two categories

        Args:
            target_item: index of target item, e.g. 'Black'
            reference_item: index of reference item, e.g. 'White'
            event_col: column name for event counts, e.g. SearchCount
            total_obs_col: column name for total observations, e.g. StopCount
            title: plot title; defaults to the ``(event_col, total_obs_col)`` tuple

        Returns:
            The z-score dataframe produced by ``zscores.get_zscore_df``.

        Raises:
            ValueError: if the grouping does not include 'AgencyName' or has
                fewer than two categories

        Examples:
            >>> # Compare the deviation of black driver search hit rate relative
            >>> # to white driver search hit rate
            >>> met.plot_zhist('Black', 'White', 'SearchHitCount', 'SearchCount')
        """
        levels = self._grouping_levels()
        # Explicit check instead of `assert`, which is stripped under -O and
        # would mis-handle a grouping given as a single string.
        if 'AgencyName' not in levels or len(levels) < 2:
            raise ValueError(
                "plot_zhist requires a grouping of at least two categories "
                "that includes 'AgencyName'")

        sdf = self._set_level_last('AgencyName')
        zdf = zscores.get_zscore_df(sdf, target_item, reference_item,
                                    event_col, total_obs_col)
        if not title:
            title = (event_col, total_obs_col)
        zhist.plot_zhist(zdf, target_item, title=title)
        return zdf

    def plot_bars(self, target_top_row, target_column, only_include_rows=None,
                  title=None, savename=None, savecsv=False, xax_label=None):
        """Make a bar plot of a certain metric. Requires a multi-level metrics
        calculation be passed in.

        All arguments are forwarded to ``barplot.make_barplot``, which defines
        their exact semantics.

        Args:
            target_top_row (str): entry of the first index level to plot,
                e.g. 'Chicago Police' when grouping by
                ['AgencyName', 'DriverRace']
            target_column (str): the metric column to plot, e.g. SearchRate
            only_include_rows: forwarded as ``only_include``
            title (str): Plot title
            savename (str or path): Where to save the figure
            savecsv (str or path): Where to save a csv of data used to make the figure
            xax_label (str): forwarded as ``xax_label`` (presumably the x-axis label)

        Returns:
            None

        Examples:
            >>> met.calculate_metrics(['AgencyName', 'DriverRace'])
            >>> met.plot_bars('Chicago Police', 'SearchRate')
        """
        barplot.make_barplot(self.metrics, target_top_row, target_column,
                             only_include=only_include_rows, title=title,
                             savename=savename, savecsv=savecsv,
                             xax_label=xax_label)

    def plot_timeseries(self, target_column, only_include_rows=None,
                        only_include_entries=None, title=None, ylabel=None,
                        savename=None, savecsv=None):
        """Make a timeseries plot

        The grouping must include 'Year'; that level is moved last before
        plotting.

        Args:
            target_column (str): The column you want to make the timeseries for
            only_include_rows (str or tuple or list): Rows of index to include
            only_include_entries (str or tuple or list): Filter criteria - only
                include matching entries from target
            title (str): Plot title
            ylabel (str): Plot y-axis label
            savename (str or path): Path to save the plot
            savecsv (str or path): Path to save a csv of data used to make the plot

        Returns:
            None

        Examples:
            >>> met.plot_timeseries('SearchRate', only_include_rows='Chicago Police',
            ...                     only_include_entries=['Black', 'Hispanic/Latino',
            ...                                           'Asian', 'White'],
            ...                     title='Search Rate 2012-2017')
        """
        sdf = self._set_level_last('Year')
        timeseries.metrics_timeseries(
            sdf, target_column, only_include_rows=only_include_rows,
            only_include_entries=only_include_entries, title=title,
            ylabel=ylabel, savename=savename, savecsv=savecsv)

    def load(self, filename):
        """Load a metrics object from a pickle file

        pickled object is (grouping, metrics_df) tuple

        Any previously held grouping/metrics are cleared before the file is
        opened, so a failed load leaves both set to ``None``.
        """
        self.grouping = None
        self.metrics = None
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that were written by `save` and come from a trusted source.
        with open(filename, 'rb') as f:
            (self.grouping, self.metrics) = pickle.load(f)

    def save(self, filename):
        """Pickle a metrics object as a (grouping, metrics_df) tuple"""
        with open(filename, 'wb') as f:
            pickle.dump((self.grouping, self.metrics), f)

    def save_csv(self, filename):
        """Save the current metrics as a csv file"""
        self.metrics.to_csv(filename)