# Source code for itssutils.itssdata

"""
This module contains two classes: A :class:`RawITSSData` class for reading in raw ITSS
data, processing it, and plotting timeseries of basic counts; and a
:class:`ITSSMetrics` class for computing metrics for various groupings of the raw ITSS
data.

"""

import pandas as pd
import pickle

from .loader.load_raw import load_data, load_multiple_years
from .viz import timeseries, scatterplot, zhist, barplot, ratioplot
from .metrics import metrics, zscores, names


class RawITSSData(object):
    """Human-readable wrappers around raw ITSS data manipulations.

    Attributes:
        raw_data_df (pd.DataFrame or None): raw dataframe; ``None`` until one
            of the ``load_*`` methods has been called
    """

    def __init__(self):
        """Create an empty container; call a ``load_*`` method to fill it."""
        self.raw_data_df = None

    def load_single_year(self, year, filename, fast=True, save=False):
        """Load a single year of raw data.

        The result of the loader replaces any dataframe already held.

        Args:
            year (int): The year of interest
            filename (str): The filename containing raw ITSS data
            fast (bool): Whether to load from pre-processed pickle file
            save (bool): Whether to save to a pickle file

        Returns:
            None

        Example:
            >>> rid.load_single_year(2016, '2016_ITSS_Data.txt')

        """
        self.raw_data_df = load_data(year, filename, fast=fast, save=save)

    def load_multiple_years(self, year_file_list, fast=True, save=False):
        """Load multiple years worth of raw data into a single object.

        The result of the loader replaces any dataframe already held.

        Args:
            year_file_list (list): List of tuples of the format (year, filename)
            fast (bool): Forwarded to the loader (see :meth:`load_single_year`)
            save (bool): Forwarded to the loader (see :meth:`load_single_year`)

        Returns:
            None

        Example:
            >>> yf_list = [(2012, '2012_ITSS_Data.txt'), (2013, '2013_ITSS_Data.txt')]
            >>> rid.load_multiple_years(yf_list)
        """
        self.raw_data_df = load_multiple_years(year_file_list, fast=fast, save=save)

    def get_collected_data(self):
        """Return the column names and dtypes of the raw dataframe.

        Returns:
            pd.Series: ``raw_data_df.dtypes`` (indexed by column name)
        """
        return self.raw_data_df.dtypes

    def get_agencies(self):
        """Return a list of the unique values of the ``AgencyName`` column."""
        return self.raw_data_df.AgencyName.unique().tolist()

    def get_raw_dataframe(self):
        """Return the underlying dataframe (the same object, not a copy)."""
        return self.raw_data_df

    @staticmethod
    def _filter_stops(ts, filter_cols, filter_values):
        """Keep only rows whose value in every filter column is in ``filter_values``.

        Args:
            ts (pd.DataFrame): The stops to filter
            filter_cols (str or list): The column(s) to filter by
            filter_values (str or int or list): The accepted value(s); the same
                values are applied to every column in ``filter_cols``

        Returns:
            pd.DataFrame: The filtered rows
        """
        # Each argument is normalised on its OWN type. Testing the wrong
        # variable here leaves a string column name unwrapped, so the loop
        # below would iterate over its characters.
        if not isinstance(filter_values, list):
            filter_values = [filter_values]
        if not isinstance(filter_cols, list):
            filter_cols = [filter_cols]

        for col in filter_cols:
            ts = ts[ts[col].isin(filter_values)]
        return ts

    def plot_timeseries(self,
                        frequency='1W',
                        agency=None,
                        filter_cols=None,
                        filter_values=True,
                        group=None,
                        title='All Agencies',
                        savename=None,
                        savecsv=None):
        """Plot a time series of the counts of raw traffic stop data.

        Args:
            frequency (str): the pandas-style sampling frequency; default 1W
            agency (str): The agency to filter by; default None
            filter_cols (str or list): The column(s) to filter by; default None
            filter_values (str or int or list): The selected value(s) to filter
                by within the filter column(s); only used when ``filter_cols``
                is given
            group (str or list of str): The column(s) to group by; default None
            title (str): Plot title
            savename (str or path): Path to save figure
            savecsv (str or path): Path to save csv of data used to create figure

        Returns:
            None

        Examples:
            >>> # Find the daily number of stops by the Chicago Police
            >>> rid.plot_timeseries(frequency='1D', agency='Chicago Police')

            >>> # Find the weekly number of citations issued across all departments
            >>> rid.plot_timeseries(filter_cols='ResultOfStop', filter_values='Citation')

            >>> # Find the monthly number of stops by race
            >>> rid.plot_timeseries(frequency='1M', group='DriverRace')
        """
        ts = self.raw_data_df.set_index('StopDateTime')
        ylabel = 'Stop Count'

        # Record whether grouping was requested before ``group`` is normalised.
        grouped = bool(group)

        # Select only a single agency if given
        if agency:
            ts = ts[ts.AgencyName == agency]
            # NOTE(review): with the default title ('All Agencies') this branch
            # never fires, so an agency plot keeps the 'All Agencies' title
            # unless the caller passes title=None or ''. Left unchanged to keep
            # the public default; confirm the intended behaviour.
            if not title:
                title = agency + ' (' + frequency + ')'

        # Filter all the stops by a given column/value(s) pair
        if filter_cols:
            ts = self._filter_stops(ts, filter_cols, filter_values)

        # Group by a given category if chosen
        if group:
            group = group if isinstance(group, list) else [group]
            ts = ts[group].groupby(group)
        else:
            ts = ts.DateOfStop

        timeseries.raw_timeseries(ts, frequency, title, ylabel,
                                  grouped=grouped,
                                  savename=savename,
                                  savecsv=savecsv)


class ITSSMetrics(object):
    """Class to wrap ITSS metrics dataframe

    Attributes:
        raw_df (pd.DataFrame): The dataframe of raw ITSS data
        metrics (pd.DataFrame): The dataframe of calculated metrics
        grouping (str or list of str): The grouping of calculated metrics
    """

    def __init__(self, itss_data=None):
        """Constructor requires a RawITSSData to initialize, or none if loading
        previously saved metrics via :meth:`load`

        Args:
            itss_data (optional, :class:`RawITSSData`)
        """
        # Explicit None test: the decision should not depend on the
        # truthiness of whatever object is passed in.
        if itss_data is not None:
            self.raw_df = itss_data.raw_data_df
        else:
            self.raw_df = None

        self.grouping = None
        self.metrics = None

    def calculate_metrics(self, grouping, population_csv=None):
        """ Calculate the metrics, grouping by different items

        Stores the result in ``self.metrics`` and remembers ``grouping`` in
        ``self.grouping`` exactly as it was passed.

        Args:
            grouping (str or list of str): Columns by which to group the data
            population_csv (str or path): Filename of population demographic csv

        Returns:
            None

        Examples:
            >>> # Calculate the metrics for each racial group across all traffic stops
            >>> met.calculate_metrics('DriverRace')

            >>> # Calculate yearly metrics by driver sex for each agency
            >>> met.calculate_metrics(['AgencyName', 'Year', 'DriverSex'])
        """
        self.metrics = metrics.metrics_by_group(self.raw_df,
                                                grouping,
                                                population_csv=population_csv)
        self.grouping = grouping

    def get_grouping(self):
        """ Return the grouping used to calculate the metrics"""
        return self.grouping

    def get_metrics_df(self):
        """ Return the raw metrics dataframe """
        return self.metrics

    def get_metrics(self):
        """ Return a dict mapping each calculated metric (column name) to the
        description given by ``names.MetricNames`` """
        name_map = names.MetricNames()
        return {met: name_map.get_description(met)
                for met in self.metrics.columns.tolist()}

    def _grouping_as_list(self):
        """Return the stored grouping as a list of column names.

        ``calculate_metrics`` accepts a single column name as well as a list,
        so a bare string is wrapped here; otherwise ``len`` and ``in`` would
        operate on its characters.
        """
        if isinstance(self.grouping, str):
            return [self.grouping]
        return list(self.grouping)

    def _set_level_last(self, name):
        """Return the metrics with index level ``name`` moved last, sorted by index.

        Args:
            name (str): A column name that is part of the grouping

        Returns:
            pd.DataFrame: the reordered, index-sorted metrics

        Raises:
            ValueError: if ``name`` is not part of the grouping
        """
        levels = self._grouping_as_list()
        if name not in levels:
            raise ValueError("'{}' is not part of the metrics grouping {}"
                             .format(name, levels))

        if len(levels) == 1:
            # Nothing to reorder, and reorder_levels only works on a
            # hierarchical (multi-level) index.
            return self.metrics.sort_index()

        order = list(range(len(levels)))
        order.append(order.pop(levels.index(name)))
        return self.metrics.reorder_levels(order).sort_index()

    def plot_scatter(self, y_index, x_index, metric, size,
                     population_col=None,
                     logscale=False,
                     limits=None,
                     scale_factor=None,
                     z_threshold=5,
                     z_opacity='binary',
                     as_ratio=False,
                     title=None,
                     savename=None,
                     savecsv=False):
        """ Scatter plot of all agencies

            Requires 'AgencyName' to be part of the grouping; it is moved to
            the last index level before plotting.

            Args:
                y_index (str or tuple): the top-level index to use for the y-axis data (i.e. all levels except agency name)
                x_index (str or tuple): the top-level index to use for the x-axis data
                metric (str): the name of the calculated rate to plot, e.g. SearchRate
                size (str): the name of the metric to use to size the points, e.g. SearchCount
                population_col (str): forwarded to the scatterplot helper, e.g. StopCount
                logscale (bool): Plot on a loglog scale
                limits (list or tuple): the limits on the x and y set_axis
                scale_factor (float): Scaling factor for size of points
                z_threshold (float): Cutoff threshold to consider something "statistically significant"
                z_opacity (str): Type of shading to use ('binary', 'gradient', 'filter')
                as_ratio (bool): Make a ratio plot
                title (str): Title of the plot
                savename (str or path): Where to save the figure
                savecsv (str or path): Where to save a csv of data used to make the figure

            Returns:
                The value returned by ``scatterplot.make_scatterplot``

            Examples:
                >>> # Compare search rates for black and white drivers
                >>> met.plot_scatter('Black', 'White', 'SearchRate', 'SearchCount', population_col='StopCount')
        """
        sdf = self._set_level_last('AgencyName')
        # Note the helper takes x before y, the reverse of this signature.
        return scatterplot.make_scatterplot(
            sdf, x_index, y_index, metric, size,
            population_col=population_col,
            logscaling=logscale, limits=limits, scale_factor=scale_factor,
            z_threshold=z_threshold, z_opacity=z_opacity, as_ratio=as_ratio,
            title=title, savename=savename, savecsv=savecsv)

    def plot_zhist(self, target_item, reference_item, event_col, total_obs_col, title=None):
        """ Z-score histogram for a given event/observation count pairing,
            e.g. SearchCount/StopCount
            Must have included 'AgencyName' in grouping and grouping must be
            at least two categories

            Args:
                target_item: index of target item, e.g. 'Black'
                reference_item: index of reference item, e.g. 'White'
                event_col: column name for event counts, e.g. SearchCount
                total_obs_col: column name for total observations, e.g. StopCount
                title: plot title; defaults to the tuple (event_col, total_obs_col)

            Returns:
                The z-score dataframe produced by ``zscores.get_zscore_df``

            Examples:
                >>> # Compare the deviation of black driver search hit rate relative to white driver search hit rate
                >>> met.plot_zhist('Black', 'White', 'SearchHitCount', 'SearchCount')
        """
        # Check against the list form so a single string grouping is not
        # accepted through substring match / character count.
        levels = self._grouping_as_list()
        assert 'AgencyName' in levels and len(levels) > 1, \
            "grouping must include 'AgencyName' and at least one other column"
        sdf = self._set_level_last('AgencyName')
        zdf = zscores.get_zscore_df(sdf, target_item, reference_item,
                                    event_col, total_obs_col)
        if not title:
            title = (event_col, total_obs_col)
        zhist.plot_zhist(zdf, target_item, title=title)
        return zdf

    def plot_bars(self, target_top_row, target_column,
                  only_include_rows=None,
                  title=None,
                  savename=None,
                  savecsv=False,
                  xax_label=None):
        """ Make a bar plot of a certain metric.
            Requires a multi-level metrics calculation be passed in.

            All arguments are forwarded to ``barplot.make_barplot`` together
            with the metrics dataframe.

            Args:
                target_top_row (str): top-level index entry to plot, e.g. 'Chicago Police'
                target_column (str): metric column to plot, e.g. 'SearchRate'
                only_include_rows: passed to the helper as ``only_include``
                title (str): Plot title
                savename (str or path): Where to save the figure
                savecsv (str or path): Where to save a csv of the plotted data
                xax_label (str): presumably the x-axis label; confirm against the helper

            Returns:
                None

            Examples:
                >>> met.calculate_metrics(['AgencyName', 'DriverRace'])
                >>> met.plot_bars('Chicago Police', 'SearchRate')

        """
        barplot.make_barplot(self.metrics, target_top_row, target_column,
                             only_include=only_include_rows,
                             title=title,
                             savename=savename,
                             savecsv=savecsv,
                             xax_label=xax_label)

    def plot_timeseries(self, target_column,
                        only_include_rows=None,
                        only_include_entries=None,
                        title=None,
                        ylabel=None,
                        savename=None,
                        savecsv=None):
        """ Make a timeseries plot

        Requires 'Year' to be part of the grouping; it is moved to the last
        index level before plotting.

        Args:
            target_column (str): The column you want to make the timeseries for
            only_include_rows (str or tuple or list): Rows of index to include
            only_include_entries (str or tuple or list): Filter criteria - only include matching entries from target
            title (str): Plot title
            ylabel (str): Plot y-axis label
            savename (str or path): Path to save the plot
            savecsv (str or path): Path to save a csv of data used to make the plot

        Returns:
            None

        Examples:
            >>> met.plot_timeseries('SearchRate', only_include_rows='Chicago Police', only_include_entries=['Black', 'Hispanic/Latino', 'Asian', 'White'], title='Search Rate 2012-2017')
        """
        sdf = self._set_level_last('Year')
        timeseries.metrics_timeseries(sdf, target_column,
                                      only_include_rows=only_include_rows,
                                      only_include_entries=only_include_entries,
                                      title=title, ylabel=ylabel,
                                      savename=savename,
                                      savecsv=savecsv)

    def load(self, filename):
        """ Load a metrics object from a pickle file
            pickled object is (grouping, metrics_df) tuple

            Any grouping/metrics currently held are cleared first, so they
            stay ``None`` if the file cannot be opened or unpickled.
         """
        self.grouping = None
        self.metrics = None
        # SECURITY: unpickling can execute arbitrary code; only load files
        # written by :meth:`save` from a trusted source.
        with open(filename, 'rb') as f:
            (self.grouping, self.metrics) = pickle.load(f)

    def save(self, filename):
        """ Pickle a metrics object as a (grouping, metrics_df) tuple """
        with open(filename, 'wb') as f:
            pickle.dump((self.grouping, self.metrics), f)

    def save_csv(self, filename):
        """ Save the current metrics as a csv file """
        self.metrics.to_csv(filename)