Source code for mpu.pd

"""Pandas utility functions."""

# Core Library
import datetime as dt
import logging
from typing import Any, Dict, List, Optional, Tuple

# Third party
import pandas as pd
import pkg_resources

# First party
import mpu.shell

countries_file = pkg_resources.resource_filename("mpu", "data/countries.csv")
countries = pd.read_csv(countries_file)
logger = logging.getLogger(__name__)


[docs]def example_df() -> pd.DataFrame: """Create an example dataframe.""" country_names = ["Germany", "France", "Indonesia", "Ireland", "Spain", "Vatican"] population = [82521653, 66991000, 255461700, 4761865, 46549045, None] population_time = [ dt.datetime(2016, 12, 1), dt.datetime(2017, 1, 1), dt.datetime(2017, 1, 1), None, # Ireland dt.datetime(2017, 6, 1), # Spain None, ] euro = [True, True, False, True, True, True] df = pd.DataFrame( { "country": country_names, "population": population, "population_time": population_time, "EUR": euro, } ) df = df[["country", "population", "population_time", "EUR"]] return df
[docs]def describe(df: pd.DataFrame, dtype: Optional[Dict] = None) -> Dict: """ Print a description of a Pandas dataframe. Parameters ---------- df : pd.DataFrame dtype : Optional[Dict] Maps column names to types """ if dtype is None: dtype = {} print(f"Number of datapoints: {len(df)}") column_info, column_info_meta = _get_column_info(df, dtype) if len(column_info["int"]) > 0: _describe_int(df, column_info) if len(column_info["float"]) > 0: _describe_float(df, column_info) if len(column_info["category"]) > 0: _describe_category(df, column_info, column_info_meta) if len(column_info["time"]) > 0: _describe_time(df, column_info, column_info_meta) if len(column_info["other"]) > 0: _describe_other(df, column_info, column_info_meta) column_types = {} for column_type, columns in column_info.items(): for column_name in columns: if column_type == "other": column_type = "str" column_types[column_name] = column_type return column_types
def _get_column_info(df: pd.DataFrame, dtype: Dict[str, str]) -> Tuple[Dict, Dict]: column_info: Dict[str, List[str]] = { "int": [], "float": [], "category": [], "other": [], "time": [], } float_types = ["float64"] integer_types = ["int64", "uint8"] time_types = ["datetime64[ns]"] other_types = ["object", "category"] column_info_meta: Dict[str, Dict[str, Any]] = {} for column_name in df: column_info_meta[column_name] = {} counter_obj = df[column_name].value_counts() value_list = [ key for key, value in sorted( counter_obj.items(), key=lambda n: (str(n[1]), str(n[0])) ) ] value_count = len(value_list) is_suspicious_cat = ( value_count <= 50 and str(df[column_name].dtype) != "category" and column_name not in dtype ) if is_suspicious_cat: logger.warning( f"Column '{column_name}' has only {value_count} different " f"values ({value_list}). " "You might want to make it a 'category'" ) if len(value_list) > 0: top_count_val = counter_obj.tolist()[0] else: top_count_val = None column_info_meta[column_name]["top_count_val"] = top_count_val column_info_meta[column_name]["value_list"] = value_list column_info_meta[column_name]["value_count"] = value_count is_int_type = ( df[column_name].dtype in integer_types or column_name in dtype and dtype[column_name] in integer_types ) is_float_type = ( df[column_name].dtype in float_types or column_name in dtype and dtype[column_name] in float_types ) is_cat_type = ( str(df[column_name].dtype) in ["category", "bool"] or column_name in dtype and dtype[column_name] in ["category", "bool"] ) is_time_type = str(df[column_name].dtype) in time_types is_other_type = ( str(df[column_name].dtype) in other_types or column_name in dtype and dtype[column_name] in other_types ) if is_int_type: column_info["int"].append(column_name) elif is_float_type: column_info["float"].append(column_name) elif is_cat_type: column_info["category"].append(column_name) elif is_other_type: column_info["other"].append(column_name) elif is_time_type: column_info["time"].append(column_name) else: logger.warning( f"mpu.pd.describe does not know type '{df[column_name].dtype}'" ) return column_info, column_info_meta def _describe_int(df: pd.DataFrame, column_info: Dict) -> None: print("\n## Integer Columns") table = [ ["Column name", "Non-nan", "mean", "std", "min", "25%", "50%", "75%", "max"] ] for column_name in column_info["int"]: row = [] row.append(column_name) row.append(sum(df[column_name].notnull())) row.append(df[column_name].mean()) row.append(df[column_name].std()) row.append(df[column_name].min()) row.append(df[column_name].quantile(0.25)) row.append(df[column_name].quantile(0.50)) row.append(df[column_name].quantile(0.75)) row.append(max(df[column_name])) table.append(row) mpu.shell.print_table(table) def _describe_float(df: pd.DataFrame, column_info: Dict) -> None: print("\n## Float Columns") table = [ ["Column name", "Non-nan", "mean", "std", "min", "25%", "50%", "75%", "max"] ] for column_name in column_info["float"]: row = [] row.append(column_name) row.append(sum(df[column_name].notnull())) row.append(f"{df[column_name].mean():0.2f}") row.append(f"{df[column_name].std():0.2f}") row.append(f"{df[column_name].min():0.2f}") row.append(f"{df[column_name].quantile(0.25):0.2f}") row.append(f"{df[column_name].quantile(0.50):0.2f}") row.append(f"{df[column_name].quantile(0.75):0.2f}") row.append(f"{max(df[column_name]):0.2f}") table.append(row) mpu.shell.print_table(table) def _describe_category( df: pd.DataFrame, column_info: Dict, column_info_meta: Dict ) -> None: print("\n## Category Columns") table = [["Column name", "Non-nan", "unique", "top el", "top (count)", "rest"]] for column_name in column_info["category"]: row = [] row.append(column_name) row.append(sum(df[column_name].notnull())) row.append(len(df[column_name].unique())) row.append(column_info_meta[column_name]["value_list"][0]) row.append(column_info_meta[column_name]["top_count_val"]) rest = str(column_info_meta[column_name]["value_list"][1:])[:40] row.append(rest) table.append(row) mpu.shell.print_table(table) def _describe_time(df: pd.DataFrame, column_info: Dict, column_info_meta: Dict) -> None: print("\n## Time Columns") table = [ ["Column name", "Non-nan", "unique", "top el", "top (count)", "min", "max"] ] for column_name in column_info["time"]: row = [] row.append(column_name) row.append(sum(df[column_name].notnull())) row.append(len(df[column_name].unique())) row.append(column_info_meta[column_name]["value_list"][0]) row.append(column_info_meta[column_name]["top_count_val"]) row.append(df[column_name].min()) row.append(df[column_name].max()) table.append(row) mpu.shell.print_table(table) def _describe_other( df: pd.DataFrame, column_info: Dict, column_info_meta: Dict ) -> None: print("\n## Other Columns") table = [["Column name", "Non-nan", "unique", "top", "(count)", "rest"]] for column_name in column_info["other"]: row = [] row.append(column_name) row.append(sum(df[column_name].notnull())) row.append(len(df[column_name].unique())) row.append(column_info_meta[column_name]["value_list"][0]) row.append(column_info_meta[column_name]["top_count_val"]) rest = str(column_info_meta[column_name]["value_list"][1:])[:40] row.append(rest) table.append(row) mpu.shell.print_table(table)