Multi-process Bootstrapping for Pandas DataFrames Statistics

I should mention that there’s a subtle bit of legwork needed to enable random seeding in sampling from pandas DataFrames across Python PoolWorker instances. But besides this, the below bootstrapping code is simple and follows pretty much directly from the definition. See Wasserman’s text All of Statistics for concise pseudocode and explanation.

Here’s a Python implementation that uses a process pool, initializing each worker with a unique random seed. This calls the pandas DataFrame.sample() method.

This was written with the dataset from Bootstrapping Estimates for Comment Likelihood, Hacker News: EDA II in mind, where it is used to estimate odds ratios of a certain statistic; hence, note that the parameters are chosen with that dataset in mind.

Multiprocess bootstrapping module

"""
df_query_stats class -> module
for use in querying a pandas dataframe from a process pool
"""
from multiprocessing.pool import Pool
import numpy as np

class df_query_stats:
    """
    df_query_stats(X, query)
    X = dataframe
    query = region of dataframe, ie select rows containing keyword "car"

    Constructing an instance publishes X and query as the module globals
    ``_X`` and ``df_query``. They are removed again when the instance that
    published them is destroyed. Only the most recently constructed instance
    owns that state, so destroying an older instance (or one whose
    construction failed) leaves the current state untouched.
    """
    def __init__(self, X, query=0):
        global df_query, _X, _df_query_owner
        assert query != 0, "a non-zero query must be supplied"
        # An opaque token (rather than `self`) marks ownership, so the module
        # globals never keep the instance itself alive.
        self._token = object()
        df_query = query
        _X = X
        _df_query_owner = self._token

    def __del__(self):
        token = getattr(self, "_token", None)
        module_state = globals()
        # Skip cleanup when this instance never finished __init__ or when a
        # newer instance has since replaced the published state.
        if token is None or module_state.get("_df_query_owner") is not token:
            return
        for name in ("_X", "df_query", "_df_query_owner"):
            module_state.pop(name, None)

def init_worker(X,query, crit):
    """
    prepare Process Pool worker's space

    X = dataframe, query = row-selection expression passed to X.query(),
    crit = criteria expression evaluated on the selected rows.

    Sets the module globals:
      DF       rows of X selected by query
      criteria crit, unchanged
      TOTAL    number of rows in DF
      CAT      number of rows of DF that satisfy criteria
      excep    Exception describing an empty selection, or None
      random   the stdlib random module
    """
    global DF, criteria, random, excep, CAT, TOTAL
    # Bound as a module global (see the `global` statement above) because
    # other worker-side code in this module may look `random` up there.
    import random

    criteria = crit
    DF = X.query(query)
    TOTAL = DF.shape[0]

    # An empty selection is recorded rather than raised.
    excep = None
    if TOTAL == 0:
        excep = Exception("Empty query: " + query)
    CAT = DF.query(criteria).shape[0]

def go(inner,seed):
    """
    Return sample results

    Draws `inner` resamples (with replacement) of size TOTAL // 2 from the
    module-global DF, counts the rows of each resample that satisfy the
    module-global criteria, and turns each count u into the pair of ratios
    u / (K-u+1) and (u+1) / (K-u+1), where K is the resample size.

    inner = number of resamples, seed = value passed to random.seed()
    Returns (log1p(mean of all ratios), variance of all ratios), or
    (nan, nan) after printing the stored exception when the worker was
    initialised with an empty selection.
    """
    import random  # local import: do not depend on another function's global

    if excep is not None:
        print(excep)
        return np.nan, np.nan

    random.seed(seed)
    K = TOTAL // 2

    # Count matching ROWS (len), not non-NA cells of the first column; this
    # also avoids positional indexing into a label-indexed Series.
    hits = np.array([
        len(DF.sample(n=K, replace=True,
                      random_state=random.randint(0, 2**32 - 1))
            .query(criteria))
        for _ in range(inner)
    ])

    # a single sampled odds ratio is biased, so we take an avg <u, u+1>
    denom = K - hits + 1
    U = np.array([hits / denom, (hits + 1) / denom])
    return np.log1p(np.mean(U)), np.var(U)

# run this before go()
def criteria_query_size():
    """
    Return the pair (CAT, TOTAL) read from the module globals.
    """
    sizes = (CAT, TOTAL)
    return sizes

def bootstrap_mean_var(outer,inner, specific_criteria, nthr=3):
    """
    bootstrap_mean_var(outer,inner, specific_criteria, nthr=3):
    outer = size of outer bootstrap, inner = ''
    specific_criteria = query such as num_comments > 0 for returning an
    indicator, used in bootstrap to estimate mean and variance of proportion
    specific_criteria / X & query**
       where
    query** used to construct df_query_stats()
    nthr = number of worker processes in the pool

    Returns (L_, sizes): L_ is a list of `outer` results from go(), and
    sizes is a one-element list holding the result of criteria_query_size().
    Raises NameError if df_query_stats() has not been constructed.
    """
    try:
        frame, frame_query = _X, df_query
    except NameError:
        raise NameError(
            "df_query_stats(X, query) must be constructed before calling "
            "bootstrap_mean_var") from None

    # One seed per outer replicate, drawn in the parent process.
    seeds = [np.random.rand() for _ in range(outer)]

    # The context manager shuts the workers down on exit; both starmap calls
    # block until their results are complete, so nothing is cut short.
    with Pool(nthr, initializer=init_worker,
              initargs=(frame, frame_query, specific_criteria)) as pool:
        sizes = pool.starmap(criteria_query_size, [()])
        L_ = pool.starmap(go, [(inner, s) for s in seeds])
    return L_, sizes
Avatar

By Alexander Wei

BA, MS Mathematics, Tufts University

Leave a comment

Your email address will not be published. Required fields are marked *