One caveat: a little extra legwork is needed to give each worker process in a Python multiprocessing pool its own random seed when sampling from a pandas DataFrame. Apart from that, the bootstrapping code below is simple and follows almost directly from the definition. See Wasserman’s text All of Statistics for concise pseudocode and an explanation.
Here’s a Python implementation that uses a process pool, initializing each worker with a unique random seed. This calls the pandas DataFrame.sample() method.
This was written with the post Bootstrapping Estimates for Comment Likelihood, Hacker News: EDA II in mind, where it is used to estimate odds ratios of a certain statistic; hence, note that the parameters are chosen with that dataset in mind.
Multiprocess bootstrapping module
"""
df_query_stats class -> module for use in querying a pandas dataframe from a
process pool.

Usage sketch::

    stats = df_query_stats(frame, "some query")   # keep this reference alive
    results, sizes = bootstrap_mean_var(outer, inner, "num_comments > 0")

State is shared through module-level globals: ``_X`` / ``df_query`` in the
parent process (set by ``df_query_stats``) and ``DF`` / ``criteria`` /
``excep`` / ``CAT`` / ``TOTAL`` in each worker (set by ``init_worker``).
"""
import random
from multiprocessing.pool import Pool

import numpy as np


class df_query_stats:
    """
    df_query_stats(X, query)

    X = dataframe
    query = region of dataframe, ie select rows containing keyword "car"

    Constructing an instance publishes ``X`` and ``query`` as the module
    globals ``_X`` and ``df_query`` that ``bootstrap_mean_var`` reads.
    Destroying the instance removes those globals again, so the instance
    must stay referenced for as long as ``bootstrap_mean_var`` is used.

    Raises:
        AssertionError: if ``query`` is left at its default of ``0``.
    """

    def __init__(self, X, query=0):
        # Explicit raise instead of `assert` so the check is not stripped by
        # `python -O`; the exception type is unchanged for existing callers.
        if query == 0:
            raise AssertionError("df_query_stats requires a query")
        global _X, df_query
        _X = X
        df_query = query

    def __del__(self):
        # Tolerate globals that are already gone (e.g. __init__ raised, or
        # another instance was finalized first) rather than raising inside
        # a finalizer.
        module_state = globals()
        module_state.pop("_X", None)
        module_state.pop("df_query", None)


def init_worker(X, query, crit):
    """
    Prepare a Process Pool worker's space.

    Stores the queried sub-frame and its summary counts in worker globals:

    * ``DF``       -- ``X.query(query)``
    * ``criteria`` -- ``crit``, the query string applied to each resample
    * ``TOTAL``    -- number of rows in ``DF``
    * ``CAT``      -- number of rows of ``DF`` that satisfy ``criteria``
    * ``excep``    -- ``None``, or an ``Exception`` describing an empty query

    An empty query is recorded, not raised, so that ``go`` can report it and
    return NaNs instead of failing the pool initializer.
    """
    global DF, criteria, excep, CAT, TOTAL
    criteria = crit
    excep = None
    DF = X.query(query)
    TOTAL = DF.shape[0]
    if TOTAL == 0:
        excep = Exception("Empty query: " + query)
    CAT = DF.query(criteria).shape[0]


def go(inner, seed):
    """
    Return sample results for one outer bootstrap replicate.

    Seeds Python's ``random`` with ``seed``, then draws ``inner`` resamples
    (with replacement) of ``K = TOTAL // 2`` rows from ``DF``. Each resample
    gets its own ``random_state`` drawn from the seeded generator. ``u`` is
    the number of resampled rows that satisfy ``criteria``.

    Returns:
        ``(np.log1p(mean(U)), var(U))`` where ``U`` holds both
        ``u / (K - u + 1)`` and ``(u + 1) / (K - u + 1)`` for every resample,
        or ``(nan, nan)`` (after printing the stored exception) when
        ``init_worker`` recorded an empty query.
    """
    random.seed(seed)
    if excep is not None:
        print(excep)
        return np.nan, np.nan

    K = TOTAL // 2

    def matching_rows():
        resampled = DF.sample(
            n=K, replace=True, random_state=random.randint(0, 2**32 - 1)
        )
        # Count rows, the same way init_worker computes CAT. The previous
        # `.count()[0]` counted non-null values of the first column only.
        return len(resampled.query(criteria))

    counts = np.array([matching_rows() for _ in range(inner)], dtype=float)
    # a single sampled odds ratio is biased, so we take an avg <u, u+1>
    denom = K - counts + 1  # always >= 1 because u <= K
    U = np.stack([counts / denom, (counts + 1) / denom])
    return np.log1p(np.mean(U)), np.var(U)


def criteria_query_size():
    """
    Return ``(CAT, TOTAL)`` as computed by ``init_worker`` in this worker.

    ``bootstrap_mean_var`` runs this before dispatching ``go``.
    """
    return CAT, TOTAL


def bootstrap_mean_var(outer, inner, specific_criteria, nthr=3):
    """
    bootstrap_mean_var(outer, inner, specific_criteria, nthr=3):

    outer = size of outer bootstrap, inner = ''
    specific_criteria = query such as num_comments > 0 for returning an
        indicator, used in bootstrap to estimate mean and variance of
        proportion specific_criteria / X & query**
        where query** used to construct df_query_stats()
    nthr = number of worker processes in the pool

    Requires a live ``df_query_stats`` instance; without one the ``_X`` /
    ``df_query`` globals do not exist and a ``NameError`` is raised.

    Returns:
        ``(L_, sizes)``: ``L_`` is a list of ``outer`` results from ``go``,
        each given its own random seed; ``sizes`` is a one-element list
        holding the ``(CAT, TOTAL)`` tuple reported by one worker.
    """
    tasks = [(inner, np.random.rand()) for _ in range(outer)]
    # The context manager shuts the workers down once the blocking starmap
    # calls have returned; previously the pool was never closed.
    with Pool(
        nthr,
        initializer=init_worker,
        initargs=(_X, df_query, specific_criteria),
    ) as pool:
        sizes = pool.starmap(criteria_query_size, [()])
        L_ = pool.starmap(go, tasks)
    return L_, sizes