import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, ParameterGrid, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion

import itertools

import concurrent.futures
from itertools import repeat

from sklearn.base import is_classifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection._split import check_cv

from sklearn.externals.joblib import Parallel, delayed

from sklearn.base import clone

from sklearn.model_selection._search import BaseSearchCV

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

The Elegance and Inefficiencies of GridSearching a Pipeline

The Set Up/Problem

The idea for this comes from my growing habit of using pipelines to organize my data modeling process, as well as using them as a way to try out many different parameters and methodologies at once. By its nature this is a time-intensive process, as it creates many thousands of models (depending on how many parameters you are trying), but I also realized that the way in which I format and search my pipelines was aggravating the problem.

To understand where the issue comes into play, I will illustrate the framework I use for pipelines below.

  • Pipeline
    • Feature Union - This Feature Union holds pipelines for cleaning/transforming all the data I want to use for the model.
    • Middle Steps - Whether this is scaling all the data, or a form of feature selection.
    • Modeling - This is the final step of the pipeline where the data is given to the model to be used.

    Insert Drawing Here

The issue here is in the Feature Union. Since I like to use the pipeline to clean my data, I believe time is lost because, for every variation of a model that is tried, the entire pipeline is run, which means that data which has already been fit is fit again every time anyway. While this has most likely had a trivial impact for my uses so far, which rely on simple cleaning methods like mapping values, as I’ve moved into more intensive processes like Natural Language Processing I think this becomes a much more significant time issue.

Now a simple solution would be to remove the Feature Union from the pipeline, fit it separately, and then use the remaining pipeline for the grid search; however, that loses the ability to also easily GridSearch pieces of the data cleaning process.

So my goal is to confirm that the problem I think should exist truly has an impact, and then implement my own estimator class as a wrapper for GridSearch to better optimize for my specific use case and pipeline format.

Sample Data

I will be using a consistent set of “Economic News” data to run these tests. I will not do a Train-Test split because I really don’t care about the actual results of the model here, just the time it takes to fit.

# Load three columns of the news data, selected by position; the recorded
# output shows them as relevance, headline and text.
df = pd.read_csv('sample_data/economic_news.csv', usecols=[7, 11, 14])
# Drop the literal '</br>' markers from the article text.  The vectorized
# string accessor passes missing values through instead of raising on them
# ('</br>' contains no regex metacharacters, so it matches literally).
df['text'] = df['text'].str.replace('</br>', '')
# Encode the label as 1 for 'yes' and 0 for anything else.
df['relevance'] = (df['relevance'] == 'yes').astype(int)
(8000, 3)
relevance headline text
0 1 Yields on CDs Fell in the Latest Week NEW YORK -- Yields on most certificates of dep...
1 0 The Morning Brief: White House Seeks to Limit ... The Wall Street Journal OnlineThe Morning Brie...
2 0 Banking Bill Negotiators Set Compromise --- Pl... WASHINGTON -- In an effort to achieve banking ...
3 0 Manager's Journal: Sniffing Out Drug Abusers I... The statistics on the enormous costs of employ...
4 1 Currency Trading: Dollar Remains in Tight Rang... NEW YORK -- Indecision marked the dollar's ton...

Proving the Problem

In this section I will look at the amount of time it takes for a few types of data cleaning/transformation of different complexities, and how they scale on their own and within GridSearches, in order to see how much time constantly refitting data is costing.

# Shared configuration for the timing runs.

# How many times to grid-search something in each run.
reps = [1, 5, 10, 25, 50, 75, 100, 150, 200, 250, 500, 750, 1000, 2500, 5000]

tmp = []
rs = 779
# Numeric feature: 100 * row index * one uniform random draw per row.
# Drawing all values in a single call consumes the global NumPy random
# stream in the same row order as drawing them one at a time.
df['nums'] = 100 * df.index.values * np.random.rand(len(df))
relevance headline text nums
0 1 Yields on CDs Fell in the Latest Week NEW YORK -- Yields on most certificates of dep... 0.000000
1 0 The Morning Brief: White House Seeks to Limit ... The Wall Street Journal OnlineThe Morning Brie... 72.259737
2 0 Banking Bill Negotiators Set Compromise --- Pl... WASHINGTON -- In an effort to achieve banking ... 184.913663
3 0 Manager's Journal: Sniffing Out Drug Abusers I... The statistics on the enormous costs of employ... 94.107106
4 1 Currency Trading: Dollar Remains in Tight Rang... NEW YORK -- Indecision marked the dollar's ton... 336.160412
# Result table: one row per entry of `reps`, with `fits` being twice `reps`.
eff_orig = pd.DataFrame({'reps': reps})
eff_orig['fits'] = eff_orig['reps'] * 2
reps fits
0 1 2
1 5 10
2 10 20
3 25 50
4 50 100
5 75 150
6 100 200
7 150 300
8 200 400
9 250 500
10 500 1000
11 750 1500
12 1000 2000
13 2500 5000
14 5000 10000
eff_cached = eff_orig.copy()


def _time_plain_grid_search(union_step, X, y, grid_sizes):
    """Time GridSearchCV over a FeatureUnion -> LogisticRegression pipeline.

    NOTE(review): the exported notebook lost several lines of the original
    loops (closing brackets, the parameter grid, the timer and ``fit`` calls
    and the append to the result list).  They are reconstructed from what
    survived: the ``diff = stop - start`` arithmetic, the call tails
    ``,1), df.relevance)`` / ``, df.relevance)`` and the recorded output
    (two fits per candidate, best params ``{'model__random_state': 0}``).
    Confirm against the original notebook.

    Args:
        union_step: ``(name, transformer)`` pair used as the only entry of
            the FeatureUnion.
        X: training input handed to ``GridSearchCV.fit``.
        y: target handed to ``GridSearchCV.fit``.
        grid_sizes: iterable of candidate counts; each count ``x`` is timed
            with the grid ``model__random_state in range(x)``.

    Returns:
        List of elapsed wall-clock seconds, one per entry of ``grid_sizes``.
    """
    timings = []
    for x in grid_sizes:
        fu = FeatureUnion([union_step])
        pipe = Pipeline([
            ('fu', fu),
            ('model', LogisticRegression()),
        ])
        # Varying only the seed gives x candidates without changing the model.
        params = {'model__random_state': list(range(x))}
        gs = GridSearchCV(pipe, params, verbose=1, n_jobs=-1, cv=2)
        # start timer
        start = datetime.now()
        gs.fit(X, y)
        # end timer
        stop = datetime.now()
        timings.append((stop - start).total_seconds())
    return timings


# The numeric transformers get the synthetic column as a 2-D array.
_nums_2d = df.nums.values.reshape(-1, 1)

eff_ss = _time_plain_grid_search(
    ('pre-process', StandardScaler()), _nums_2d, df.relevance, reps)
eff_orig['ss_orig'] = eff_ss

eff_pf = _time_plain_grid_search(
    ('pre-process', PolynomialFeatures()), _nums_2d, df.relevance, reps)
eff_orig['pf_orig'] = eff_pf

# The text vectorizers get the raw article text.
eff_cv = _time_plain_grid_search(
    ('pre-process', CountVectorizer()), df.text, df.relevance, reps)
eff_orig['cv_orig'] = eff_cv

eff_tv = _time_plain_grid_search(
    ('pre-process', TfidfVectorizer()), df.text, df.relevance, reps)
eff_orig['tv_orig'] = eff_tv

eff_cv_td = _time_plain_grid_search(
    ('text', Pipeline([
        ('pre-process', CountVectorizer()),
        ('truncate', TruncatedSVD(n_components=1, random_state=rs)),
    ])),
    df.text, df.relevance, reps)
eff_orig['cv_td_orig'] = eff_cv_td

eff_tv_td = _time_plain_grid_search(
    ('text', Pipeline([
        ('pre-process', TfidfVectorizer()),
        ('truncate', TruncatedSVD(n_components=1, random_state=rs)),
    ])),
    df.text, df.relevance, reps)
eff_orig['tv_td_orig'] = eff_tv_td

Save Eff table to csv so no need to rerun

reps fits ss_orig pf_orig cv_orig tv_orig cv_td_orig tv_td_orig
0 1 2 0.225094 0.210267 6.232115 4.133495 4.286770 4.524483
1 5 10 0.194651 0.201753 11.817011 8.571318 8.815194 9.548219
2 10 20 0.207121 0.206395 18.046697 13.498752 13.731576 15.187596
3 25 50 0.310750 0.312342 39.263028 31.436065 31.830212 32.558791
4 50 100 0.424310 0.407476 73.355245 58.983665 59.402505 60.533114
5 75 150 0.521270 0.522610 106.176211 86.964126 88.038196 89.465739
6 100 200 0.527857 0.547297 139.342304 115.449600 116.504539 118.238184
7 150 300 0.715433 0.746119 207.939189 172.870768 174.088018 175.437701
8 200 400 0.841306 0.806791 274.387210 228.160459 230.957761 233.329937
9 250 500 0.994794 0.991691 341.709382 285.126249 288.773181 292.719185
10 500 1000 1.674138 1.710237 678.446888 568.165274 573.782917 579.152579
11 750 1500 2.371095 2.430035 1014.473684 851.777699 861.632572 865.554674
12 1000 2000 3.139456 3.147182 1352.139445 1134.881215 1146.537355 1154.976613
13 2500 5000 7.991735 8.531802 3367.759985 2829.689819 2864.428314 2883.357771
14 5000 10000 16.730484 15.057567 6756.219261 5671.877889 5720.519433 5791.198268
eff_orig.to_csv('efficiencyDForig.csv', index=False)

Try with Pipeline Memory Caching

from tempfile import mkdtemp
from sklearn.externals.joblib import Memory
cachedir = mkdtemp()
def _time_cached_grid_search(union_step, X, y, grid_sizes):
    """Time GridSearchCV over a memory-cached FeatureUnion -> model pipeline.

    Same benchmark as the un-cached runs, except that the outer Pipeline is
    built with ``memory=Memory(cachedir=cachedir, verbose=0)``.

    NOTE(review): the exported notebook lost several lines of the original
    loops (closing brackets of the FeatureUnion, the parameter grid, the
    timer and ``fit`` calls and the append to the result list).  They are
    reconstructed from the surviving fragments and the recorded output (two
    fits per candidate).  Confirm against the original notebook.
    NOTE(review): every run shares the same ``cachedir``, so later runs may
    reuse entries written by earlier ones.

    Args:
        union_step: ``(name, transformer)`` pair used as the only entry of
            the FeatureUnion.
        X: training input handed to ``GridSearchCV.fit``.
        y: target handed to ``GridSearchCV.fit``.
        grid_sizes: iterable of candidate counts; each count ``x`` is timed
            with the grid ``model__random_state in range(x)``.

    Returns:
        List of elapsed wall-clock seconds, one per entry of ``grid_sizes``.
    """
    timings = []
    for x in grid_sizes:
        fu = FeatureUnion([union_step])
        pipe = Pipeline([
            ('fu', fu),
            ('model', LogisticRegression()),
        ], memory=Memory(cachedir=cachedir, verbose=0))
        # Varying only the seed gives x candidates without changing the model.
        params = {'model__random_state': list(range(x))}
        gs = GridSearchCV(pipe, params, verbose=1, n_jobs=-1, cv=2)
        # start timer
        start = datetime.now()
        gs.fit(X, y)
        # end timer
        stop = datetime.now()
        timings.append((stop - start).total_seconds())
    return timings


# The numeric transformers get the synthetic column as a 2-D array.
_nums_2d_cached = df.nums.values.reshape(-1, 1)

eff_ss = _time_cached_grid_search(
    ('pre-process', StandardScaler()), _nums_2d_cached, df.relevance, reps)
eff_cached['ss_cached'] = eff_ss

eff_pf = _time_cached_grid_search(
    ('pre-process', PolynomialFeatures()), _nums_2d_cached, df.relevance, reps)
eff_cached['pf_cached'] = eff_pf

# The text vectorizers get the raw article text.
eff_cv = _time_cached_grid_search(
    ('pre-process', CountVectorizer()), df.text, df.relevance, reps)
eff_cached['cv_cached'] = eff_cv

eff_tv = _time_cached_grid_search(
    ('pre-process', TfidfVectorizer()), df.text, df.relevance, reps)
eff_cached['tv_cached'] = eff_tv

eff_cv_td = _time_cached_grid_search(
    ('text', Pipeline([
        ('pre-process', CountVectorizer()),
        ('truncate', TruncatedSVD(n_components=1, random_state=rs)),
    ])),
    df.text, df.relevance, reps)
eff_cached['cv_td_cached'] = eff_cv_td

eff_tv_td = _time_cached_grid_search(
    ('text', Pipeline([
        ('pre-process', TfidfVectorizer()),
        ('truncate', TruncatedSVD(n_components=1, random_state=rs)),
    ])),
    df.text, df.relevance, reps)
eff_cached['tv_td_cached'] = eff_tv_td
reps fits ss_cached pf_cached cv_cached tv_cached cv_td_cached tv_td_cached
0 1 2 0.220370 0.213159 8.577361 6.538034 6.739061 6.718483
1 5 10 0.210165 0.217480 10.221625 7.189253 6.298098 6.342444
2 10 20 0.211886 0.215039 15.564601 11.348357 10.216127 10.532659
3 25 50 0.326096 0.325405 34.597346 26.822049 24.723027 25.100113
4 50 100 0.417474 0.424614 64.362823 50.839350 47.331228 48.007842
5 75 150 0.598978 0.532198 94.691714 75.714144 70.753043 71.175799
6 100 200 0.657977 0.604862 125.989611 100.350146 93.655833 94.807464
7 150 300 0.848278 0.831244 187.994630 150.029235 139.665444 141.309259
8 200 400 1.000400 1.069972 249.437291 199.660035 186.383742 188.557826
9 250 500 1.278369 1.291126 310.134293 249.871598 232.914126 235.549512
10 500 1000 2.243304 2.248786 616.757385 499.087750 464.231040 468.925885
11 750 1500 3.232118 3.320747 919.324976 745.893859 696.744014 702.962536
12 1000 2000 4.202490 4.257102 1233.318759 992.546725 932.418590 937.363592
13 2500 5000 10.107030 10.359197 3063.400708 2481.567094 2326.713119 2337.549000
14 5000 10000 19.715110 20.546900 6141.560259 4969.542471 4642.505104 4678.323244
eff_cached.to_csv('efficiencyDFcached.csv', index=False)

Visualize Times

eff_orig = pd.read_csv('efficiencyDForig.csv')
reps fits ss_orig pf_orig cv_orig tv_orig cv_td_orig tv_td_orig
0 1 2 0.225094 0.210267 6.232115 4.133495 4.286770 4.524483
1 5 10 0.194651 0.201753 11.817011 8.571318 8.815194 9.548219
2 10 20 0.207121 0.206395 18.046697 13.498752 13.731576 15.187596
3 25 50 0.310750 0.312342 39.263028 31.436065 31.830212 32.558791
4 50 100 0.424310 0.407476 73.355245 58.983665 59.402505 60.533114
5 75 150 0.521270 0.522610 106.176211 86.964126 88.038196 89.465739
6 100 200 0.527857 0.547297 139.342304 115.449600 116.504539 118.238184
7 150 300 0.715433 0.746119 207.939189 172.870768 174.088018 175.437701
8 200 400 0.841306 0.806791 274.387210 228.160459 230.957761 233.329937
9 250 500 0.994794 0.991691 341.709382 285.126249 288.773181 292.719185
10 500 1000 1.674138 1.710237 678.446888 568.165274 573.782917 579.152579
11 750 1500 2.371095 2.430035 1014.473684 851.777699 861.632572 865.554674
12 1000 2000 3.139456 3.147182 1352.139445 1134.881215 1146.537355 1154.976613
13 2500 5000 7.991735 8.531802 3367.759985 2829.689819 2864.428314 2883.357771
14 5000 10000 16.730484 15.057567 6756.219261 5671.877889 5720.519433 5791.198268
eff_cached = pd.read_csv('efficiencyDFcached.csv')
reps fits ss_cached pf_cached cv_cached tv_cached cv_td_cached tv_td_cached
0 1 2 0.220370 0.213159 8.577361 6.538034 6.739061 6.718483
1 5 10 0.210165 0.217480 10.221625 7.189253 6.298098 6.342444
2 10 20 0.211886 0.215039 15.564601 11.348357 10.216127 10.532659
3 25 50 0.326096 0.325405 34.597346 26.822049 24.723027 25.100113
4 50 100 0.417474 0.424614 64.362823 50.839350 47.331228 48.007842
5 75 150 0.598978 0.532198 94.691714 75.714144 70.753043 71.175799
6 100 200 0.657977 0.604862 125.989611 100.350146 93.655833 94.807464
7 150 300 0.848278 0.831244 187.994630 150.029235 139.665444 141.309259
8 200 400 1.000400 1.069972 249.437291 199.660035 186.383742 188.557826
9 250 500 1.278369 1.291126 310.134293 249.871598 232.914126 235.549512
10 500 1000 2.243304 2.248786 616.757385 499.087750 464.231040 468.925885
11 750 1500 3.232118 3.320747 919.324976 745.893859 696.744014 702.962536
12 1000 2000 4.202490 4.257102 1233.318759 992.546725 932.418590 937.363592
13 2500 5000 10.107030 10.359197 3063.400708 2481.567094 2326.713119 2337.549000
14 5000 10000 19.715110 20.546900 6141.560259 4969.542471 4642.505104 4678.323244
# One figure with every un-truncated transformer: red = original pipeline,
# blue = memory-cached pipeline, line style = transformer.
fig, ax = plt.subplots(figsize=(16,8))

# (table, column, colour, line style, legend label), in drawing order.
curves = [
    (eff_orig, 'ss_orig', 'red', '-', 'Standard Scaler Original'),
    (eff_orig, 'pf_orig', 'red', '--', 'Polynomial Feature Original'),
    (eff_orig, 'cv_orig', 'red', '-.', 'Count Vectorizer Original'),
    (eff_orig, 'tv_orig', 'red', ':', 'TfidVectorizer Original'),
    (eff_cached, 'ss_cached', 'blue', '-', 'Standard Scaler Memory Cached'),
    (eff_cached, 'pf_cached', 'blue', '--', 'Polynomial Feature Memory Cached'),
    (eff_cached, 'cv_cached', 'blue', '-.', 'Count Vectorizer Memory Cached'),
    (eff_cached, 'tv_cached', 'blue', ':', 'TfidVectorizer Memory Cached'),
]
for table, column, colour, style, text in curves:
    ax.plot(table['fits'], table[column],
            lw=3, color=colour, linestyle=style, label=text)

ax.set_xlabel('# Fits')
ax.set_ylabel('Time (s)')
ax.set_title("Time to Fit Pipelines")
ax.legend(loc = 'best')


# Numeric transformers only: red = original pipeline, blue = memory-cached.
fig, ax = plt.subplots(figsize=(16,8))

# (table, column, colour, line style, legend label), in drawing order.
curves = [
    (eff_orig, 'ss_orig', 'red', '-', 'Standard Scaler Original'),
    (eff_orig, 'pf_orig', 'red', '--', 'Polynomial Feature Original'),
    (eff_cached, 'ss_cached', 'blue', '-', 'Standard Scaler Memory Cached'),
    (eff_cached, 'pf_cached', 'blue', '--', 'Polynomial Feature Memory Cached'),
]
for table, column, colour, style, text in curves:
    ax.plot(table['fits'], table[column],
            lw=3, color=colour, linestyle=style, label=text)

ax.set_xlabel('# Fits')
ax.set_ylabel('Time (s)')
ax.set_title("Time to Fit Pipelines")
ax.legend(loc = 'best')


# Text vectorizers only: red = original pipeline, blue = memory-cached.
fig, ax = plt.subplots(figsize=(16,8))

# (table, column, colour, line style, legend label), in drawing order.
curves = [
    (eff_orig, 'cv_orig', 'red', '-', 'Count Vectorizer Original'),
    (eff_orig, 'tv_orig', 'red', '--', 'TfidVectorizer Original'),
    (eff_cached, 'cv_cached', 'blue', '-', 'Count Vectorizer Memory Cached'),
    (eff_cached, 'tv_cached', 'blue', '--', 'TfidVectorizer Memory Cached'),
]
for table, column, colour, style, text in curves:
    ax.plot(table['fits'], table[column],
            lw=3, color=colour, linestyle=style, label=text)

ax.set_xlabel('# Fits')
ax.set_ylabel('Time (s)')
ax.set_title("Time to Fit Pipelines")
ax.legend(loc = 'best')


# 3x2 grid of panels, one per transformer, each comparing the original
# (red) and memory-cached (blue) timings.
fig = plt.figure(figsize=(20,25))

# (column stem, panel title) in subplot order 1..6.
panels = [
    ('ss', "Time to Fit Standard Scalar Pipeline"),
    ('pf', "Time to Fit Polynomial Feature Pipeline"),
    ('cv', "Time to Fit Count Vectorizer Pipeline"),
    ('tv', "Time to Fit TfidVectorizer Pipeline"),
    ('cv_td', "Time to Fit Count Vectorizer with Truncation Pipeline"),
    ('tv_td', "Time to TfidVectorizor with Truncation Pipeline"),
]

drawn = []
for slot, (stem, heading) in enumerate(panels, start=1):
    panel = fig.add_subplot(3, 2, slot)
    panel.plot(eff_orig['fits'], eff_orig[stem + '_orig'],
               lw=3, color='red', label='Original')
    panel.plot(eff_cached['fits'], eff_cached[stem + '_cached'],
               lw=3, color='blue', label='Memory Cached')
    panel.set_xlabel('# Fits')
    panel.set_ylabel('Time (s)')
    panel.set_title(heading)
    panel.legend(loc = 'best')
    drawn.append(panel)

# Keep the per-panel names available at module level.
ax1, ax2, ax3, ax4, ax5, ax6 = drawn


Solving the Issue

import pprint
pp = pprint.PrettyPrinter()
# Demo pipeline used to exercise the search: three FeatureUnion entries that
# all point at the same text pipeline, followed by the model.
# NOTE(review): the closing brackets were missing in the exported notebook
# and are restored here.
text_pipe = Pipeline([('nlp', CountVectorizer())])

fu = FeatureUnion([
    ('text', text_pipe),
    ('text2', text_pipe),
    ('BLAHBLAH', text_pipe)
])

modeling_pipe = Pipeline([
    ('data', fu),
    ('model', LogisticRegression())
])
def logan_search_fit_helper(pipe, params, fset):
    """Score one parameter candidate on already-transformed CV folds.

    NOTE(review): the ``fit`` calls and the ``else:`` line of this function
    were garbled in the exported notebook and are reconstructed here.  The
    visible code never applied ``params`` to ``pipe``; they are applied below
    so that each candidate is actually evaluated with its own settings.

    Args:
        pipe: estimator exposing ``set_params``, ``fit`` and ``score``.
        params: dict of parameters to apply to ``pipe`` before fitting.
        fset: iterable of ``(X_train, X_test, y_train, y_test)`` tuples.

    Returns:
        Tuple ``(mean score over the folds, params)``.
    """
    pipe.set_params(**params)
    scores = []
    for k in fset:
        # k[0]=Xtr, k[1]=Xte, k[2]=ytr, k[3]=yte
        if isinstance(k[0], np.ndarray):
            # NumPy folds are passed through as they are.
            pipe.fit(k[0], k[2])
            scores.append(pipe.score(k[1], k[3]))
        else:
            # Any other container is copied before it is handed over.
            pipe.fit(k[0].copy(), k[2].copy())
            scores.append(pipe.score(k[1].copy(), k[3].copy()))
    return (np.mean(scores), params)
class LoganSearch(BaseSearchCV):
    """Grid search that transforms each CV fold once per feature-union variant.

    ``fit`` builds every FeatureUnion variant described by ``fu_params``,
    transforms the cross-validation folds once per variant, and then scores
    every ``param_grid`` candidate of ``estimator`` on those transformed
    folds, so the feature union is not refit for every candidate.

    NOTE(review): several lines of this class were missing from the exported
    notebook (``else:`` branches, the ``delayed(...)`` call, the score
    accumulation and the ``set_params``/``append`` calls of the variant
    builder) and are reconstructed here.  ``best_estimator_`` was never
    assigned in the visible code; it is now refit on the full data.  Confirm
    against the original source.

    Parameters
    ----------
    fu : FeatureUnion
        Data-preparation step; its ``transformer_list`` is the base variant.
    estimator : estimator
        Model (or pipeline) fitted on the transformed folds.
    param_grid : dict or list of dicts
        Grid for ``estimator``, in the format ``ParameterGrid`` accepts.
    fu_params : dict, optional
        Grid for the feature union.  A key equal to a transformer name maps
        to a list of replacement transformers; a key of the form
        ``<name>__<param>`` maps to a list of values for that parameter.
    n_jobs : int
        Passed to ``Parallel``.
    cv : int or CV splitter
        Passed to ``check_cv``.
    verbose : int
        Passed to ``Parallel``; values above 0 also print the fit count.

    Attributes
    ----------
    best_score_, best_params_ : best mean fold score and its parameters.
    best_estimator_ : Pipeline of the best feature union and estimator,
        refit on all of ``X``/``y``; ``None`` before ``fit``.
    """

    def __init__(self, fu, estimator, param_grid, fu_params=None,
                 n_jobs=1, cv=3, verbose=0):
        # sklearn's get_params()/clone() look attributes up under the
        # constructor argument names, so keep the arguments available as given.
        self.fu = fu
        self.estimator = estimator
        self.param_grid = param_grid
        self.fu_params = fu_params
        self.n_jobs = n_jobs
        self.cv = cv
        self.verbose = verbose
        # set unique attributes
        self._fus_ = []
        self._base_fu = fu
        # A fresh dict per instance avoids sharing one mutable default.
        self._base_fu_params = {} if fu_params is None else fu_params
        self.cv_ = cv
        # set attributes for results
        self.best_estimator_ = None
        self.best_score_ = None
        self.best_params_ = None
        # set attributes for GridSearch
        self._estimator = estimator
        self._param_grid = param_grid
        self._n_jobs = n_jobs
        self._verbose = verbose

    @staticmethod
    def _take(data, idx):
        """Select rows by position from a pandas object or an array."""
        if hasattr(data, 'iloc'):
            return data.iloc[idx]
        return data[idx]

    def fit(self, X, y):
        """Search both grids and refit the best combination on ``X``/``y``.

        Raises ValueError when the feature union yields no variant to
        evaluate.  Returns ``self``.
        """
        self.__set_all_fus(self._base_fu, self._base_fu_params)
        if not self._fus_:
            raise ValueError('The feature union has no transformers to search over')
        folder = check_cv(cv=self.cv_, y=y,
                          classifier=is_classifier(self._estimator))
        candidates = list(ParameterGrid(self._param_grid))
        n_splits = folder.get_n_splits(X, y)
        n_candidates = len(candidates)

        # (mean score, estimator params, feature union) for every evaluation.
        results = []
        for fu in self._fus_:
            # Transform every fold once; all candidates reuse these folds.
            fset = []
            for tr, te in folder.split(X, y):
                ytmp_tr = self._take(y, tr)
                ytmp_te = self._take(y, te)
                Xtmp_tr = fu.fit_transform(self._take(X, tr), ytmp_tr)
                Xtmp_te = fu.transform(self._take(X, te))
                fset.append((Xtmp_tr, Xtmp_te, ytmp_tr, ytmp_te))
            print('Done Transforming Data')
            if self._verbose > 0:
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(n_splits, n_candidates,
                                         n_candidates * n_splits))

            tmp_results = Parallel(n_jobs=self._n_jobs, verbose=self._verbose)(
                delayed(logan_search_fit_helper)(
                    clone(self._estimator), params, fset)
                for params in candidates)
            results.extend((score, params, fu) for score, params in tmp_results)

        # max() keeps the earliest of tied candidates, like a stable
        # descending sort followed by taking the first element.
        best_score, best_params, best_fu = max(results, key=lambda r: r[0])
        self.best_score_ = best_score
        self.best_params_ = best_params
        # Refit on all the data so score/predict work on raw inputs.
        self.best_estimator_ = Pipeline([
            ('fu', clone(best_fu)),
            ('estimator', clone(self._estimator).set_params(**best_params)),
        ]).fit(X, y)
        return self

    def score(self, X, y):
        """Score the refit best estimator; prints a notice if not yet fit."""
        if self.best_estimator_ is None:
            print('Model must first be fit')
            return None
        return self.best_estimator_.score(X, y)

    def predict(self, X, y=None):
        """Predict with the refit best estimator; ``y`` is accepted but ignored."""
        if self.best_estimator_ is None:
            print('Model must first be fit')
            return None
        return self.best_estimator_.predict(X)

    def predict_proba(self, X, y=None):
        """Class probabilities from the best estimator; ``y`` is ignored."""
        if self.best_estimator_ is None:
            print('Model must first be fit')
            return None
        return self.best_estimator_.predict_proba(X)

    def __set_all_fus(self, base_fu, fu_params):
        """Expand ``base_fu`` into every variant described by ``fu_params``.

        Walks the transformers of ``base_fu`` in order and extends the list
        of partial feature unions with every alternative of the current
        transformer.  The result is stored in ``self._fus_``.
        """
        tmp_fus = []
        for p_name, pipe in base_fu.transformer_list:
            prefix = p_name + "__"
            # Keys addressing this transformer, bare name (no '__') first.
            tmp_params = sorted(
                [k for k in fu_params if k.split("__")[0] == p_name],
                key=lambda x: x.count('__'))

            # there are no parameters matching this pipe, so add as is
            if not tmp_params:
                if not tmp_fus:
                    # list of feature unions is empty
                    tmp_fus.append(FeatureUnion([(p_name, pipe)]))
                else:
                    # already feature unions, append to them
                    for tmp_fu in tmp_fus:
                        tmp_fu.transformer_list.append((p_name, pipe))
                continue

            # there are parameters to change
            tmp_pipes = [pipe]
            if tmp_params[0] == p_name:
                # A bare-name key replaces the transformer with each listed one.
                tmp_pipes = fu_params[p_name]
                tmp_params = tmp_params[1:]

            if tmp_params:
                param_dict = {k: fu_params[k] for k in tmp_params}
                new_pipes = []
                for base_pipe in tmp_pipes:
                    for param_comb in ParameterGrid(param_dict):
                        tmp_pipe = clone(base_pipe)
                        # Slice the '<name>__' prefix off; str.lstrip would
                        # strip a character set and could eat into the name.
                        params_edited = {k[len(prefix):]: v
                                         for k, v in param_comb.items()}
                        tmp_pipe.set_params(**params_edited)
                        new_pipes.append(tmp_pipe)
            else:
                new_pipes = tmp_pipes

            # add new pipes to feature unions
            new_fus = []
            if not tmp_fus:
                # list of feature unions is empty
                for new_pipe in new_pipes:
                    new_fus.append(FeatureUnion([(p_name, new_pipe)]))
            else:
                # already feature unions, branch each one per new pipe
                for tmp_fu in tmp_fus:
                    for new_pipe in new_pipes:
                        new_fu = clone(tmp_fu)
                        new_fu.transformer_list.append((p_name, new_pipe))
                        new_fus.append(new_fu)
            tmp_fus = new_fus
        self._fus_ = tmp_fus
        return None

    @staticmethod
    def add_params(obj, params):
        """Set ``obj.params`` to ``params`` if ``obj`` has that attribute."""
        if hasattr(obj, "params"):
            obj.params = params
        return obj
# Demo objects for trying out the feature-union grids.
# NOTE(review): closing brackets and some grid entries were missing in the
# exported notebook.  Brackets are restored; entries that could not be
# recovered are marked below and must be confirmed against the original.
text_pipe = Pipeline([('nlp', CountVectorizer())])

fu = FeatureUnion([
    ('text', text_pipe),
    ('text2', text_pipe),
    ('BLAHBLAH', text_pipe)
])

modeling_pipe = Pipeline([
    ('data', fu),
    ('model', LogisticRegression())
])

# NOTE(review): the original grid entries were lost; an empty grid evaluates
# the pipeline once with its current settings.
params = {}

gs = GridSearchCV(modeling_pipe, params, cv=3, n_jobs=-1, verbose=1)

# Swap the 'nlp' step of the 'text' pipeline.
# NOTE(review): the second candidate is presumed to be TfidfVectorizer().
fu_params1 = {
    'text__nlp': [CountVectorizer(),
                  TfidfVectorizer()]
}

# Swap the whole 'text' pipeline.
fu_params2 = {
    'text': [Pipeline([('nlp2a', CountVectorizer())]),
             Pipeline([('nlp2b', TfidfVectorizer())])]
}

# Combine step replacement, a step parameter and a pipeline replacement.
# NOTE(review): the second 'text__nlp' candidate is presumed, as above.
fu_params3 = {
    'text__nlp': [CountVectorizer(),
                  TfidfVectorizer()],
    'text__nlp__ngram_range':[(1,1), (1,2)],
    'text2': [Pipeline([('nlp2a', CountVectorizer())]),
              Pipeline([('nlp2b', TfidfVectorizer())])]
}
# Side-by-side check of LoganSearch and GridSearchCV on the numeric column.
# NOTE(review): closing brackets, the grid entry and the fit calls were
# garbled in the exported notebook.  The grid is reconstructed from the
# recorded output (5000 candidates, best params {'model__random_state': 0})
# and the fit calls from their surviving tails ",1), df.relevance)".
tfu = FeatureUnion([
    ('text', StandardScaler())
])

# Model-only pipeline for LoganSearch, which receives the union separately.
tfpipe = Pipeline([
    ('model', LogisticRegression())
])

# Full pipeline for the GridSearchCV baseline.
tpipe = Pipeline([
    ('data', tfu),
    ('model', LogisticRegression())
])

tparams = {
    'model__random_state': list(range(5000))
}

cv = 2

ls = LoganSearch(tfu, tfpipe, tparams, n_jobs=-1, cv=2, verbose=1)
ls.fit(df.nums.values.reshape(-1,1), df.relevance)
pp.pprint((ls.best_score_, ls.best_params_))

gs = GridSearchCV(tpipe, tparams, cv=cv, n_jobs=-1, verbose=1)
gs.fit(df.nums.values.reshape(-1,1), df.relevance)
pp.pprint((gs.best_score_, gs.best_params_))
Fitting 2 folds for each of 5000 candidates, totalling 10000 fits

[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:    8.0s finished

(0.82250000000000001, {'model__random_state': 0})
Fitting 2 folds for each of 5000 candidates, totalling 10000 fits

[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 2368 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 5868 tasks      | elapsed:    8.5s

(0.82250000000000001, {'model__random_state': 0})

[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:   14.0s finished

Run Test with New Function

I will now run the same tests as previously performed; however, I will be using the new LoganSearch class instead of GridSearchCV, and the pipeline will be formatted slightly differently to account for that, but will still follow the same overall framework.

Reset Params to make it easier to run only parts of the notebook

# Reload three columns of the news data, selected by position; the recorded
# output shows them as relevance, headline and text.
df = pd.read_csv('sample_data/economic_news.csv', usecols=[7, 11, 14])
# Drop the literal '</br>' markers from the article text.  The vectorized
# string accessor passes missing values through instead of raising on them
# ('</br>' contains no regex metacharacters, so it matches literally).
df['text'] = df['text'].str.replace('</br>', '')
# Encode the label as 1 for 'yes' and 0 for anything else.
df['relevance'] = (df['relevance'] == 'yes').astype(int)
(8000, 3)
relevance headline text
0 1 Yields on CDs Fell in the Latest Week NEW YORK -- Yields on most certificates of dep...
1 0 The Morning Brief: White House Seeks to Limit ... The Wall Street Journal OnlineThe Morning Brie...
2 0 Banking Bill Negotiators Set Compromise --- Pl... WASHINGTON -- In an effort to achieve banking ...
3 0 Manager's Journal: Sniffing Out Drug Abusers I... The statistics on the enormous costs of employ...
4 1 Currency Trading: Dollar Remains in Tight Rang... NEW YORK -- Indecision marked the dollar's ton...
# Shared configuration for the timing runs (repeated so this part of the
# notebook can be run on its own).

# How many times to grid-search something in each run.
reps = [1, 5, 10, 25, 50, 75, 100, 150, 200, 250, 500, 750, 1000, 2500, 5000]

tmp = []
rs = 779
# Numeric feature: 100 * row index * one uniform random draw per row.
# Drawing all values in a single call consumes the global NumPy random
# stream in the same row order as drawing them one at a time.
df['nums'] = 100 * df.index.values * np.random.rand(len(df))
relevance headline text nums
0 1 Yields on CDs Fell in the Latest Week NEW YORK -- Yields on most certificates of dep... 0.000000
1 0 The Morning Brief: White House Seeks to Limit ... The Wall Street Journal OnlineThe Morning Brie... 72.259737
2 0 Banking Bill Negotiators Set Compromise --- Pl... WASHINGTON -- In an effort to achieve banking ... 184.913663
3 0 Manager's Journal: Sniffing Out Drug Abusers I... The statistics on the enormous costs of employ... 94.107106
4 1 Currency Trading: Dollar Remains in Tight Rang... NEW YORK -- Indecision marked the dollar's ton... 336.160412
# Results table: one row per benchmark size. 'fits' is twice 'reps'
# (the searches in this notebook are run with cv=2).
eff_logan = pd.DataFrame({'reps': reps})
eff_logan['fits'] = eff_logan['reps'] * 2
reps fits
0 1 2
1 5 10
2 10 20
3 25 50
4 50 100
5 75 150
6 100 200
7 150 300
8 200 400
9 250 500
10 500 1000
11 750 1500
12 1000 2000
13 2500 5000
14 5000 10000
eff_ss = []
for x in reps:
    fu = FeatureUnion([
        ('pre-process', StandardScaler())
    pipe = Pipeline([
        ('model', LogisticRegression())
    params = {
    ls = LoganSearch(fu, pipe, params, n_jobs=-1, cv=2, verbose=1)
    #start timer
    start =,1), df.relevance)
    #end timer
    stop =
    diff = stop - start
eff_logan['ss_logan'] = eff_ss
eff_pf = []
for x in reps:
    fu = FeatureUnion([
        ('pre-process', PolynomialFeatures())
    pipe = Pipeline([
        ('model', LogisticRegression())
    params = {
    ls = LoganSearch(fu, pipe, params, n_jobs=-1, cv=2, verbose=1)
    #start timer
    start =,1), df.relevance)
    #end timer
    stop =
    diff = stop - start
eff_logan['pf_logan'] = eff_pf
eff_cv = []
for x in reps:
    fu = FeatureUnion([
        ('pre-process', CountVectorizer())
    pipe = Pipeline([
        ('model', LogisticRegression())
    params = {
    ls = LoganSearch(fu, pipe, params, n_jobs=-1, cv=2, verbose=1)
    #start timer
    start =, df.relevance)
    #end timer
    stop =
    diff = stop - start
eff_logan['cv_logan'] = eff_cv
Done Transforming Data
Fitting 2 folds for each of 1 candidates, totalling 2 fits

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.1s finished


NameError                                 Traceback (most recent call last)

<ipython-input-144-a8aa73131bd1> in <module>()
     21     stop =
     22     diff = stop - start
---> 23     eff_tv.append(diff.total_seconds())
     25 eff_logan['cv_logan'] = eff_cv

NameError: name 'eff_tv' is not defined
eff_tv = []
for x in reps:
    fu = FeatureUnion([
        ('pre-process', TfidfVectorizer())
    pipe = Pipeline([
        ('model', LogisticRegression())
    params = {
    ls = LoganSearch(fu, pipe, params, n_jobs=-1, cv=2, verbose=1)
    #start timer
    start =, df.relevance)
    #end timer
    stop =
    diff = stop - start
eff_logan['tv_logan'] = eff_tv
eff_cv_td = []
for x in reps:
    fu = FeatureUnion([
        ('text', Pipeline([
            ('pre-process', CountVectorizer()),
            ('truncate', TruncatedSVD(n_components=1, random_state=rs))
    pipe = Pipeline([
        ('model', LogisticRegression())
    params = {
    ls = LoganSearch(fu, pipe, params, n_jobs=-1, cv=2, verbose=1)
    #start timer
    start =, df.relevance)
    #end timer
    stop =
    diff = stop - start
eff_logan['cv_td_logan'] = eff_cv_td
eff_tv_td = []
for x in reps:
    fu = FeatureUnion([
        ('text', Pipeline([
            ('pre-process', TfidfVectorizer()),
            ('truncate', TruncatedSVD(n_components=1, random_state=rs))
    pipe = Pipeline([
        ('model', LogisticRegression())
    params = {
    ls = LoganSearch(fu, pipe, params, n_jobs=-1, cv=2, verbose=1)
    #start timer
    start =, df.relevance)
    #end timer
    stop =
    diff = stop - start
eff_logan['tv_td_logan'] = eff_tv_td

Save to file

reps fits ss_logan pf_logan cv_logan tv_logan cv_td_logan tv_td_logan
0 1 2 0.223909 0.221798 6.232115 3.607295 3.185836 3.321109
1 5 10 0.242797 0.226656 11.817011 3.725415 3.517862 3.260362
2 10 20 0.224017 0.232498 18.046697 4.194502 3.550491 3.297210
3 25 50 0.331112 0.346223 39.263028 5.359049 3.521202 3.375086
4 50 100 0.334166 0.456564 73.355245 7.684017 3.552729 3.456631
5 75 150 0.331425 0.460818 106.176211 9.456448 3.465066 3.469402
6 100 200 0.433914 0.566037 139.342304 11.695089 3.420275 3.470165
7 150 300 0.570316 0.570319 207.939189 15.477273 3.633281 3.938534
8 200 400 0.630874 0.773557 274.387210 18.273176 3.749651 3.728653
9 250 500 0.757504 0.880124 341.709382 22.261672 3.892925 3.772603
10 500 1000 1.172229 1.387544 678.446888 37.435749 4.684382 4.733261
11 750 1500 1.525951 1.821197 1014.473684 57.568446 4.804530 5.297128
12 1000 2000 1.926059 2.506788 1352.139445 79.162025 5.350899 5.342550
13 2500 5000 4.827360 5.549650 3367.759985 186.087186 8.388797 7.705426
14 5000 10000 8.561794 9.462130 6756.219261 356.825039 14.033000 12.286837
# Write the timing table to CSV without the index column; the file is read
# back in the visualization section.
eff_logan.to_csv('efficiencyDFlogan.csv', index=False)

Visualization and Conclusion

# Reload the saved timing tables for the three search strategies:
# original GridSearchCV, memory-cached pipeline, and the new search.
eff_orig, eff_cached, eff_logan = map(pd.read_csv, (
    'efficiencyDForig.csv',
    'efficiencyDFcached.csv',
    'efficiencyDFlogan.csv',
))
reps fits ss_logan pf_logan cv_logan tv_logan cv_td_logan tv_td_logan
0 1 2 0.223909 0.221798 6.232115 3.607295 3.185836 3.321109
1 5 10 0.242797 0.226656 11.817011 3.725415 3.517862 3.260362
2 10 20 0.224017 0.232498 18.046697 4.194502 3.550491 3.297210
3 25 50 0.331112 0.346223 39.263028 5.359049 3.521202 3.375086
4 50 100 0.334166 0.456564 73.355245 7.684017 3.552729 3.456631
5 75 150 0.331425 0.460818 106.176211 9.456448 3.465066 3.469402
6 100 200 0.433914 0.566037 139.342304 11.695089 3.420275 3.470165
7 150 300 0.570316 0.570319 207.939189 15.477273 3.633281 3.938534
8 200 400 0.630874 0.773557 274.387210 18.273176 3.749651 3.728653
9 250 500 0.757504 0.880124 341.709382 22.261672 3.892925 3.772603
10 500 1000 1.172229 1.387544 678.446888 37.435749 4.684382 4.733261
11 750 1500 1.525951 1.821197 1014.473684 57.568446 4.804530 5.297128
12 1000 2000 1.926059 2.506788 1352.139445 79.162025 5.350899 5.342550
13 2500 5000 4.827360 5.549650 3367.759985 186.087186 8.388797 7.705426
14 5000 10000 8.561794 9.462130 6756.219261 356.825039 14.033000 12.286837
# All four single-step pipelines on one axes. Colour identifies the search
# method (red = original, blue = memory cached, black = new); line style
# identifies the pre-processing step.
fig, ax = plt.subplots(figsize=(16, 8))

# (timing table, column to plot, line styling) in legend order.
curves = [
    (eff_orig, 'ss_orig',
     dict(lw=2.5, color='red', label='Standard Scaler Original')),
    (eff_orig, 'pf_orig',
     dict(lw=2.5, color='red', linestyle='--',
          label='Polynomial Feature Original')),
    (eff_orig, 'cv_orig',
     dict(lw=2.5, color='red', linestyle='-.',
          label='Count Vectorizer Original')),
    (eff_orig, 'tv_orig',
     dict(lw=3, color='red', linestyle=':',
          label='TfidVectorizer Original')),

    (eff_cached, 'ss_cached',
     dict(lw=2.5, color='blue', label='Standard Scaler Memory Cached')),
    (eff_cached, 'pf_cached',
     dict(lw=2.5, color='blue', linestyle='--',
          label='Polynomial Feature Memory Cached')),
    (eff_cached, 'cv_cached',
     dict(lw=2.5, color='blue', linestyle='-.',
          label='Count Vectorizer Memory Cached')),
    (eff_cached, 'tv_cached',
     dict(lw=3, color='blue', linestyle=':',
          label='TfidVectorizer Memory Cached')),

    (eff_logan, 'ss_logan',
     dict(lw=2.5, color='black', label='Standard Scaler New')),
    (eff_logan, 'pf_logan',
     dict(lw=2.5, color='black', linestyle='--',
          label='Polynomial Feature New')),
    (eff_logan, 'cv_logan',
     dict(lw=2.5, color='black', linestyle='-.',
          label='Count Vectorizer New')),
    (eff_logan, 'tv_logan',
     dict(lw=3, color='black', linestyle=':',
          label='TfidVectorizer New')),
]
for timings, column, line_style in curves:
    ax.plot(timings['fits'], timings[column], **line_style)

ax.set_xlabel('# Fits')
ax.set_ylabel('Time (s)')
ax.set_title("Time to Fit Pipelines")
ax.legend(loc='best')


# Numeric pre-processing only (Standard Scaler and Polynomial Feature), so
# these curves can be compared on their own scale. Colour identifies the
# search method; solid vs dashed identifies the pre-processing step.
fig, ax = plt.subplots(figsize=(16, 8))

# (timing table, column to plot, line styling) in legend order.
curves = [
    (eff_orig, 'ss_orig',
     dict(lw=2.5, color='red', label='Standard Scaler Original')),
    (eff_orig, 'pf_orig',
     dict(lw=2.5, color='red', linestyle='--',
          label='Polynomial Feature Original')),

    (eff_cached, 'ss_cached',
     dict(lw=2.5, color='blue', label='Standard Scaler Memory Cached')),
    (eff_cached, 'pf_cached',
     dict(lw=2.5, color='blue', linestyle='--',
          label='Polynomial Feature Memory Cached')),

    (eff_logan, 'ss_logan',
     dict(lw=2.5, color='black', label='Standard Scaler New')),
    (eff_logan, 'pf_logan',
     dict(lw=2.5, color='black', linestyle='--',
          label='Polynomial Feature New')),
]
for timings, column, line_style in curves:
    ax.plot(timings['fits'], timings[column], **line_style)

ax.set_xlabel('# Fits')
ax.set_ylabel('Time (s)')
ax.set_title("Time to Fit Pipelines")
ax.legend(loc='best')


# Text vectorizers only (Count Vectorizer and TfidVectorizer). Colour
# identifies the search method; solid vs dashed identifies the vectorizer.
fig, ax = plt.subplots(figsize=(16, 8))

# (timing table, column to plot, line styling) in legend order.
curves = [
    (eff_orig, 'cv_orig',
     dict(lw=2.5, color='red', linestyle='-',
          label='Count Vectorizer Original')),
    (eff_orig, 'tv_orig',
     dict(lw=3, color='red', linestyle='--',
          label='TfidVectorizer Original')),

    (eff_cached, 'cv_cached',
     dict(lw=2.5, color='blue', linestyle='-',
          label='Count Vectorizer Memory Cached')),
    (eff_cached, 'tv_cached',
     dict(lw=3, color='blue', linestyle='--',
          label='TfidVectorizer Memory Cached')),

    (eff_logan, 'cv_logan',
     dict(lw=2.5, color='black', linestyle='-',
          label='Count Vectorizer New')),
    (eff_logan, 'tv_logan',
     dict(lw=3, color='black', linestyle='--',
          label='TfidVectorizer New')),
]
for timings, column, line_style in curves:
    ax.plot(timings['fits'], timings[column], **line_style)

ax.set_xlabel('# Fits')
ax.set_ylabel('Time (s)')
ax.set_title("Time to Fit Pipelines")
ax.legend(loc='best')


Visualization by Pipeline

reps fits ss_logan pf_logan cv_logan tv_logan cv_td_logan tv_td_logan
0 1 2 0.337648 0.329182 4.815905 2.197995 2.043521 2.265842
1 5 10 0.336465 0.344149 6.568332 2.512184 2.158687 2.040282
2 10 20 0.341959 0.336109 8.806089 2.928364 2.333201 2.061443
3 25 50 0.437900 0.423550 16.460682 4.176842 2.383285 2.382574
4 50 100 0.434349 0.542314 28.065589 6.179202 2.338781 2.343176
5 75 150 0.562190 0.641041 40.617662 8.128802 2.553450 2.371559
6 100 200 0.619316 0.646805 50.271060 10.255086 2.554965 2.452167
7 150 300 0.785087 0.839261 71.012454 14.410503 2.780835 2.431730
8 200 400 0.887184 0.883647 90.469659 18.092645 2.831297 2.582043
9 250 500 1.088669 1.099337 110.879169 22.177632 2.991412 2.716854
10 500 1000 1.838825 1.630478 226.336279 42.359731 4.005885 3.290292
11 750 1500 2.096881 2.278316 334.308263 62.534262 4.399962 3.818105
12 1000 2000 2.996025 2.956987 453.728119 82.371572 5.327830 4.707286
13 2500 5000 5.997752 6.383000 1120.259022 202.750489 9.461915 7.908639
14 5000 10000 11.603223 13.809455 2160.935821 406.772957 16.150460 13.800006
# One panel per pipeline in a 3x2 grid; each panel compares the three search
# methods (original, memory cached, new) for that pipeline.
fig = plt.figure(figsize=(20, 25))

# (column prefix, panel title) in grid order: left to right, top to bottom.
# Titles use the same spellings as the legend labels of the earlier figures
# ("Standard Scaler", "TfidVectorizer") and all follow "Time to Fit ...".
panels = [
    ('ss', "Time to Fit Standard Scaler Pipeline"),
    ('pf', "Time to Fit Polynomial Feature Pipeline"),
    ('cv', "Time to Fit Count Vectorizer Pipeline"),
    ('tv', "Time to Fit TfidVectorizer Pipeline"),
    ('cv_td', "Time to Fit Count Vectorizer with Truncation Pipeline"),
    ('tv_td', "Time to Fit TfidVectorizer with Truncation Pipeline"),
]

# (timing table, column suffix, line colour, legend label) per search method.
methods = [
    (eff_orig, 'orig', 'red', 'Original'),
    (eff_cached, 'cached', 'blue', 'Memory Cached'),
    (eff_logan, 'logan', 'black', 'New Method'),
]

panel_axes = []
for position, (prefix, title) in enumerate(panels, start=1):
    panel = fig.add_subplot(3, 2, position)
    for timings, suffix, colour, label in methods:
        # Timing columns are named '<pipeline prefix>_<method suffix>'.
        panel.plot(timings['fits'], timings['{}_{}'.format(prefix, suffix)],
                   lw=3, color=colour, label=label)
    # Label through the Axes object itself rather than relying on pyplot's
    # implicit "current axes" state.
    panel.set_xlabel('# Fits')
    panel.set_ylabel('Time (s)')
    panel.set_title(title)
    panel.legend(loc='best')
    panel_axes.append(panel)

# Keep the individual panel handles available under their original names.
ax1, ax2, ax3, ax4, ax5, ax6 = panel_axes
