ValueError al tratar de adaptarse a una scikit-learn Pipeline

votos
0

He estado recibiendo una ValueError continuación cuando trate de encajar mi Pipeline.

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.

Me han encargado de construir un modelo que combina las business_features de hogares de ancianos con sus resultados del ciclo 1 de la encuesta, así como el tiempo entre el estudio de los ciclos 1 y 2 para predecir la puntuación total del ciclo 2.

Este es mi código que estoy usando para realizar la tarea anterior.

# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference = self.col_1 - self.col_2
        return difference.values

# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)

# Using a custom column selecter transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
    ])

# Creating my survey_model Pipeline object
# Pipeline object is a 2 step process, first a feature union transforming 
# and combining the business features, cycle_1 features as well as time   
# feature; followed by fitting the transformed features into a            
# RandomForestRegressor
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])

# Trying to fit my Pipeline throws the ValueError described above
survey_model.fit(data, cycle_2_score.astype(int))

Algunos contexto adicional: Estoy construyendo este modelo para tener su método predict_proba hace pasar a un grado personalizado para un proyecto. El grado pasa una lista de los diccionarios al método de predecir o predict_proba estimador de mi, no una trama de datos. Esto significa que el modelo debe trabajar con ambos tipos de datos. Por esta razón, necesito proporcionar un ColumnSelectTransformer costumbre de usar en su lugar scikit-learn la propia ColumnTransformer.

A continuación se muestra el código adicional relacionada con las características de negocio y ColumnSelectTransformer

# Custom transformer to select columns from a dataframe and returns the   
# dataframe as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values

simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features)
])

Por último, a continuación es el error total recaudado

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-218-046724d81b69> in <module>()
----> 1 survey_model.fit(data, cycle_2_score.astype(int))

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    350             This estimator
    351         
--> 352         Xt, fit_params = self._fit(X, y, **fit_params)
    353         with _print_elapsed_time('Pipeline',
    354                                  self._log_message(len(self.steps) - 1)):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    315                 message_clsname='Pipeline',
    316                 message=self._log_message(step_idx),
--> 317                 **fit_params_steps[name])
    318             # Replace the transformer of the step with the fitted
    319             # transformer. This is necessary when loading the transformer

/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    919 
    920         if any(sparse.issparse(f) for f in Xs):
--> 921             Xs = sparse.hstack(Xs).tocsr()
    922         else:
    923             Xs = np.hstack(Xs)

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
    463 
    464     
--> 465     return bmat([blocks], format=format, dtype=dtype)
    466 
    467 

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
    584                                                     exp=brow_lengths[i],
    585                                                     got=A.shape[0]))
--> 586                     raise ValueError(msg)
    587 
    588                 if bcol_lengths[j] == 0:

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.

Además, los datos y metadatos se pueden conseguir aquí

%%bash
mkdir data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-train.csv -nc -P ./ml-data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-metadata.csv -nc -P ./ml-data
Publicado el 09/10/2019 a las 18:56
fuente por usuario
En otros idiomas...                            

Cookies help us deliver our services. By using our services, you agree to our use of cookies. Learn more