from utils.mf_predictions import *
Author: Ryan Harper
Data Source: Bosch Dataset via Kaggle
Background: Bosch is a home appliance and industrial tools manufacturing company. In 2017, Bosch supplied Kaggle.com with manufacturing data to promote a competition. The goal of the competition was to determine factors that influence whether or not the product passes the final response stage of manufacturing and to predict which products are likely to fail based on this manufacturing process.
The Data: Early exploration of this data will use a subset of the big data provided by Bosch. The data is provided by Hitesh, John, and Matthew via PDX Data Science Meetup. The data subset is divided into 2 groups of 3 files (3 training, 3 test). Each group has one csv file each for numerical features ('numeric'), dates ('date'), and the manufacturing path ('cat'). The data subset includes a larger percentage of products that failed the response test, but not much more is known about this subsampling method.
Assumptions: Each ID # represents a specific product, and there is only one product type. The differences in assembly are due to customization and/or differences between lines.
Goal: Predict which products will fail the response test.
# Retrieve vars from mf_stats_analysis.ipynb | quick var import
# IPython %store magic: restore variables persisted by the stats-analysis notebook.
%store -r skewed_features
%store -r sig_diff_list
sig_diff_list.append(len(mf_num_data.columns)-1) # Adding response column index
# mf_merged_train = merge_dfs(mf_num_data, mf_date_data, True)
# del mf_merged_train['L1_S24_D1562']
# mf_merged_test = merge_dfs(mf_num_data_test, mf_date_data_test, False)
## Code to find the random column that is not included in the test set
# Lists column names present in the training frame but absent from the
# test frame. NOTE(review): presumably this is 'Response' plus the extra
# feature column mentioned above — confirm against the cell output.
np.setdiff1d(np.array(mf_num_data.columns),np.array(mf_num_data_test.columns))
# Scorer wrapping the Matthews correlation coefficient (the Bosch Kaggle
# competition metric) for use with sklearn model-selection utilities.
ms_mcc = make_scorer(matthews_corrcoef)
# Fraction of data to hold out when splitting into train/test.
TEST_PER = .3
def process_predictions(df):
    """Split *df* into features/labels and prepare the held-out test matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Training frame containing a 'Response' column plus feature columns.

    Returns
    -------
    tuple
        (Xtr, ytr, Xte): training feature matrix, label vector, and the
        test feature matrix taken from the module-level ``mf_num_data_test``.
    """
    ytr = df['Response'].values
    # Non-inplace fillna: in-place fillna on a column-selected view raises
    # SettingWithCopyWarning and can silently mutate the caller's frame.
    Xtr = df.loc[:, ~df.columns.isin(['Response'])].fillna(0)
    # BUG FIX: the original assigned the TRAINING frame (mf_num_data) here,
    # but predictions are written against mf_num_data_test ids in
    # save_to_csv, so the test frame is the correct source.
    Xte = mf_num_data_test.fillna(0)
    return Xtr, ytr, Xte
def save_to_csv(pred, title):
    """Write predictions to *title* as a Kaggle-style (Id, Response) CSV.

    Parameters
    ----------
    pred : array-like
        Predicted Response values, aligned with the row order of the
        module-level ``mf_num_data_test``.
    title : str
        Output CSV path.
    """
    # Use the Id column (first column of the test frame) as the index so
    # the file has exactly one Id column.  The original kept Id both as
    # the index and as a regular column, duplicating it in the CSV, and
    # wrapped the frame in a redundant pd.DataFrame(...) call.
    final_predic = pd.DataFrame(
        {'Response': pred},
        index=pd.Index(mf_num_data_test.iloc[:, 0], name='Id'),
    )
    final_predic.to_csv(title)
def final_predictions(data, title):
    """Fit a balanced random forest on *data* and save test predictions.

    Parameters
    ----------
    data : pd.DataFrame
        Training frame containing a 'Response' column.
    title : str
        Path of the CSV file the predictions are written to.
    """
    # Prepare train features/labels and the held-out test matrix.
    Xtr, ytr, Xte = process_predictions(data)
    # class_weight='balanced' because failing products are the rare class.
    model = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                   n_jobs=4, class_weight='balanced',
                                   verbose=1)
    model.fit(Xtr, ytr)
    save_to_csv(model.predict(Xte), title)
# Log-transform features to reduce skew.
# BUG FIX: the original assignments aliased mf_num_data directly (no copy),
# so transformed_df, all_transformed_df and mf_num_data were all the SAME
# object — the skewed features ended up log-transformed twice (once per
# loop) and the source frame was mutated.  .copy() keeps them independent.
transformed_df = mf_num_data.copy()
for f in skewed_features:
    # log(1 + x) transform; assumes feature values are > -1 — TODO confirm
    transformed_df[f] = np.log(1 + transformed_df[f])

all_transformed_df = mf_num_data.copy()
# Transform every feature column, skipping the first (Id) and last
# (Response) columns.
for f in list(all_transformed_df.columns[1:-1]):
    all_transformed_df[f] = np.log(1 + all_transformed_df[f])
Function for Outputting Predictions to CSV:
NOTE: Best prediction score achieved so far (with the configuration below):
0.41037390395473183
RandomForestClassifier(n_estimators=100, criterion='entropy', n_jobs=4,class_weight='balanced', verbose=1)
RFE(estimator, step=.05)