35. What’s Cooking in Python
This was adapted from https://www.kaggle.com/manuelatadvice/whats-cooking/noname/code.
#This imports a bunch of packages.
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from collections import Counter
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#from sklearn import grid_search
#If you load the data files locally, this seems to cause some issues, so we read them directly from GitHub.
from urllib.request import urlopen
urltrain= 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtrain.json'
urltest = 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtest.json'
train = pd.read_json(urlopen(urltrain))
test = pd.read_json(urlopen(urltest))
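A quick sanity check on the loaded data can be useful before modeling. A minimal sketch (the shapes printed here just confirm what was downloaded; the train output further below shows 39774 rows):
#Check the size of the training and test dataframes; test has no cuisine column yet.
print(train.shape)
print(test.shape)
train.head()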
#First we want to see the most popular cuisine for the naive model.
train.groupby('cuisine').size()
cuisine
brazilian 467
british 804
cajun_creole 1546
chinese 2673
filipino 755
french 2646
greek 1175
indian 3003
irish 667
italian 7838
jamaican 526
japanese 1423
korean 830
mexican 6438
moroccan 821
russian 489
southern_us 4320
spanish 989
thai 1539
vietnamese 825
dtype: int64
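Since italian is the largest class, a naive model that always predicts italian gives our baseline. A quick sketch of that baseline accuracy on the training data (7838 of 39774 recipes are italian, so roughly 20%):
#Share of training recipes labeled italian; this approximates the naive model's accuracy.
baseline = (train['cuisine'] == 'italian').mean()
print(baseline)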
#Here we write the most popular selection. This is the baseline by which we will judge other models.
test['cuisine']='italian'
#This is a much simpler version that selects out the columns id and cuisine.
submission=test[['id' , 'cuisine' ]]
#This is a more complex method I showed that gives the same result (note that .ix is deprecated; .loc works here).
#submission=pd.DataFrame(test.loc[:,['id' , 'cuisine' ]])
#This outputs the file.
submission.to_csv("1_cookingSubmission.csv",index=False)
from google.colab import files
files.download('1_cookingSubmission.csv')
#It seems there is some text cleaning to do, so we will need the NLTK lemmatizer.
stemmer = WordNetLemmatizer()
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/wordnet.zip.
True
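A quick check of what the lemmatizer does to a few plural ingredient words, using the stemmer object defined above (a minimal sketch for illustration):
#The WordNet lemmatizer maps plural nouns to their singular form.
print(stemmer.lemmatize('tomatoes'))   #tomato
print(stemmer.lemmatize('eggs'))       #egg
print(stemmer.lemmatize('shallots'))   #shallot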
train
| | id | cuisine | ingredients |
|---|---|---|---|
| 0 | 10259 | greek | [romaine lettuce, black olives, grape tomatoes... |
| 1 | 25693 | southern_us | [plain flour, ground pepper, salt, tomatoes, g... |
| 2 | 20130 | filipino | [eggs, pepper, salt, mayonaise, cooking oil, g... |
| 3 | 22213 | indian | [water, vegetable oil, wheat, salt] |
| 4 | 13162 | indian | [black pepper, shallots, cornflour, cayenne pe... |
| ... | ... | ... | ... |
| 39769 | 29109 | irish | [light brown sugar, granulated sugar, butter, ... |
| 39770 | 11462 | italian | [KRAFT Zesty Italian Dressing, purple onion, b... |
| 39771 | 2238 | irish | [eggs, citrus fruit, raisins, sourdough starte... |
| 39772 | 41882 | chinese | [boneless chicken skinless thigh, minced garli... |
| 39773 | 2362 | mexican | [green chile, jalapeno chilies, onions, ground... |

39774 rows × 3 columns
#We see this approach in one of the Python solutions.
train['ingredients_clean_string1'] = [','.join(z).strip() for z in train['ingredients']]
#We also know that we can do something similar through a lambda function.
strip = lambda x: ' , '.join(x).strip()
#Finally, we apply the lambda function with map.
train['ingredients_clean_string2'] = train['ingredients'].map(strip)
#Now that we have the lambda function, we can reuse it for the test dataset.
test['ingredients_clean_string1'] = test['ingredients'].map(strip)
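To see what the strip lambda actually produces, here is a quick illustration on a single ingredient list (compare with the ingredients_clean_string2 column shown at the end of the notebook):
#The lambda joins a list of ingredients into a single comma-separated string.
strip(['water', 'vegetable oil', 'wheat', 'salt'])
#'water , vegetable oil , wheat , salt'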
#We see this in one of the solutions. We can reconstruct it in a way that makes it a bit easier to follow, but I found that doing so took much longer to run.
#To interpret this, read from right to left.
train['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in train['ingredients']]
test['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in test['ingredients']]
train['ingredients_string1']
0 romaine lettuce black olives grape tomatoes ga...
1 plain flour ground pepper salt tomato ground b...
2 egg pepper salt mayonaise cooking oil green ch...
3 water vegetable oil wheat salt
4 black pepper shallot cornflour cayenne pepper ...
...
39769 light brown sugar granulated sugar butter warm...
39770 KRAFT Zesty Italian Dressing purple onion broc...
39771 egg citrus fruit raisin sourdough starter flou...
39772 boneless chicken skinless thigh minced garlic ...
39773 green chile jalapeno chilies onion ground blac...
Name: ingredients_string1, Length: 39774, dtype: object
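Note that lemmatize is applied to each whole ingredient string, so only single-word ingredients (eggs, shallots, tomatoes) get singularized; multi-word ingredients such as 'grape tomatoes' pass through unchanged. If you wanted to lemmatize every word, one possible variant is sketched below (a hypothetical alternative; it is not used in the rest of this notebook):
#Hypothetical alternative: lemmatize each word rather than each whole ingredient string.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(ingredient_list):
    words = re.sub('[^A-Za-z]', ' ', ' '.join(ingredient_list)).split()
    return ' '.join(lemmatizer.lemmatize(w) for w in words)

lemmatize_words(['romaine lettuce', 'black olives', 'grape tomatoes'])
#'romaine lettuce black olive grape tomato'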
ingredients = train['ingredients'].apply(lambda x:','.join(x))
ingredients
0 romaine lettuce,black olives,grape tomatoes,ga...
1 plain flour,ground pepper,salt,tomatoes,ground...
2 eggs,pepper,salt,mayonaise,cooking oil,green c...
3 water,vegetable oil,wheat,salt
4 black pepper,shallots,cornflour,cayenne pepper...
...
39769 light brown sugar,granulated sugar,butter,warm...
39770 KRAFT Zesty Italian Dressing,purple onion,broc...
39771 eggs,citrus fruit,raisins,sourdough starter,fl...
39772 boneless chicken skinless thigh,minced garlic,...
39773 green chile,jalapeno chilies,onions,ground bla...
Name: ingredients, Length: 39774, dtype: object
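Counter was imported at the top but not used yet; a small sketch of how it could summarize the most common raw ingredients across all training recipes:
#Count how often each raw ingredient appears in the training data.
ingredient_counts = Counter(ingredient for recipe in train['ingredients'] for ingredient in recipe)
ingredient_counts.most_common(10)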
#Now we will create a corpus.
corpustr = train['ingredients_string1']
corpusts = test['ingredients_string1']
corpustr
0 romaine lettuce black olives grape tomatoes ga...
1 plain flour ground pepper salt tomato ground b...
2 egg pepper salt mayonaise cooking oil green ch...
3 water vegetable oil wheat salt
4 black pepper shallot cornflour cayenne pepper ...
...
39769 light brown sugar granulated sugar butter warm...
39770 KRAFT Zesty Italian Dressing purple onion broc...
39771 egg citrus fruit raisin sourdough starter flou...
39772 boneless chicken skinless thigh minced garlic ...
39773 green chile jalapeno chilies onion ground blac...
Name: ingredients_string1, Length: 39774, dtype: object
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#You could develop an understanding of each parameter from the documentation above.
vectorizertr = TfidfVectorizer(stop_words='english',
ngram_range = ( 1 , 1 ),analyzer="word",
max_df = .57 , binary=False , token_pattern=r'\w+' , sublinear_tf=False)
#Note: vectorizerts is defined but not used below; the test corpus is transformed with vectorizertr, which was fit on the training data.
vectorizerts = TfidfVectorizer(stop_words='english')
#Note that this doesn't work with the .todense() option.
tfidftr=vectorizertr.fit_transform(corpustr)
predictors_tr = tfidftr
#Note that this doesn't work with the .todense() option. This creates a matrix of predictors from the corpus.
tfidfts=vectorizertr.transform(corpusts)
predictors_ts= tfidfts
#This is the target variable.
targets_tr = train['cuisine']
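It can help to confirm how large the resulting feature space is (the sparse matrix shown below has 2963 columns). A minimal sketch; note that in newer scikit-learn versions get_feature_names() has been replaced by get_feature_names_out():
#The TF-IDF matrix has one column per token in the learned vocabulary.
print(predictors_tr.shape)
print(len(vectorizertr.get_feature_names()))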
35.1. Logistic Regression and Regularization
Regularization can help us with the large feature matrix by adding a penalty for each parameter.
We find out how much regularization to apply via grid search (a search across hyperparameters).
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
From the documentation of the C parameter: "Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization."
#Logistic Regression.
parameters = {'C':[1, 10]}
#clf = LinearSVC()
clf = LogisticRegression()
predictors_tr
<39774x2963 sparse matrix of type '<class 'numpy.float64'>'
with 727921 stored elements in Compressed Sparse Row format>
from sklearn.model_selection import GridSearchCV
#This uses the associated parameters to search the grid space.
classifier = GridSearchCV(clf, parameters)
classifier=classifier.fit(predictors_tr,targets_tr)
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
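After fitting, the grid search object reports which value of C did best in cross-validation. A quick sketch of how to inspect it (the exact numbers will depend on the run):
#Best hyperparameter found by the grid search and its cross-validated accuracy.
print(classifier.best_params_)
print(classifier.best_score_)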
#This predicts the outcome for the test set.
predictions=classifier.predict(predictors_ts)
#This adds it to the resulting dataframe.
test['cuisine'] = predictions
#This creates the submission dataframe.
submission2=test[['id' , 'cuisine' ]]
#This outputs the file.
submission2.to_csv("2_logisticSubmission.csv",index=False)
from google.colab import files
files.download('2_logisticSubmission.csv')
from sklearn.ensemble import RandomForestClassifier
# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 10)
# Fit the training data to the cuisine labels and create the decision trees
forest = forest.fit(predictors_tr,targets_tr)
# Take the same decision trees and run them on the test data
predictions = forest.predict(predictors_ts)
#This adds it to the resulting dataframe.
test['cuisine'] = predictions
#This creates the submission dataframe.
submission3=test[['id' , 'cuisine' ]]
submission3.to_csv("3_random_submission.csv",index=False)
from google.colab import files
files.download('3_random_submission.csv')
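To compare the logistic regression and random forest models without submitting to Kaggle, you could cross-validate on the training data. A minimal sketch using cross_val_score (cv=3 is just an example; this is slow on the full matrix):
from sklearn.model_selection import cross_val_score
#Cross-validated accuracy on the training data for each model.
lr_scores = cross_val_score(LogisticRegression(), predictors_tr, targets_tr, cv=3)
rf_scores = cross_val_score(RandomForestClassifier(n_estimators=10), predictors_tr, targets_tr, cv=3)
print(lr_scores.mean(), rf_scores.mean())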
train
| | id | cuisine | ingredients | ingredients_clean_string1 | ingredients_clean_string2 | ingredients_string1 |
|---|---|---|---|---|---|---|
| 0 | 10259 | greek | [romaine lettuce, black olives, grape tomatoes... | romaine lettuce,black olives,grape tomatoes,ga... | romaine lettuce , black olives , grape tomatoe... | romaine lettuce black olives grape tomatoes ga... |
| 1 | 25693 | southern_us | [plain flour, ground pepper, salt, tomatoes, g... | plain flour,ground pepper,salt,tomatoes,ground... | plain flour , ground pepper , salt , tomatoes ... | plain flour ground pepper salt tomato ground b... |
| 2 | 20130 | filipino | [eggs, pepper, salt, mayonaise, cooking oil, g... | eggs,pepper,salt,mayonaise,cooking oil,green c... | eggs , pepper , salt , mayonaise , cooking oil... | egg pepper salt mayonaise cooking oil green ch... |
| 3 | 22213 | indian | [water, vegetable oil, wheat, salt] | water,vegetable oil,wheat,salt | water , vegetable oil , wheat , salt | water vegetable oil wheat salt |
| 4 | 13162 | indian | [black pepper, shallots, cornflour, cayenne pe... | black pepper,shallots,cornflour,cayenne pepper... | black pepper , shallots , cornflour , cayenne ... | black pepper shallot cornflour cayenne pepper ... |
| ... | ... | ... | ... | ... | ... | ... |
| 39769 | 29109 | irish | [light brown sugar, granulated sugar, butter, ... | light brown sugar,granulated sugar,butter,warm... | light brown sugar , granulated sugar , butter ... | light brown sugar granulated sugar butter warm... |
| 39770 | 11462 | italian | [KRAFT Zesty Italian Dressing, purple onion, b... | KRAFT Zesty Italian Dressing,purple onion,broc... | KRAFT Zesty Italian Dressing , purple onion , ... | KRAFT Zesty Italian Dressing purple onion broc... |
| 39771 | 2238 | irish | [eggs, citrus fruit, raisins, sourdough starte... | eggs,citrus fruit,raisins,sourdough starter,fl... | eggs , citrus fruit , raisins , sourdough star... | egg citrus fruit raisin sourdough starter flou... |
| 39772 | 41882 | chinese | [boneless chicken skinless thigh, minced garli... | boneless chicken skinless thigh,minced garlic,... | boneless chicken skinless thigh , minced garli... | boneless chicken skinless thigh minced garlic ... |
| 39773 | 2362 | mexican | [green chile, jalapeno chilies, onions, ground... | green chile,jalapeno chilies,onions,ground bla... | green chile , jalapeno chilies , onions , grou... | green chile jalapeno chilies onion ground blac... |

39774 rows × 6 columns