AnalyticsDojo

What's Cooking in Python

rpi.analyticsdojo.com

35. What’s Cooking in Python

This was adapted from https://www.kaggle.com/manuelatadvice/whats-cooking/noname/code.

#This imports the packages we need.  
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from collections import Counter
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#from sklearn import grid_search  #deprecated; GridSearchCV from sklearn.model_selection is used below


#If you load the JSON files locally rather than from these URLs, the parsing can cause some issues.  
from urllib.request import urlopen

urltrain= 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtrain.json'
urltest = 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtest.json'


train = pd.read_json(urlopen(urltrain))
test = pd.read_json(urlopen(urltest))
#First we want to see the most popular cuisine for the naive model. 
train.groupby('cuisine').size()
cuisine
brazilian        467
british          804
cajun_creole    1546
chinese         2673
filipino         755
french          2646
greek           1175
indian          3003
irish            667
italian         7838
jamaican         526
japanese        1423
korean           830
mexican         6438
moroccan         821
russian          489
southern_us     4320
spanish          989
thai            1539
vietnamese       825
dtype: int64
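Italian is the most common cuisine, so a naive model that always predicts "italian" is the natural baseline. A quick sketch (not part of the original notebook) of the accuracy that constant prediction would get on the training data:

#Share of training rows that are italian: 7838 / 39774, roughly 0.197.
baseline_accuracy = (train['cuisine'] == 'italian').mean()
print(baseline_accuracy)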
#Here we predict the most popular cuisine for every test row.  This is the baseline against which we will judge other models. 
test['cuisine']='italian'
#This is a simpler version that selects just the id and cuisine columns.
submission=test[['id' ,  'cuisine' ]]
#This is a more complex method that gives the same result (.ix is deprecated, so .loc is used here).
#submission=pd.DataFrame(test.loc[:,['id' ,  'cuisine' ]])
#This outputs the file.
submission.to_csv("1_cookingSubmission.csv",index=False)
from google.colab import files
files.download('1_cookingSubmission.csv')
#Some ingredients appear in both singular and plural forms, so we use the NLTK WordNetLemmatizer to normalize them.  
stemmer = WordNetLemmatizer()
nltk.download('wordnet')

 
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
True
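As a small illustration (not from the original notebook) of what the lemmatizer does, it maps plural ingredient words to their singular form:

#The lemmatizer normalizes plurals; these are the kinds of changes visible in the cleaned strings below.
print(stemmer.lemmatize('tomatoes'))  # tomato
print(stemmer.lemmatize('eggs'))      # egg
print(stemmer.lemmatize('olives'))    # olive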
train
id cuisine ingredients
0 10259 greek [romaine lettuce, black olives, grape tomatoes...
1 25693 southern_us [plain flour, ground pepper, salt, tomatoes, g...
2 20130 filipino [eggs, pepper, salt, mayonaise, cooking oil, g...
3 22213 indian [water, vegetable oil, wheat, salt]
4 13162 indian [black pepper, shallots, cornflour, cayenne pe...
... ... ... ...
39769 29109 irish [light brown sugar, granulated sugar, butter, ...
39770 11462 italian [KRAFT Zesty Italian Dressing, purple onion, b...
39771 2238 irish [eggs, citrus fruit, raisins, sourdough starte...
39772 41882 chinese [boneless chicken skinless thigh, minced garli...
39773 2362 mexican [green chile, jalapeno chilies, onions, ground...

39774 rows × 3 columns

#We see this in one of the published Python solutions: join each ingredient list into a single comma-separated string. 
train['ingredients_clean_string1'] = [','.join(z).strip() for z in train['ingredients']] 

#We can do something similar through a lambda function. 
strip = lambda x: ' , '.join(x).strip() 
#Then we apply the lambda to each row with map.
train['ingredients_clean_string2'] = train['ingredients'].map(strip)

#Now that we used the lambda function, we can reuse this for the test dataset. 
test['ingredients_clean_string1'] = test['ingredients'].map(strip)
 
#We see this in one of the solutions.  It could be rewritten in a way that is a bit easier to follow (a step-by-step sketch is shown after this cell), but I found the rewritten version ran much more slowly.  

#To interpret this, read from right to left. 
train['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in train['ingredients']]       
test['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in test['ingredients']]       
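If the nested comprehension is hard to read, here is a step-by-step sketch of an equivalent version (clean_ingredients is a hypothetical helper name, not from the original notebook):

#Equivalent to the comprehension above, spelled out one step at a time.
lemmatizer = WordNetLemmatizer()

def clean_ingredients(ingredient_list):
    cleaned = []
    for line in ingredient_list:
        letters_only = re.sub('[^A-Za-z]', ' ', line)       #keep letters only
        cleaned.append(lemmatizer.lemmatize(letters_only))   #lemmatize the whole ingredient string
    return ' '.join(cleaned).strip()

#Note that lemmatize acts on each full ingredient string, so only single-word
#ingredients (e.g. 'tomatoes') are actually changed, matching the cells above.
#train['ingredients_string1'] = train['ingredients'].map(clean_ingredients)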


train['ingredients_string1']
0        romaine lettuce black olives grape tomatoes ga...
1        plain flour ground pepper salt tomato ground b...
2        egg pepper salt mayonaise cooking oil green ch...
3                           water vegetable oil wheat salt
4        black pepper shallot cornflour cayenne pepper ...
                               ...                        
39769    light brown sugar granulated sugar butter warm...
39770    KRAFT Zesty Italian Dressing purple onion broc...
39771    egg citrus fruit raisin sourdough starter flou...
39772    boneless chicken skinless thigh minced garlic ...
39773    green chile jalapeno chilies onion ground blac...
Name: ingredients_string1, Length: 39774, dtype: object
ingredients = train['ingredients'].apply(lambda x:','.join(x))
ingredients
0        romaine lettuce,black olives,grape tomatoes,ga...
1        plain flour,ground pepper,salt,tomatoes,ground...
2        eggs,pepper,salt,mayonaise,cooking oil,green c...
3                           water,vegetable oil,wheat,salt
4        black pepper,shallots,cornflour,cayenne pepper...
                               ...                        
39769    light brown sugar,granulated sugar,butter,warm...
39770    KRAFT Zesty Italian Dressing,purple onion,broc...
39771    eggs,citrus fruit,raisins,sourdough starter,fl...
39772    boneless chicken skinless thigh,minced garlic,...
39773    green chile,jalapeno chilies,onions,ground bla...
Name: ingredients, Length: 39774, dtype: object
#Now we will create a corpus.
corpustr = train['ingredients_string1']
corpusts = test['ingredients_string1']
corpustr
0        romaine lettuce black olives grape tomatoes ga...
1        plain flour ground pepper salt tomato ground b...
2        egg pepper salt mayonaise cooking oil green ch...
3                           water vegetable oil wheat salt
4        black pepper shallot cornflour cayenne pepper ...
                               ...                        
39769    light brown sugar granulated sugar butter warm...
39770    KRAFT Zesty Italian Dressing purple onion broc...
39771    egg citrus fruit raisin sourdough starter flou...
39772    boneless chicken skinless thigh minced garlic ...
39773    green chile jalapeno chilies onion ground blac...
Name: ingredients_string1, Length: 39774, dtype: object
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#Review the documentation above to understand what each parameter does.  
vectorizertr = TfidfVectorizer(stop_words='english',
                             ngram_range = ( 1 , 1 ),analyzer="word", 
                             max_df = .57 , binary=False , token_pattern=r'\w+' , sublinear_tf=False)
vectorizerts = TfidfVectorizer(stop_words='english') #Defined but not used below; the test corpus is transformed with vectorizertr so both sets share the same features.
#Note that converting the result to a dense matrix with .todense() causes problems here, so we keep it sparse.  
tfidftr=vectorizertr.fit_transform(corpustr)
predictors_tr = tfidftr
#Again we keep the sparse matrix rather than calling .todense().  This creates a matrix of predictors from the test corpus. 
tfidfts=vectorizertr.transform(corpusts)
predictors_ts= tfidfts
#This is the target variable.  
targets_tr = train['cuisine']
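As a quick sanity check (a sketch, not in the original notebook), we can look at the size of the vocabulary the vectorizer learned and a few of its tokens:

#Number of TF-IDF features (the 2963 columns of predictors_tr shown below) and a sample of tokens.
print(len(vectorizertr.vocabulary_))
print(sorted(vectorizertr.vocabulary_)[:10])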

35.1. Logistic Regression and Regularization

  • Regularization can help us with the large feature matrix by adding a penalty for each parameter.

  • We find how much regularization to apply via grid search (a search across hyperparameter values).

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

From the documentation, C is the inverse of regularization strength; it must be a positive float. As in support vector machines, smaller values specify stronger regularization.
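To see what C does in practice, here is a rough sketch (not in the original notebook, and slow on the full matrix) comparing coefficient sizes under weak and strong penalties:

#Smaller C means a stronger penalty, which shrinks the coefficients toward zero.
import numpy as np
for C in [0.01, 1, 100]:
    model = LogisticRegression(C=C, solver='lbfgs', max_iter=1000)
    model.fit(predictors_tr, targets_tr)
    print(C, np.abs(model.coef_).mean())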
#Logistic Regression. 
parameters = {'C':[1, 10]}
#clf = LinearSVC()
clf = LogisticRegression()

predictors_tr
<39774x2963 sparse matrix of type '<class 'numpy.float64'>'
	with 727921 stored elements in Compressed Sparse Row format>
from sklearn.model_selection import GridSearchCV
#This uses the parameter grid defined above to search over hyperparameter values. 
classifier = GridSearchCV(clf, parameters)
classifier=classifier.fit(predictors_tr,targets_tr)

/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
  warnings.warn(CV_WARNING, FutureWarning)
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
  "this warning.", FutureWarning)
#This predicts the outcome for the test set. 
predictions=classifier.predict(predictors_ts)
#This adds it to the resulting dataframe. 
test['cuisine'] = predictions
#This creates the submission dataframe
submission2=test[['id' ,  'cuisine' ]]
#This outputs the file.
submission2.to_csv("2_logisticSubmission.csv",index=False)
from google.colab import files
files.download('2_logisticSubmission.csv')
from sklearn.ensemble import RandomForestClassifier 

# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 10)

# Fit the training data to the cuisine labels and create the decision trees
forest = forest.fit(predictors_tr,targets_tr)

# Take the same decision trees and run them on the test data
predictions = forest.predict(predictors_ts)
#This adds it to the resulting dataframe. 
test['cuisine'] = predictions
#This creates the submission dataframe
submission3=test[['id' ,  'cuisine' ]]
submission3.to_csv("3_random_submission.csv",index=False)
from google.colab import files
files.download('3_random_submission.csv')
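A random forest also provides feature importances. Here is a sketch (not in the original notebook) that maps them back to TF-IDF tokens to see which ingredient words the forest relies on most; it assumes vectorizertr.get_feature_names(), available in the scikit-learn version this notebook ran on (newer versions use get_feature_names_out()).

import numpy as np
#Top 10 TF-IDF tokens by importance in the fitted forest.
feature_names = np.array(vectorizertr.get_feature_names())
top = np.argsort(forest.feature_importances_)[::-1][:10]
for name, importance in zip(feature_names[top], forest.feature_importances_[top]):
    print(name, round(importance, 4))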
ingredients = train['ingredients'].apply(lambda x:','.join(x))
ingredients
train
id cuisine ingredients ingredients_clean_string1 ingredients_clean_string2 ingredients_string1
0 10259 greek [romaine lettuce, black olives, grape tomatoes... romaine lettuce,black olives,grape tomatoes,ga... romaine lettuce , black olives , grape tomatoe... romaine lettuce black olives grape tomatoes ga...
1 25693 southern_us [plain flour, ground pepper, salt, tomatoes, g... plain flour,ground pepper,salt,tomatoes,ground... plain flour , ground pepper , salt , tomatoes ... plain flour ground pepper salt tomato ground b...
2 20130 filipino [eggs, pepper, salt, mayonaise, cooking oil, g... eggs,pepper,salt,mayonaise,cooking oil,green c... eggs , pepper , salt , mayonaise , cooking oil... egg pepper salt mayonaise cooking oil green ch...
3 22213 indian [water, vegetable oil, wheat, salt] water,vegetable oil,wheat,salt water , vegetable oil , wheat , salt water vegetable oil wheat salt
4 13162 indian [black pepper, shallots, cornflour, cayenne pe... black pepper,shallots,cornflour,cayenne pepper... black pepper , shallots , cornflour , cayenne ... black pepper shallot cornflour cayenne pepper ...
... ... ... ... ... ... ...
39769 29109 irish [light brown sugar, granulated sugar, butter, ... light brown sugar,granulated sugar,butter,warm... light brown sugar , granulated sugar , butter ... light brown sugar granulated sugar butter warm...
39770 11462 italian [KRAFT Zesty Italian Dressing, purple onion, b... KRAFT Zesty Italian Dressing,purple onion,broc... KRAFT Zesty Italian Dressing , purple onion , ... KRAFT Zesty Italian Dressing purple onion broc...
39771 2238 irish [eggs, citrus fruit, raisins, sourdough starte... eggs,citrus fruit,raisins,sourdough starter,fl... eggs , citrus fruit , raisins , sourdough star... egg citrus fruit raisin sourdough starter flou...
39772 41882 chinese [boneless chicken skinless thigh, minced garli... boneless chicken skinless thigh,minced garlic,... boneless chicken skinless thigh , minced garli... boneless chicken skinless thigh minced garlic ...
39773 2362 mexican [green chile, jalapeno chilies, onions, ground... green chile,jalapeno chilies,onions,ground bla... green chile , jalapeno chilies , onions , grou... green chile jalapeno chilies onion ground blac...

39774 rows × 6 columns