35. What’s Cooking in Python
This was adapted from https://www.kaggle.com/manuelatadvice/whats-cooking/noname/code.
#This imports a bunch of packages.
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from collections import Counter
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
#from sklearn import grid_search
#If you load the data files locally, this seems to cause some issues, so we read them directly from GitHub.
from urllib.request import urlopen
urltrain= 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtrain.json'
urltest = 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/whatscooking/whatscookingtest.json'
train = pd.read_json(urlopen(urltrain))
test = pd.read_json(urlopen(urltest))
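A quick sanity check on the loaded data can be useful before modeling. A minimal sketch (the shapes printed here just confirm what was downloaded; the train output further below shows 39774 rows):
#Check the size of the training and test dataframes; test has no cuisine column yet.
print(train.shape)
print(test.shape)
train.head()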
#First we want to see the most popular cuisine for the naive model.
train.groupby('cuisine').size()
cuisine
brazilian 467
british 804
cajun_creole 1546
chinese 2673
filipino 755
french 2646
greek 1175
indian 3003
irish 667
italian 7838
jamaican 526
japanese 1423
korean 830
mexican 6438
moroccan 821
russian 489
southern_us 4320
spanish 989
thai 1539
vietnamese 825
dtype: int64
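Since italian is the largest class, a naive model that always predicts italian gives our baseline. A quick sketch of that baseline accuracy on the training data (7838 of 39774 recipes are italian, so roughly 20%):
#Share of training recipes labeled italian; this approximates the naive model's accuracy.
baseline = (train['cuisine'] == 'italian').mean()
print(baseline)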
#Here we write the most popular selection. This is the baseline by which we will judge other models.
test['cuisine']='italian'
#This is a much simpler version that selects out the columns id and cuisine.
submission=test[['id' , 'cuisine' ]]
#This is a more complex method I showed that gives the same result (note that .ix is deprecated; .loc works here).
#submission=pd.DataFrame(test.loc[:,['id' , 'cuisine' ]])
#This outputs the file.
submission.to_csv("1_cookingSubmission.csv",index=False)
from google.colab import files
files.download('1_cookingSubmission.csv')
#It seems there is some text cleaning to do, so we will need the NLTK lemmatizer.
stemmer = WordNetLemmatizer()
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/wordnet.zip.
True
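A quick check of what the lemmatizer does to a few plural ingredient words, using the stemmer object defined above (a minimal sketch for illustration):
#The WordNet lemmatizer maps plural nouns to their singular form.
print(stemmer.lemmatize('tomatoes'))   #tomato
print(stemmer.lemmatize('eggs'))       #egg
print(stemmer.lemmatize('shallots'))   #shallot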
train
| | id | cuisine | ingredients |
|---|---|---|---|
| 0 | 10259 | greek | [romaine lettuce, black olives, grape tomatoes... |
| 1 | 25693 | southern_us | [plain flour, ground pepper, salt, tomatoes, g... |
| 2 | 20130 | filipino | [eggs, pepper, salt, mayonaise, cooking oil, g... |
| 3 | 22213 | indian | [water, vegetable oil, wheat, salt] |
| 4 | 13162 | indian | [black pepper, shallots, cornflour, cayenne pe... |
| ... | ... | ... | ... |
| 39769 | 29109 | irish | [light brown sugar, granulated sugar, butter, ... |
| 39770 | 11462 | italian | [KRAFT Zesty Italian Dressing, purple onion, b... |
| 39771 | 2238 | irish | [eggs, citrus fruit, raisins, sourdough starte... |
| 39772 | 41882 | chinese | [boneless chicken skinless thigh, minced garli... |
| 39773 | 2362 | mexican | [green chile, jalapeno chilies, onions, ground... |

39774 rows × 3 columns
#We see this approach in one of the Python solutions.
train['ingredients_clean_string1'] = [','.join(z).strip() for z in train['ingredients']]
#We also know that we can do something similar through a lambda function.
strip = lambda x: ' , '.join(x).strip()
#Finally, we apply the lambda function with map.
train['ingredients_clean_string2'] = train['ingredients'].map(strip)
#Now that we have the lambda function, we can reuse it for the test dataset.
test['ingredients_clean_string1'] = test['ingredients'].map(strip)
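To see what the strip lambda actually produces, here is a quick illustration on a single ingredient list (compare with the ingredients_clean_string2 column shown at the end of the notebook):
#The lambda joins a list of ingredients into a single comma-separated string.
strip(['water', 'vegetable oil', 'wheat', 'salt'])
#'water , vegetable oil , wheat , salt'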
#We see this in one of the solutions. We can reconstruct it in a way that makes it a bit easier to follow, but I found that doing so took much longer to run.
#To interpret this, read from right to left.
train['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in train['ingredients']]
test['ingredients_string1'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in test['ingredients']]
train['ingredients_string1']
0 romaine lettuce black olives grape tomatoes ga...
1 plain flour ground pepper salt tomato ground b...
2 egg pepper salt mayonaise cooking oil green ch...
3 water vegetable oil wheat salt
4 black pepper shallot cornflour cayenne pepper ...
...
39769 light brown sugar granulated sugar butter warm...
39770 KRAFT Zesty Italian Dressing purple onion broc...
39771 egg citrus fruit raisin sourdough starter flou...
39772 boneless chicken skinless thigh minced garlic ...
39773 green chile jalapeno chilies onion ground blac...
Name: ingredients_string1, Length: 39774, dtype: object
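Note that lemmatize is applied to each whole ingredient string, so only single-word ingredients (eggs, shallots, tomatoes) get singularized; multi-word ingredients such as 'grape tomatoes' pass through unchanged. If you wanted to lemmatize every word, one possible variant is sketched below (a hypothetical alternative; it is not used in the rest of this notebook):
#Hypothetical alternative: lemmatize each word rather than each whole ingredient string.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(ingredient_list):
    words = re.sub('[^A-Za-z]', ' ', ' '.join(ingredient_list)).split()
    return ' '.join(lemmatizer.lemmatize(w) for w in words)

lemmatize_words(['romaine lettuce', 'black olives', 'grape tomatoes'])
#'romaine lettuce black olive grape tomato'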
ingredients = train['ingredients'].apply(lambda x:','.join(x))
ingredients
0 romaine lettuce,black olives,grape tomatoes,ga...
1 plain flour,ground pepper,salt,tomatoes,ground...
2 eggs,pepper,salt,mayonaise,cooking oil,green c...
3 water,vegetable oil,wheat,salt
4 black pepper,shallots,cornflour,cayenne pepper...
...
39769 light brown sugar,granulated sugar,butter,warm...
39770 KRAFT Zesty Italian Dressing,purple onion,broc...
39771 eggs,citrus fruit,raisins,sourdough starter,fl...
39772 boneless chicken skinless thigh,minced garlic,...
39773 green chile,jalapeno chilies,onions,ground bla...
Name: ingredients, Length: 39774, dtype: object
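Counter was imported at the top but not used yet; a small sketch of how it could summarize the most common raw ingredients across all training recipes:
#Count how often each raw ingredient appears in the training data.
ingredient_counts = Counter(ingredient for recipe in train['ingredients'] for ingredient in recipe)
ingredient_counts.most_common(10)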
#Now we will create a corpus.
corpustr = train['ingredients_string1']
corpusts = test['ingredients_string1']
corpustr
0 romaine lettuce black olives grape tomatoes ga...
1 plain flour ground pepper salt tomato ground b...
2 egg pepper salt mayonaise cooking oil green ch...
3 water vegetable oil wheat salt
4 black pepper shallot cornflour cayenne pepper ...
...
39769 light brown sugar granulated sugar butter warm...
39770 KRAFT Zesty Italian Dressing purple onion broc...
39771 egg citrus fruit raisin sourdough starter flou...
39772 boneless chicken skinless thigh minced garlic ...
39773 green chile jalapeno chilies onion ground blac...
Name: ingredients_string1, Length: 39774, dtype: object
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#You could develop an understanding of each parameter from the documentation above.
vectorizertr = TfidfVectorizer(stop_words='english',
ngram_range = ( 1 , 1 ),analyzer="word",
max_df = .57 , binary=False , token_pattern=r'\w+' , sublinear_tf=False)
#Note: vectorizerts is defined but not used below; the test corpus is transformed with vectorizertr, which was fit on the training data.
vectorizerts = TfidfVectorizer(stop_words='english')
#Note that this doesn't work with the .todense() option.
tfidftr=vectorizertr.fit_transform(corpustr)
predictors_tr = tfidftr
#Note that this doesn't work with the .todense() option. This creates a matrix of predictors from the corpus.
tfidfts=vectorizertr.transform(corpusts)
predictors_ts= tfidfts
#This is the target variable.
targets_tr = train['cuisine']
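It can help to confirm how large the resulting feature space is (the sparse matrix shown below has 2963 columns). A minimal sketch; note that in newer scikit-learn versions get_feature_names() has been replaced by get_feature_names_out():
#The TF-IDF matrix has one column per token in the learned vocabulary.
print(predictors_tr.shape)
print(len(vectorizertr.get_feature_names()))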
35.1. Logistic Regression and Regularization
Regularization can help us with the large feature matrix by adding a penalty for each parameter.
We find out how much regularization to apply via grid search (a search across hyperparameters).
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
From the documentation of the C parameter: "Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization."
#Logistic Regression.
parameters = {'C':[1, 10]}
#clf = LinearSVC()
clf = LogisticRegression()
predictors_tr
<39774x2963 sparse matrix of type '<class 'numpy.float64'>'
with 727921 stored elements in Compressed Sparse Row format>
from sklearn.model_selection import GridSearchCV
#This uses the associated parameters to search the grid space.
classifier = GridSearchCV(clf, parameters)
classifier=classifier.fit(predictors_tr,targets_tr)
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
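After fitting, the grid search object reports which value of C did best in cross-validation. A quick sketch of how to inspect it (the exact numbers will depend on the run):
#Best hyperparameter found by the grid search and its cross-validated accuracy.
print(classifier.best_params_)
print(classifier.best_score_)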
#This predicts the outcome for the test set.
predictions=classifier.predict(predictors_ts)
#This adds it to the resulting dataframe.
test['cuisine'] = predictions
#This creates the submission dataframe.
submission2=test[['id' , 'cuisine' ]]
#This outputs the file.
submission2.to_csv("2_logisticSubmission.csv",index=False)
from google.colab import files
files.download('2_logisticSubmission.csv')
from sklearn.ensemble import RandomForestClassifier
# Create the random forest object which will include all the parameters
# for the fit
forest = RandomForestClassifier(n_estimators = 10)
# Fit the training data to the cuisine labels and create the decision trees
forest = forest.fit(predictors_tr,targets_tr)
# Take the same decision trees and run them on the test data
predictions = forest.predict(predictors_ts)
#This adds it to the resulting dataframe.
test['cuisine'] = predictions
#This creates the submission dataframe.
submission3=test[['id' , 'cuisine' ]]
submission3.to_csv("3_random_submission.csv",index=False)
from google.colab import files
files.download('3_random_submission.csv')
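To compare the logistic regression and random forest models without submitting to Kaggle, you could cross-validate on the training data. A minimal sketch using cross_val_score (cv=3 is just an example; this is slow on the full matrix):
from sklearn.model_selection import cross_val_score
#Cross-validated accuracy on the training data for each model.
lr_scores = cross_val_score(LogisticRegression(), predictors_tr, targets_tr, cv=3)
rf_scores = cross_val_score(RandomForestClassifier(n_estimators=10), predictors_tr, targets_tr, cv=3)
print(lr_scores.mean(), rf_scores.mean())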
train
| | id | cuisine | ingredients | ingredients_clean_string1 | ingredients_clean_string2 | ingredients_string1 |
|---|---|---|---|---|---|---|
| 0 | 10259 | greek | [romaine lettuce, black olives, grape tomatoes... | romaine lettuce,black olives,grape tomatoes,ga... | romaine lettuce , black olives , grape tomatoe... | romaine lettuce black olives grape tomatoes ga... |
| 1 | 25693 | southern_us | [plain flour, ground pepper, salt, tomatoes, g... | plain flour,ground pepper,salt,tomatoes,ground... | plain flour , ground pepper , salt , tomatoes ... | plain flour ground pepper salt tomato ground b... |
| 2 | 20130 | filipino | [eggs, pepper, salt, mayonaise, cooking oil, g... | eggs,pepper,salt,mayonaise,cooking oil,green c... | eggs , pepper , salt , mayonaise , cooking oil... | egg pepper salt mayonaise cooking oil green ch... |
| 3 | 22213 | indian | [water, vegetable oil, wheat, salt] | water,vegetable oil,wheat,salt | water , vegetable oil , wheat , salt | water vegetable oil wheat salt |
| 4 | 13162 | indian | [black pepper, shallots, cornflour, cayenne pe... | black pepper,shallots,cornflour,cayenne pepper... | black pepper , shallots , cornflour , cayenne ... | black pepper shallot cornflour cayenne pepper ... |
| ... | ... | ... | ... | ... | ... | ... |
| 39769 | 29109 | irish | [light brown sugar, granulated sugar, butter, ... | light brown sugar,granulated sugar,butter,warm... | light brown sugar , granulated sugar , butter ... | light brown sugar granulated sugar butter warm... |
| 39770 | 11462 | italian | [KRAFT Zesty Italian Dressing, purple onion, b... | KRAFT Zesty Italian Dressing,purple onion,broc... | KRAFT Zesty Italian Dressing , purple onion , ... | KRAFT Zesty Italian Dressing purple onion broc... |
| 39771 | 2238 | irish | [eggs, citrus fruit, raisins, sourdough starte... | eggs,citrus fruit,raisins,sourdough starter,fl... | eggs , citrus fruit , raisins , sourdough star... | egg citrus fruit raisin sourdough starter flou... |
| 39772 | 41882 | chinese | [boneless chicken skinless thigh, minced garli... | boneless chicken skinless thigh,minced garlic,... | boneless chicken skinless thigh , minced garli... | boneless chicken skinless thigh minced garlic ... |
| 39773 | 2362 | mexican | [green chile, jalapeno chilies, onions, ground... | green chile,jalapeno chilies,onions,ground bla... | green chile , jalapeno chilies , onions , grou... | green chile jalapeno chilies onion ground blac... |

39774 rows × 6 columns