Lecture Notes, Fri 08/05

Assignment Calendar for Week 1

A second look at the Data Mining code from Julian McAuley’s Facet’s presentation.

Complete Code from Julian’s lecture

import numpy
import urllib
import scipy.optimize
import random

def parseData(fname):
    """Lazily yield one parsed record per line of the resource at *fname*.

    This is a generator: nothing is fetched or parsed until the caller
    iterates over the result (for example by wrapping it in ``list(...)``).

    fname -- location passed straight to ``urllib.urlopen``.
    """
    for l in urllib.urlopen(fname):
        # SECURITY: eval() executes whatever Python expression the line
        # contains, so only point this at a source you trust. If every line
        # is a plain literal, ast.literal_eval would parse it without
        # executing code.
        yield eval(l)

# Materialize every record from the generator into a list so the data set
# can be indexed and iterated over repeatedly below.
data = list(parseData("http://jmcauley.ucsd.edu/cse255/data/beer/beer_50000.json"))
# Parenthesized form prints the same text on Python 2 and is valid Python 3.
print("done")

def feature(datum):
    """Return the feature vector for one review: only the constant term.

    datum -- a review record; ignored by this baseline version.

    A fresh one-element list ``[1]`` is returned on every call.
    """
    return [1]

# Feature matrix: one row per review, produced by the current `feature`.
X = [feature(d) for d in data]
# Targets: the overall rating of each review.
y = [d['review/overall'] for d in data]
# Least-squares fit of y against X; theta receives the fitted coefficients.
theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

### Convince ourselves that basic linear algebra operations yield the same answer ###

# Rebind X and y as numpy matrices so that `*` means matrix multiplication.
X = numpy.matrix(X)
y = numpy.matrix(y)
# Normal equations: (X^T X)^-1 X^T y^T.
# NOTE: the value is not stored or printed, so it is only visible when this
# line is evaluated in an interactive session.
numpy.linalg.inv(X.T * X) * X.T * y.T

### Do older people rate beer more highly? ###

# Keep only the reviews that carry the reviewer's age.
# `in` replaces dict.has_key(), which is deprecated and absent in Python 3.
data2 = [d for d in data if 'user/ageInSeconds' in d]

def feature(datum):
    """Return ``[1, age]`` for one review.

    datum -- a review record that contains the key 'user/ageInSeconds'.

    The leading 1 is the constant (offset) term; the second entry is the
    value stored under 'user/ageInSeconds'.
    Raises KeyError if the record has no such key.
    """
    return [1, datum['user/ageInSeconds']]

# Feature matrix and targets, restricted to the filtered records in data2.
X = [feature(d) for d in data2]
y = [d['review/overall'] for d in data2]
# Least-squares fit of y against X; theta receives the fitted coefficients.
theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

### How much do women prefer beer over men? ###

# Keep only the reviews that carry the reviewer's gender.
# `in` replaces dict.has_key(), which is deprecated and absent in Python 3.
data2 = [d for d in data if 'user/gender' in d]

def feature(datum):
    """Return ``[1, indicator]`` for one review.

    datum -- a review record that contains the key 'user/gender'.

    The leading 1 is the constant (offset) term. The indicator is 0 when
    the gender is exactly "Male" and 1 for any other value.
    Raises KeyError if the record has no 'user/gender' key.
    """
    is_male = datum['user/gender'] == "Male"
    return [1, 0 if is_male else 1]

# Feature matrix and targets, restricted to the filtered records in data2.
X = [feature(d) for d in data2]
y = [d['review/overall'] for d in data2]
# Least-squares fit of y against X; theta receives the fitted coefficients.
theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

# Objective
def f(theta, X, y, lam):
    """Regularized mean-squared-error objective.

    Evaluates ``||X*theta - y||^2 / N + lam * ||theta||^2`` where N is the
    number of rows of X.

    theta -- sequence of coefficients (one per column of X).
    X     -- feature matrix as a list of rows (or anything numpy.matrix takes).
    y     -- sequence of targets, one per row of X.
    lam   -- regularization strength.

    Prints the current value on every call and returns it as a one-element
    list.
    """
    theta = numpy.matrix(theta).T   # column vector
    X = numpy.matrix(X)
    y = numpy.matrix(y).T           # column vector
    diff = X*theta - y
    diffSq = diff.T*diff            # 1x1 matrix: sum of squared errors
    diffSqReg = diffSq / len(X) + lam*(theta.T*theta)
    value = diffSqReg.flatten().tolist()
    # %-formatting prints the same text as the Python 2 statement
    # `print "offset =", value` and is also valid on Python 3.
    print("offset = %s" % (value,))
    return value[0]

# Derivative
def fprime(theta, X, y, lam):
    """Gradient of the objective ``f`` with respect to theta.

    Evaluates ``2 * X^T (X*theta - y) / N + 2 * lam * theta`` where N is the
    number of rows of X.

    theta -- sequence of coefficients (one per column of X).
    X     -- feature matrix as a list of rows (or anything numpy.matrix takes).
    y     -- sequence of targets, one per row of X.
    lam   -- regularization strength.

    Returns a 1-D numpy array with one entry per coefficient.
    """
    theta = numpy.matrix(theta).T   # column vector
    X = numpy.matrix(X)
    y = numpy.matrix(y).T           # column vector
    diff = X*theta - y
    res = 2*X.T*diff / len(X) + 2*lam*theta
    # Flatten the column vector to a plain 1-D array.
    return numpy.array(res.flatten().tolist()[0])

# Minimize f with L-BFGS-B, using fprime as the gradient. The start point
# [0,0] has one entry per feature column of the current X (two columns), and
# 0.1 is passed through as `lam`.
# NOTE: the return value is not stored, so it is only visible when this line
# is evaluated in an interactive session.
scipy.optimize.fmin_l_bfgs_b(f, [0,0], fprime, args = (X, y, 0.1))

### Random features ###

def feature(datum):
    """Return 30 random features drawn from ``random.random()``.

    datum -- a review record; ignored, the features carry no information
             about it.
    """
    return [random.random() for _ in range(30)]

# Feature matrix (from the current `feature`) and targets for data2.
X = [feature(d) for d in data2]
y = [d['review/overall'] for d in data2]
# Least-squares fit of y against X; theta receives the fitted coefficients.
theta,residuals,rank,s = numpy.linalg.lstsq(X, y)


Generators

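This piece of Julian McAuley’s code is called a generator. Because the function body uses `yield`, calling it does not run the loop right away; it returns an iterator that produces one parsed record each time the caller asks for the next item. The Guttag textbook discusses generators in (insert section here).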

def parseData(fname):
    """Lazily yield one parsed record per line of the resource at *fname*.

    fname -- location passed straight to ``urllib.urlopen``.
    """
    for l in urllib.urlopen(fname):
        # `yield` hands one record back to the caller and pauses here; the
        # loop resumes only when the caller requests the next record.
        # SECURITY: eval() executes whatever Python expression the line
        # contains, so only point this at a source you trust.
        yield eval(l)


#Incomplete code from Miles’s Lecture

Reading data...
done
>>> len(data)
50000
>>> data[0]
{'beer/style': 'Hefeweizen', 'beer/ABV': 5.0, 'beer/beerId': '47986', 'review/timeStruct': {'wday': 0, 'isdst': 0, 'mday': 16, 'hour': 20, 'min': 57, 'sec': 3, 'year': 2009, 'yday': 47, 'mon': 2}, 'review/aroma': 2.0, 'review/appearance': 2.5, 'review/timeUnix': 1234817823, 'review/palate': 1.5, 'review/taste': 1.5, 'beer/name': 'Sausa Weizen', 'beer/brewerId': '10325', 'review/overall': 1.5, 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.', 'user/profileName': 'stcules'}
>>> X=[[1] for i in data]
>>> len(X)
50000
>>> X[0]
[1]
>>> X[4000]
[1]
>>> y = [d['review/overall'] for d in data]
>>> len(y)
50000
>>> y[0]
1.5
>>> sum(y)/len(y)
3.88871
>>> def predictor(user):
return 3.8871

>>> numpy.linalg.lstsq(X, y)
(array([ 3.88871]), array([ 24621.476795]), 1, array([ 223.60679775]))
>>> X = [38, 57, 56, 65, 45, 66, 58, 62, 64, 92, 73, 87, 59, 38, 39, 41]
>>> y = [11.5, 13.0, 13.0, 13.0, 14.0, 13.0, 13.5, 13.0, 14.0, 14.5, 15.0, 14.0, 14.0, 13.5, 13.0, 12.0]
>>> import matplotlib.pyplot as plt
>>> Xage = [[1,age] for age in X]
>>> Xage
[[1, 38], [1, 57], [1, 56], [1, 65], [1, 45], [1, 66], [1, 58], [1, 62], [1, 64], [1, 92], [1, 73], [1, 87], [1, 59], [1, 38], [1, 39], [1, 41]]
>>> numpy.linalg.lstsq(X, y)

Traceback (most recent call last):
File "<pyshell#96>", line 1, in <module>
numpy.linalg.lstsq(X, y)
File "C:\Python27\lib\site-packages\numpy\linalg\linalg.py", line 1874, in lstsq
_assertRank2(a, b)
File "C:\Python27\lib\site-packages\numpy\linalg\linalg.py", line 196, in _assertRank2
'two-dimensional' % len(a.shape))
LinAlgError: 1-dimensional array given. Array must be two-dimensional
>>> numpy.linalg.lstsq(Xage, y)
(array([ 11.37165876,   0.03409943]), array([ 7.09542843]), 2, array([ 243.39868301,    1.03976349]))
>>> plt.scatter(X,y)
<matplotlib.collections.PathCollection object at 0x000000002571EA58>
>>> plt.plot([0.03409943*i+11.37165876 for i in range(100)])
[<matplotlib.lines.Line2D object at 0x00000000253C5AC8>]
>>> plt.show()
>>> Xage2 = [[age] for age in X]
>>> Xage2
[[38], [57], [56], [65], [45], [66], [58], [62], [64], [92], [73], [87], [59], [38], [39], [41]]
>>> numpy.linalg.lstsq(Xage2, y)
(array([ 0.21457756]), array([ 146.93373067]), 1, array([ 243.36803406]))
>>> plt.scatter(X,y)
<matplotlib.collections.PathCollection object at 0x0000000025A3E080>
>>> plt.plot([0.03409943*i+11.37165876 for i in range(100)])
[<matplotlib.lines.Line2D object at 0x00000000257DB748>]
>>> plt.plot([0.21457756*i for i in range(100)])
[<matplotlib.lines.Line2D object at 0x0000000025A3EEF0>]
>>> plt.show()
>>> Xagequad = [[1,age,age**2] for age in X]