"""
By Kirby Urner. GPL license.
As a check:
>>> import getmovie
>>> getmovie.test()
Latest revision:
April 25, 2004 -- added plot line and actors
April 23, 2004 -- realized that some searches
take you straight to the movie page w/o passing go.
Added a few more attributes (Country, Language, Runtime).
Added command line ability
Initial version: April 23, 2004 (written at Powell's Books on Hawthorne)
Looking up a movie is a two-part process. The search string (title) gets
you to a screen that we hope has a section entitled 'Exact Matches' --
that's what I'm looking for. The URLs in that section feed into Movie
objects, each of which is then asked in turn to fetch its own page by URL.
At this point, another parser is invoked to scrape the individual movie
page and populate the object's attributes.
"""
import urlparse, urllib2, HTMLParser
import sys
def search(srchtitle):
    """
    Top level searcher -- look up a title and print what was found.

    The search URL leads either to an interim page of zero or more exact
    matches, or straight to the movie page itself (in which case just one
    Movie object is produced by the parser).  Every Movie collected is then
    asked to fetch and scrape its own page, and a fixed set of its
    attributes is printed to stdout.

    srchtitle -- the title to look for; runs of whitespace are collapsed
                 to single spaces before the search URL is built.

    Returns nothing.  There is no dedicated handling yet for a
    'no matches found' page.
    """
    srchtitle = " ".join(srchtitle.split())
    srchurl = geturl(srchtitle)
    parser = Parser(srchtitle, srchurl)
    for line in getdata(srchurl):
        try:
            parser.feed(line)
        except HTMLParser.HTMLParseError:
            # Best effort: a line the parser chokes on is skipped rather
            # than aborting the whole lookup.
            pass
    for movie in parser.movies:
        print('--------------')
        movie.getmovie()
        for attr in ('Title', 'Director', 'Tagline', 'Plot_Outline',
                     'Country', 'Language', 'Runtime', 'Cast'):
            # Plain attribute lookup by name; no need to eval() source text.
            print("%s: %s" % (attr, getattr(movie, attr)))
def getdata(srchurl):
    """
    Grab all the HTML from the movie database -- no parsing yet.

    srchurl -- the URL to fetch.

    Returns the response body as a list of lines (the result of
    readlines()).  Any error raised by urllib2.urlopen or while reading
    propagates to the caller.
    """
    f = urllib2.urlopen(srchurl)
    try:
        return f.readlines()
    finally:
        # Close the connection even if reading fails part-way through.
        f.close()
def geturl(srchtitle):
    """
    Assemble the '/find' search URL on www.imdb.com for a title string.
    """
    query = 'tt=on;nm=on;mx=20;q=%s' % titlepart(srchtitle)
    # urlunparse wants (scheme, netloc, path, params, query, fragment)
    pieces = ('http', 'www.imdb.com', '/find', '', query, '')
    return urlparse.urlunparse(pieces)
def titlepart(srchtitle):
    """
    Turn the title into its URL form: the whitespace-separated words
    of the title glued together with '%20'.
    """
    words = srchtitle.split()
    single_spaced = " ".join(words)
    return single_spaced.replace(" ", "%20")
class Parser(HTMLParser.HTMLParser):
    """
    This parser is for dealing with the first screen returned --
    maybe an individual movie, but also maybe a page which includes
    an 'Exact Matches' section.

    The logic is a bit tricky.  Links are collected only while we're in
    the Exact Matches section -- that gets switched on when bold text
    containing 'Exact Matches' is seen, and switched off when we hit a
    concluding </table> tag.  Within such a section, each chunk of text
    found inside an <a href=...> link is used to instantiate a new Movie
    object pointing at that link.

    However, if this is already the movie in question (we went
    straight there) then we'll try to discover this by looking at the
    title tag, and record a Movie pointing at the search URL itself.

    Results accumulate in self.movies.
    """

    def __init__(self, srchtitle, srchurl):
        """
        srchtitle -- the title that was searched for
        srchurl   -- the URL the page being parsed was fetched from
        """
        HTMLParser.HTMLParser.__init__(self)
        self.srchtitle = srchtitle
        self.srchurl = srchurl
        self.getdata = False    # inside a <b> element
        self.geturl = False     # a link has been opened inside the section
        self.geturls = False    # inside the 'Exact Matches' section
        self.gettitle = False   # inside the <title> element
        self.attrs = {}         # attributes of the link currently open
        self.movies = []        # Movie objects found so far

    def handle_starttag(self, tag, attrs):
        """Track entry into <b>, <a> (within the section) and <title>."""
        if tag == 'b':
            self.getdata = True
        if tag == 'a' and self.geturls:
            self.geturl = True
            self.attrs = dict(attrs)
        if tag == 'title':
            self.gettitle = True

    def handle_endtag(self, tag):
        """Track exit from <b>, the section's <table>, <a> and <title>."""
        if tag == 'b':
            self.getdata = False
        if tag == 'table' and self.geturls:
            # End of the Exact Matches section: stop collecting links.
            self.geturls = False
            self.geturl = False
        if tag == 'a' and self.geturls:
            # Forget the link so text following </a> is not attributed to it.
            self.attrs = {}
        if tag == 'title':
            self.gettitle = False

    def handle_data(self, data):
        """Create Movie objects from link text or from the page title."""
        if self.getdata:
            if 'Exact Matches' in data:
                self.geturls = True
        if self.geturl:
            # Anchors without a usable href (e.g. <a name=...>) carry no
            # movie URL; skip them instead of raising KeyError.
            href = self.attrs.get('href')
            if href:
                theurl = 'http://www.imdb.com%s' % href
                self.movies.append(Movie(data, theurl))
        if self.gettitle:
            # NOTE(review): this only tests whether the first character of
            # the page title occurs anywhere in the search title --
            # presumably a rough "is this the movie page?" heuristic;
            # confirm the intent before tightening it.
            if data[0] in self.srchtitle:
                self.movies.append(Movie(data, self.srchurl))
class MovieParser(HTMLParser.HTMLParser):
    """
    This parser is looking at an individual movie page.

    Bold labels on the page ('Directed by', 'Tagline', ...) select which
    attribute of the Movie object the text that follows is stored under.
    Text following a 'Cast overview' label is collected, until the next
    </table>, into themovie.Cast as a list of [name, role] pairs.
    """

    # (label text, flag switched on, Movie attribute that gets filled),
    # checked in this order against every chunk of bold text.
    _LABELS = (
        ('Directed by', 'geturl', 'Director'),
        ('Tagline', 'getmore', 'Tagline'),
        ('Plot Outline', 'getmore', 'Plot_Outline'),
        ('Cast overview', 'getcast', 'Cast'),
        ('Country', 'geturl', 'Country'),
        ('Language', 'geturl', 'Language'),
        ('Runtime', 'getmore', 'Runtime'),
    )

    def __init__(self, themovie):
        """
        themovie -- the Movie object whose attributes are to be populated
        """
        HTMLParser.HTMLParser.__init__(self)
        self.themovie = themovie
        self.getdata = False    # inside a <b> element
        self.geturl = False     # collect text until the next </a>
        self.getmore = False    # collect text until the next <br>
        self.getcast = False    # collect cast entries until </table>
        self.char = False       # a '...' separator was just seen
        self.attr = None        # name of the attribute being filled

    def handle_starttag(self, tag, attrs):
        """<b> opens a label; <br> ends a 'getmore' collection."""
        if tag == 'b':
            self.getdata = True
        if tag == 'br':
            self.getmore = False

    def handle_endtag(self, tag):
        """</b> closes a label; </a> and </table> end collections."""
        if tag == 'b':
            self.getdata = False
        if tag == 'a':
            self.geturl = False
        if tag == 'table':
            self.getcast = False

    def handle_data(self, data):
        """Route a chunk of text according to the current parser state."""
        if self.getdata:
            # Bold text: see whether it is one of the labels we care about.
            for label, flag, attr in self._LABELS:
                if label in data:
                    setattr(self, flag, True)
                    self.attr = attr
                    if attr == 'Cast':
                        self.themovie.Cast = []
        elif self.getcast:
            cast = self.themovie.Cast
            if '...' in data:
                # Separator: the next text chunk completes the latest entry.
                self.char = True
            elif self.char and isinstance(data, str):
                self.char = False
                text = data.strip()
                if text:
                    # Only fill in an entry that exists; a separator seen
                    # before any name used to raise IndexError here.
                    if cast:
                        cast[-1][1] = text
                elif cast:
                    # Blank text after the separator: drop the latest entry.
                    cast.pop()
            elif isinstance(data, str):
                text = data.strip()
                if text:
                    cast.append([text, ''])
        elif (self.getmore or self.geturl) and self.attr != 'Cast':
            # Cast holds a list, so appending text to it would raise
            # TypeError; every other attribute accumulates as a string.
            # __dict__ is used directly so that a missing attribute
            # starts out as '' when first written.
            fields = self.themovie.__dict__
            fields[self.attr] = fields.get(self.attr, '') + data.strip()
class Movie:
    """
    One movie: a title, the URL of its page, and whatever further
    attributes MovieParser manages to scrape from that page.

    Attributes that were never set read as None instead of raising
    AttributeError.
    """

    def __init__(self, title, theurl):
        """
        title  -- text to store as the Title attribute
        theurl -- URL of the page that getmovie() will fetch
        """
        self.Title = title
        self.theurl = theurl

    def getmovie(self):
        """
        Fetch this movie's page and let a MovieParser fill in the
        attributes of this object.  Lines the parser rejects are skipped.
        """
        f = urllib2.urlopen(self.theurl)
        try:
            thepage = f.readlines()
        finally:
            # Close the connection even if reading fails part-way through.
            f.close()
        parser = MovieParser(self)
        for line in thepage:
            try:
                parser.feed(line)
            except HTMLParser.HTMLParseError:
                pass

    def __getattr__(self, attr):
        """
        Called only when normal lookup fails: unknown attributes read
        as None.
        """
        # Special-method names must still look missing; answering None for
        # them makes the interpreter try to call None (e.g. for repr() or
        # copying an old-style instance).
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)
        return None
def test():
    """
    Quick check: run search() on a handful of sample titles, in order.
    """
    sample_titles = (
        "West Side Story",
        "Soylent Green",
        "Bullet",
        "The School of Rock",
        "The Cable Guy",
    )
    for title in sample_titles:
        search(title)
if __name__ == "__main__":
    # Command line use: the title may be given quoted as one argument or
    # unquoted as several words; search() collapses whitespace either way.
    if len(sys.argv) < 2:
        # Explain what is expected instead of dying with an IndexError.
        sys.exit("usage: %s <movie title>" % sys.argv[0])
    search(" ".join(sys.argv[1:]))
# code highlighted using py2html.py version 0.8