#!/usr/bin/python
"""
By Kirby Urner.  GPL license.

As a check:

>>> import getmovie
>>> getmovie.test()

Latest revision:
April 25, 2004 -- added plot line and actors
April 23, 2004 -- realized that some searches take you straight to the
movie page w/o passing go.  Added a few more attributes (Country,
Language, Runtime).  Added command line ability

Initial version:
April 23, 2004 (written at Powell's Books on Hawthorne)

Looking up a movie is a two-part process.  The search string (title)
gets you to a screen that we hope has a section entitled
'Exact Matches' -- that's what I'm looking for.  The URLs in that
section feed into Movie objects, which are then invoked in turn to
search for themselves by URL.  At this point, another parser is
invoked to scrape the individual movie page, and populate the
object's attributes
"""

import sys

try:
    # Python 2 module names (what this script was written against).
    # HTMLParser is imported first so that Python 3 fails fast here.
    import HTMLParser
    import urllib2
    import urlparse
    from urllib import quote as _quote
except ImportError:
    # Python 3: the same APIs live under new names; binding them to the
    # old names keeps the rest of the module unchanged.
    import html.parser as HTMLParser
    import urllib.request as urllib2
    import urllib.parse as urlparse
    from urllib.parse import quote as _quote


# Newer standard libraries no longer define HTMLParseError; fall back to
# a private exception type that is never raised so the ``except`` clauses
# below stay valid everywhere.
_HTMLParseError = getattr(HTMLParser, "HTMLParseError", None)
if _HTMLParseError is None:
    class _HTMLParseError(Exception):
        """Stand-in for HTMLParser.HTMLParseError where it is missing."""


# Attributes printed by search() for every movie, in this order.
_REPORT_FIELDS = ('Title', 'Director', 'Tagline', 'Plot_Outline',
                  'Country', 'Language', 'Runtime', 'Cast')


def _emit(text):
    """Write one line to stdout (works without the print statement)."""
    sys.stdout.write("%s\n" % text)


def _as_text(raw):
    """Return ``raw`` as a native string, decoding bytes if necessary."""
    if isinstance(raw, str):
        return raw
    # Only reached when urlopen hands back bytes that are not ``str``
    # (Python 3).  The page's declared charset is not consulted, so
    # decode leniently rather than fail on a stray byte.
    return raw.decode("utf-8", "replace")


def _feed_lines(parser, lines):
    """Feed HTML to ``parser`` line by line, skipping lines it rejects."""
    for line in lines:
        try:
            parser.feed(line)
        except _HTMLParseError:
            pass  # deliberate best effort: malformed markup is ignored


def search(srchtitle):
    """
    Top level searcher -- takes you to an interim page of zero or
    more exact matches (need to add logic for 'no matches found'),
    OR to the movie page itself (in which case just one movie object
    is returned by the parser).

    srchtitle is the movie title to look up; runs of whitespace are
    collapsed first.  For every Movie the first-stage parser found, the
    movie's own page is fetched and a fixed set of attributes is
    printed, one per line.  Attributes the scrape did not find print
    as None.  Nothing is returned.
    """
    srchtitle = " ".join(srchtitle.split())  # remove any extraneous spaces
    srchurl = geturl(srchtitle)
    parser = Parser(srchtitle, srchurl)
    _feed_lines(parser, getdata(srchurl))

    # parser.movies is a list of Movie objects (empty if none were found)
    for movie in parser.movies:
        _emit('--------------')
        movie.getmovie()
        for field in _REPORT_FIELDS:
            # getattr instead of eval: same lookup, no code evaluation
            _emit("%s: %s" % (field, getattr(movie, field)))


def getdata(srchurl):
    """
    Grab all the HTML from the movie database -- no parsing yet.

    Returns the page at srchurl as a list of text lines.  Errors raised
    by urlopen propagate to the caller; the response is closed either
    way.
    """
    f = urllib2.urlopen(srchurl)
    try:
        lines = f.readlines()
    finally:
        f.close()
    return [_as_text(line) for line in lines]


def geturl(srchtitle):
    """
    Builds the right search URL for a given title string
    """
    ttl = titlepart(srchtitle)
    return urlparse.urlunparse(('http', 'www.imdb.com', '/find', '',
                                'tt=on;nm=on;mx=20;q=%s' % ttl, ''))


def titlepart(srchtitle):
    """
    Returns the title in a form that is safe inside the query string.

    Whitespace runs collapse to a single '%20' (as before); other
    reserved characters such as '&' or ';' are now percent-encoded too,
    so they can no longer be mistaken for query separators.
    """
    return _quote(" ".join(srchtitle.split()))


class Parser(HTMLParser.HTMLParser):
    """
    This parser is for dealing with the first screen returned -- maybe
    an individual movie, but also maybe a page which includes an
    'Exact Matches' section

    The logic is a bit tricky.  Links are collected only while we're in
    the Exact Matches section -- and that gets turned off when we hit a
    concluding </table> tag.  Within such a section, each piece of link
    text inside an anchor is used to instantiate a new Movie object.

    However, if this is already the movie in question (we went straight
    there) then we'll discover this by looking at the title tag.

    Results accumulate in self.movies.
    """

    def __init__(self, srchtitle, srchurl):
        HTMLParser.HTMLParser.__init__(self)
        self.srchtitle = srchtitle
        self.srchurl = srchurl
        self.getdata = False    # inside <b>...</b>
        self.geturl = False     # an anchor was opened in the section
        self.geturls = False    # inside the 'Exact Matches' section
        self.gettitle = False   # inside <title>...</title>
        self.attrs = {}         # attributes of the current anchor
        self.movies = []

    def handle_starttag(self, tag, attrs):
        if tag == 'b':
            self.getdata = True
        if tag == 'a' and self.geturls:
            self.geturl = True
            self.attrs = dict(attrs)
        if tag == 'title':
            self.gettitle = True

    def handle_endtag(self, tag):
        if tag == 'b':
            self.getdata = False
        if tag == 'table' and self.geturls:
            # the closing table tag ends the Exact Matches section
            self.geturls = False
            self.geturl = False
        if tag == 'a' and self.geturls:
            # text between anchors must not reuse the previous href
            self.attrs = {}
        if tag == 'title':
            self.gettitle = False

    def handle_data(self, data):
        if self.getdata and 'Exact Matches' in data:
            self.geturls = True
        if self.geturl:
            # here's where we actually append a new Movie object
            # data contains the title, attrs['href'] the url
            href = self.attrs.get('href')
            if href is not None:  # anchors without an href are skipped
                theurl = 'http://www.imdb.com%s' % href
                self.movies.append(Movie(data, theurl))
        if self.gettitle:
            # NOTE(review): this only checks that the FIRST character of
            # the page title occurs somewhere in the search title.  It
            # looks like a heuristic for "this is a movie page"; the
            # behavior is kept as-is -- confirm the intended comparison.
            if data and data[0] in self.srchtitle:
                self.movies.append(Movie(data, self.srchurl))


class MovieParser(HTMLParser.HTMLParser):
    """
    This parser is looking at an individual movie page.

    Bold labels ('Directed by', 'Tagline', ...) select which attribute
    of the Movie passed to the constructor receives the text that
    follows; the cast table is collected into Movie.Cast as a list of
    [actor, character] pairs.
    """

    def __init__(self, themovie):
        HTMLParser.HTMLParser.__init__(self)
        self.themovie = themovie
        self.attr = None        # name of the attribute being filled
        self.getdata = False    # inside <b>...</b>
        self.geturl = False     # collecting text up to the next </a>
        self.getmore = False    # collecting text up to the next <br>
        self.getcast = False    # inside the cast table
        self.char = False       # the next text is a character name

    def handle_starttag(self, tag, attrs):
        if tag == 'b':
            self.getdata = True
        if tag == 'br':
            self.getmore = False

    def handle_endtag(self, tag):
        if tag == 'b':
            self.getdata = False
        if tag == 'a':
            self.geturl = False
        if tag == 'table':
            self.getcast = False

    def handle_data(self, data):
        if self.getdata:
            # Bold text: look for the labels that introduce each field.
            if 'Directed by' in data:
                self.geturl = True
                self.attr = 'Director'
            if 'Tagline' in data:
                self.getmore = True
                self.attr = 'Tagline'
            if 'Plot Outline' in data:
                self.getmore = True
                self.attr = 'Plot_Outline'
            if 'Cast overview' in data:
                self.getcast = True
                self.attr = 'Cast'
                self.themovie.Cast = []  # list of [actor, character] pairs
            if 'Country' in data:
                self.geturl = True
                self.attr = 'Country'
            if 'Language' in data:
                self.geturl = True
                self.attr = 'Language'
            if 'Runtime' in data:
                self.getmore = True
                self.attr = 'Runtime'
        elif self.getcast:
            cast = self.themovie.Cast
            if '...' in data:
                # the dots separate an actor from the character played
                self.char = True
            elif self.char and isinstance(data, str):
                self.char = False
                role = data.strip()
                if role:
                    if cast:  # guard: dots seen before any actor name
                        cast[-1][1] = role
                elif cast:
                    # blank where the character was expected: drop the
                    # pending actor entry
                    cast.pop()
            elif isinstance(data, str):
                actor = data.strip()
                if actor:
                    cast.append([actor, ''])
        elif self.getmore or self.geturl:
            # Append to whatever has been collected for this attribute.
            # The instance dict is used directly because
            # Movie.__getattr__ turns a missing attribute into None.
            fields = self.themovie.__dict__
            fields[self.attr] = fields.get(self.attr, '') + data.strip()


class Movie:
    """
    One movie: a title, the URL of its page, and whatever attributes
    MovieParser manages to scrape from that page.
    """

    def __init__(self, title, theurl):
        self.Title = title
        self.theurl = theurl

    def getmovie(self):
        """Fetch this movie's page and populate attributes from it."""
        # notice that the movie object gets passed to the parser
        # for populating with additional attributes
        _feed_lines(MovieParser(self), getdata(self.theurl))

    def __getattr__(self, attr):
        """
        Missing data attributes read as None (so search() can print
        every field whether or not it was scraped).  Special __dunder__
        names still raise AttributeError, so protocol lookups such as
        copying or repr() are not handed a None in place of a method.
        """
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)
        return None


def test():
    """Smoke check: look up a handful of titles (needs network access)."""
    search("West Side Story")
    search("Soylent Green")
    search("Bullet")
    search("The School of Rock")
    search("The Cable Guy")


# Despite its name this is a manual, network-dependent check; keep
# test collectors that honor __test__ from running it.
test.__test__ = False


def _main(argv):
    """Command line entry point; returns the process exit status."""
    if len(argv) < 2:
        sys.stderr.write("usage: %s <movie title>\n" % argv[0])
        return 2
    # Joining lets a multi-word title be given without shell quoting;
    # with a single argument this is the same as argv[1].
    search(" ".join(argv[1:]))
    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv))