#!/usr/bin/python
"""
By Kirby Urner.  GPL license.

As a check:
    >>> import getmovie
    >>> getmovie.test()

Latest revision:  
    April 25, 2004 -- added plot line and actors
    
    April 23, 2004 -- realized that some searches
    take you straight to the movie page w/o passing go.  
    Added a few more attributes (Country, Language, Runtime).  
    Added command line ability
    
Initial version:  April 23, 2004 (written at Powell's Books on Hawthorne)

Looking up a movie is a two-part process.  The search string (title) gets
you to a screen that we hope has a section entitled 'Exact Matches' --
that's what I'm looking for.  The URLs in that section feed into Movie
objects, which are then invoked in turn to search for themselves by URL.
At this point, another parser is invoked to scrape the individual movie
page and populate the object's attributes.
"""

import urlparse, urllib2, HTMLParser
import sys

def search(srchtitle):
    """
    Top level searcher -- fetches the search page for srchtitle, which is
    either an interim page with zero or more exact matches (need to add
    logic for 'no matches found'), OR the movie page itself (in which case
    just one movie object is produced by the parser).

    Each Movie found is asked to fetch its own page, and a fixed set of
    its attributes is printed to stdout.  Nothing is returned.
    """
    srchtitle = " ".join(srchtitle.split()) # remove any extraneous spaces
    srchurl = geturl(srchtitle)
    parser = Parser(srchtitle, srchurl)
    for line in getdata(srchurl): # feed HTML line-by-line
        try:
            parser.feed(line)
        except HTMLParser.HTMLParseError:
            # deliberate best effort: skip markup the parser chokes on
            pass

    # parser.movies is a list of Movie objects (empty if none were found)
    for movie in parser.movies:
        print('--------------')
        movie.getmovie()
        for attr in ('Title', 'Director', 'Tagline', 'Plot_Outline',
                     'Country', 'Language', 'Runtime', 'Cast'):
            # getattr replaces the original eval() of a built-up expression
            print("%s: %s" % (attr, getattr(movie, attr)))

        
def getdata(srchurl):
    """
    Grab all the HTML from the movie database -- no parsing yet

    Opens srchurl with urllib2 and returns the response as a list of
    lines.  The response is closed even if reading it fails.
    """
    f = urllib2.urlopen(srchurl)
    try:
        return f.readlines()
    finally:
        f.close()

def geturl(srchtitle):
    """
    Return the http://www.imdb.com/find search URL for a title string

    The title is passed through titlepart() and placed in the q= field
    of the query string.
    """
    query = 'tt=on;nm=on;mx=20;q=%s' % titlepart(srchtitle)
    # (scheme, netloc, path, params, query, fragment)
    components = ('http', 'www.imdb.com', '/find', '', query, '')
    return urlparse.urlunparse(components)

def titlepart(srchtitle):
    """
    Return the title with each run of whitespace replaced by '%20'
    (leading and trailing whitespace is dropped)
    """
    words = srchtitle.split()
    return "%20".join(words)

class Parser(HTMLParser.HTMLParser):
    """
    This parser is for dealing with the first screen returned --
    maybe an individual movie, but also maybe a page which includes
    an 'Exact Matches' section

    The logic is a bit tricky.  Anchors are scanned only if we're in
    the Exact Matches section -- and that gets turned off when we
    hit a concluding </table> tag.  Within such a section, we have
    one or more urls to worry about -- each chunk of anchor text that
    comes with an href is used to instantiate a new Movie object

    However, if this is already the movie in question (we went
    straight there) then we'll discover this by looking at the
    title tag

    Results accumulate in self.movies.
    """

    def __init__(self, srchtitle, srchurl):
        HTMLParser.HTMLParser.__init__(self)
        self.srchtitle = srchtitle
        self.srchurl   = srchurl
        self.getdata = False    # True while inside a <b> element
        self.geturl  = False    # set once an <a> opens inside the section
        self.geturls = False    # True while inside the Exact Matches section
        self.gettitle = False   # True while inside the <title> element
        self.attrs = {}         # attributes of the <a> currently open
        self.movies = []

    def handle_starttag(self, tag, attrs):
        if tag == 'b':
            self.getdata = True
        elif tag == 'a' and self.geturls:
            self.geturl = True
            self.attrs = dict(attrs)
        elif tag == 'title':
            self.gettitle = True

    def handle_endtag(self, tag):
        if tag == 'b':
            self.getdata = False
        elif tag == 'table' and self.geturls:
            # the closing </table> ends the Exact Matches section
            self.geturls = False
            self.geturl  = False
        elif tag == 'a' and self.geturls:
            # geturl stays set; emptying attrs is what stops further appends
            self.attrs = {}
        elif tag == 'title':
            self.gettitle = False

    def handle_data(self, data):
        if self.getdata and 'Exact Matches' in data:
            self.geturls = True

        if self.geturl and self.attrs:
            # here's where we actually append a new Movie object
            # data contains the title, attrs['href'] the url
            # Anchors without a usable href (e.g. <a name="...">) are
            # skipped; the original indexed attrs['href'] and raised KeyError
            href = self.attrs.get('href')
            if href:
                theurl = 'http://www.imdb.com%s' % href
                self.movies.append(Movie(data, theurl))

        if self.gettitle:
            # NOTE(review): this only tests whether the FIRST CHARACTER of
            # the page title occurs somewhere in the search string --
            # presumably a loose "is this a movie page?" heuristic; confirm
            # before tightening it.
            if data[0] in self.srchtitle:  # this is a movie page
                self.movies.append(Movie(data, self.srchurl))

class MovieParser(HTMLParser.HTMLParser):
    """
    This parser is looking at an individual movie page

    Text inside <b> elements is checked for section labels ('Directed by',
    'Tagline', 'Plot Outline', 'Cast overview', 'Country', 'Language',
    'Runtime').  A label decides which attribute of the movie object the
    text that follows is written to; values are stored straight into the
    object passed to the constructor.
    """

    def __init__(self, themovie):
        HTMLParser.HTMLParser.__init__(self)
        self.themovie = themovie
        self.getdata = False   # True while inside a <b> element
        self.geturl = False    # collect text until the next </a>
        self.getmore = False   # collect text until the next <br>
        self.getcast = False   # collect cast rows until the next </table>
        self.char = False      # next cast chunk is the character name
        self.attr = None       # attribute name currently being filled

    def handle_starttag(self, tag, attrs):
        if tag == 'b':
            self.getdata = True
        if tag == 'br':
            self.getmore = False

    def handle_endtag(self, tag):
        if tag == 'b':
            self.getdata = False
        if tag == 'a':
            self.geturl = False
        if tag == 'table':
            self.getcast = False

    def handle_data(self, data):
        if self.getdata:
            # Label text: the checks are independent, so if several labels
            # occur in one chunk the last matching one decides self.attr.
            if 'Directed by' in data:
                self.geturl   = True
                self.attr     = 'Director'
            if 'Tagline' in data:
                self.getmore  = True
                self.attr     = 'Tagline'
            if 'Plot Outline' in data:
                self.getmore  = True
                self.attr     = 'Plot_Outline'
            if 'Cast overview' in data:
                self.getcast  = True
                self.attr     = 'Cast'
                self.themovie.Cast = [] # initialize actor list of [actor, character] pairs
            if 'Country' in data:
                self.geturl   = True
                self.attr     = 'Country'
            if 'Language' in data:
                self.geturl   = True
                self.attr     = 'Language'
            if 'Runtime' in data:
                self.getmore  = True
                self.attr     = 'Runtime'

        elif self.getcast:
            if '...' in data:
                # the dots separate actor from character name
                self.char = True

            elif self.char and isinstance(data, str):
                self.char = False
                role = data.strip()

                if role:
                    # Only fill in a character name when an actor entry
                    # exists; the original indexed Cast[-1] unconditionally
                    # and raised IndexError on an empty list.
                    if self.themovie.Cast:
                        self.themovie.Cast[-1][1] = role

                elif self.themovie.Cast:
                    # blank character name: drop the pending actor entry
                    self.themovie.Cast.pop()

            elif isinstance(data, str):
                actor = data.strip()
                if actor:
                    self.themovie.Cast.append([actor, ''])

        elif self.getmore or self.geturl:
            # Go through __dict__ rather than getattr(): Movie.__getattr__
            # hands back None for missing names, which cannot be
            # concatenated with a string.
            fields = self.themovie.__dict__
            fields[self.attr] = fields.get(self.attr, '') + data.strip()

                    
class Movie:
    """
    One movie: a title plus the URL of its page.

    getmovie() fetches that page and lets a MovieParser fill in further
    attributes on this object.  Attributes that were never set read as
    None instead of raising AttributeError.
    """

    def __init__(self, title, theurl):
        self.Title = title
        self.theurl = theurl

    def getmovie(self):
        """
        Fetch self.theurl and feed it, line by line, to a MovieParser
        that populates this object.
        """
        f = urllib2.urlopen(self.theurl)
        try:
            thepage = f.readlines()
        finally:
            # close the response even if reading it fails
            f.close()
        # notice that the movie object gets passed to the parser
        # for populating with additional attributes
        parser = MovieParser(self)
        for line in thepage:
            try:
                parser.feed(line)
            except HTMLParser.HTMLParseError:
                # deliberate best effort: skip markup the parser chokes on
                pass

    def __getattr__(self, attr):
        """
        Called only when normal lookup fails: missing attributes read as
        None.

        Double-underscore protocol names still raise AttributeError;
        answering None for those makes callers believe the object
        implements the protocol and then try to call None.
        """
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)
        return None


def test():
    """
    Run search() on a handful of sample titles, one after the other
    """
    for title in ("West Side Story", "Soylent Green", "Bullet",
                  "The School of Rock", "The Cable Guy"):
        search(title)

if __name__ == "__main__":
    # Command line use: the title may be given quoted as one argument or
    # as separate words.  With no argument, print usage and exit instead
    # of failing with IndexError on sys.argv[1].
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s "movie title"\n' % sys.argv[0])
        sys.exit(2)
    search(" ".join(sys.argv[1:]))
# code highlighted using py2html.py version 0.8