import HTMLParser
import sys, glob

# Directory prefix that test() globs for input "*.htm" pages.
thepath = 'D:/cygwin/home/Kirby/bf/buckminster.info/Biblio/By/'
# Directory prefix under which test() creates 'buckybooks.xml'.
outpath = 'C:/Documents and Settings/Kirby/My Documents/Bucky/'

class Parser(HTMLParser.HTMLParser):
    """Turn HTML table rows into <book>/<chapter> XML written to a file.

    The text of every <td> inside a <tr> is collected into a row.  When the
    row ends it is classified:

    * chapter row -- at least three cells and the last cell consists only of
      digits or only of letters (a page number such as "12" or "xi").  The
      first cell becomes the chapter id unless it is '-'.
    * book row -- first and last cells are both '-' and the (title-cased)
      second cell does not start with "By".  It opens a new <book> element.

    Any other row is ignored.  The second cell of every multi-cell row is
    whitespace-normalised and title-cased before use.
    """

    def __init__(self, thefile):
        """Create a parser that writes its XML to *thefile*.

        thefile -- an open, writable file-like object (only write() is used).
        """
        HTMLParser.HTMLParser.__init__(self)
        self.row = []           # cell texts of the <tr> being read
        self.col = ''           # text gathered so far for the current <td>
        self.trtag = False      # True while inside a <tr>
        self.tdtag = False      # True while inside a <td>
        self.out = thefile
        self.book_open = False  # True once a <book> was written and not closed

    def handle_starttag(self, tag, attrs):
        """Dispatch opening <tr>/<td> tags; all other tags are ignored."""
        if tag == 'tr':
            self.start_tr(attrs)
        elif tag == 'td':
            self.start_td(attrs)

    def handle_endtag(self, tag):
        """Dispatch closing </tr>/</td> tags; all other tags are ignored."""
        if tag == 'tr':
            self.end_tr()
        elif tag == 'td':
            self.end_td()

    def start_tr(self, attrs):
        """Note that a table row has started."""
        self.trtag = True

    def start_td(self, attrs):
        """Note that a table cell has started."""
        self.tdtag = True

    def end_tr(self):
        """Classify the finished row and write the matching XML, if any."""
        self.trtag = False
        row = self.row
        self.row = []

        # Skip rows without cells and rows made of one empty cell.
        if not row or (len(row) == 1 and not row[0]):
            return

        if len(row) > 1:
            # Collapse runs of whitespace, then capitalise each word.
            row[1] = ' '.join(row[1].split()).title()

        last = row[-1]
        if len(row) >= 3 and (last.isdigit() or last.isalpha()):
            if row[0] != '-':
                self.out.write('   <chapter id="%s">\n      <title>%s</title>\n      <page>%s</page>\n   </chapter>\n'
                               % (row[0], row[1], last))
            else:
                self.out.write('   <chapter>\n      <title>%s</title>\n      <page>%s</page>\n   </chapter>\n'
                               % (row[1], last))

        # The length check keeps a lone '-' cell from indexing row[1].
        if len(row) > 1 and row[0] == '-' and last == '-':
            if row[1][:2] != "By":
                # Only close a book that was actually opened, so the output
                # never starts with a stray </book>.
                self._close_book()
                self.out.write('<book>\n')
                self.out.write('   <title>%s</title>\n' % row[1])
                self.book_open = True

    def end_td(self):
        """Store the finished cell's text in the current row."""
        self.row.append(self.col.strip())
        self.col = ""
        self.tdtag = False

    def handle_data(self, data):
        """Append text found inside a <td> of a <tr> to the current cell."""
        if self.trtag and self.tdtag:
            self.col = self.col + ' ' + data.strip()

    def handle_charref(self, name):
        """Ignore numeric character references."""
        pass

    def handle_entityref(self, name):
        """Ignore named entity references."""
        pass

    def close(self):
        """Finish parsing and close the last <book> element, if one is open."""
        HTMLParser.HTMLParser.close(self)
        self._close_book()

    def _close_book(self):
        """Write </book> when a <book> element is currently open."""
        if self.book_open:
            self.out.write('</book>\n')
            self.book_open = False

def getpage(thefile, outfile):
    """Parse one HTML page and write the extracted XML to *outfile*.

    thefile -- path of the HTML file to read.
    outfile -- open, writable file object handed to the Parser.

    The page is fed to the parser in fixed-size chunks.  The input file is
    closed even when reading or parsing raises.
    """
    parser = Parser(outfile)
    thepage = open(thefile)
    try:
        while True:
            chunk = thepage.read(5068)
            if not chunk:
                break
            parser.feed(chunk)
    finally:
        thepage.close()
    # Let the parser process any input it is still buffering.
    parser.close()

def test():
    """Convert every "*.htm" page under thepath into buckybooks.xml.

    The output file is created under outpath and receives the XML of all
    pages in the order glob returns them.  It is closed even when a page
    fails to parse.
    """
    outfile = open(outpath + 'buckybooks.xml', 'w')
    try:
        for f in glob.glob(thepath + "*.htm"):
            getpage(f, outfile)
    finally:
        outfile.close()
    
# code highlighted using py2html.py version 0.8