import HTMLParser import sys, glob thepath = 'D:/cygwin/home/Kirby/bf/buckminster.info/Biblio/By/' outpath = 'C:/Documents and Settings/Kirby/My Documents/Bucky/' class Parser(HTMLParser.HTMLParser): def __init__(self, thefile): HTMLParser.HTMLParser.__init__(self) self.row = [] self.col = '' self.trtag = False self.tdtag = False self.out = thefile def handle_starttag(self, tag, attrs): if tag=='tr': self.start_tr(attrs) if tag=='td': self.start_td(attrs) def handle_endtag(self, tag): if tag=='tr': self.end_tr() if tag=='td': self.end_td() def start_tr(self, attrs): self.trtag = True # print "" def start_td(self, attrs): self.tdtag = True # print "" def end_tr(self): self.trtag=False if len(self.row)>0 and not (len(self.row)==1 and len(self.row[0])==0): if len(self.row)>1: self.row[1] = (' '.join(self.row[1].split())).title() if len(self.row)>=3 and (self.row[-1].isdigit() or self.row[-1].isalpha()): if self.row[0]<>'-': self.out.write(' \n %s\n %s\n \n' % (self.row[0], self.row[1], self.row[-1]) ) else: self.out.write(' \n %s\n %s\n \n' % (self.row[1], self.row[-1]) ) if self.row[0]=='-' and self.row[-1]=='-': if self.row[1][:2]<>"By": self.out.write('\n') self.out.write('\n') self.out.write(' %s<title>\n' % self.row[1]) self.row = [] def end_td(self): # print "</td>" self.row.append(self.col.strip()) self.col = "" self.tdtag = False def handle_data(self, data): if self.trtag: if self.tdtag: self.col = self.col + ' '+ data.strip() def handle_charref(self, name): if self.trtag: pass # print name def handle_entityref(self, name): if self.trtag: pass # print name def getpage(thefile, outfile): thepage = open(thefile) parser = Parser(outfile) while True: line = thepage.read(5068) if not line: break parser.feed(line) parser.close() def test(): outfile = file(outpath + 'buckybooks.xml','w') files = files = glob.glob(thepath + "*.htm") #files.append('By-BkTOC-GrunchOfGiants.htm') #files.append('By-BkTOC-HumansInUniverse.htm') #files.append('By-BkTOC-OperatingManualForSpaceshipEarth.htm') for f in files: getpage(f, outfile) outfile.close()