import HTMLParser
import sys, glob
# Directory whose "*.htm" pages test() globs and converts. The trailing slash
# matters: the glob pattern is appended to this string directly.
thepath = 'D:/cygwin/home/Kirby/bf/buckminster.info/Biblio/By/'
# Directory in which test() creates 'buckybooks.xml'. The file name is
# appended directly, so the trailing slash is required here as well.
outpath = 'C:/Documents and Settings/Kirby/My Documents/Bucky/'
class Parser(HTMLParser.HTMLParser):
    """Pull the cells out of HTML table rows and write selected rows to a file.

    Text is collected cell by cell while the parser is inside both a ``<tr>``
    and a ``<td>``.  When a row closes, ``end_tr`` classifies it from its
    first and last cells and writes the matching text to the output file.
    """

    def __init__(self, thefile):
        """Create a parser that writes to *thefile*, an open writable file object."""
        HTMLParser.HTMLParser.__init__(self)
        self.row = []        # stripped text of every cell closed so far in this row
        self.col = ''        # text accumulated for the cell currently being read
        self.trtag = False   # True between <tr> and </tr>
        self.tdtag = False   # True between <td> and </td>
        self.out = thefile

    def handle_starttag(self, tag, attrs):
        """Dispatch opening ``tr`` / ``td`` tags; every other tag is ignored."""
        if tag == 'tr':
            self.start_tr(attrs)
        elif tag == 'td':
            self.start_td(attrs)

    def handle_endtag(self, tag):
        """Dispatch closing ``tr`` / ``td`` tags; every other tag is ignored."""
        if tag == 'tr':
            self.end_tr()
        elif tag == 'td':
            self.end_td()

    def start_tr(self, attrs):
        """Note that a table row has been entered (*attrs* is unused)."""
        self.trtag = True

    def start_td(self, attrs):
        """Note that a table cell has been entered (*attrs* is unused)."""
        self.tdtag = True

    def end_tr(self):
        """Leave the current row, write it out if it qualifies, and reset it.

        Rows with no cells, or with a single empty cell, are dropped.  The
        second cell (when present) has its whitespace collapsed and is
        title-cased.  After that:

        * three or more cells whose last cell is all digits or all letters
          are written as an entry; a first cell of ``'-'`` means "no value"
          and is left out of the output;
        * a row whose first and last cells are both ``'-'`` is written as a
          heading taken from the second cell.
        """
        self.trtag = False
        row, self.row = self.row, []

        if not row or row == ['']:
            return

        if len(row) > 1:
            row[1] = ' '.join(row[1].split()).title()
        last = row[-1]

        # NOTE(review): test() names the output 'buckybooks.xml', yet none of
        # the strings below carry any markup -- it looks like tags were lost
        # from these literals.  Confirm against the original script.
        if len(row) >= 3 and (last.isdigit() or last.isalpha()):
            if row[0] != '-':
                # The original string had two placeholders for three values
                # and raised TypeError.  The first cell now goes on the
                # entry's opening line so both branches emit four lines.
                self.out.write(' %s\n %s\n %s\n \n' % (row[0], row[1], last))
            else:
                self.out.write(' \n %s\n %s\n \n' % (row[1], last))

        # Requiring a second cell avoids the IndexError the original hit on a
        # row consisting of a single '-' cell.
        if len(row) > 1 and row[0] == '-' and last == '-':
            # NOTE(review): the source indentation was lost.  This assumes
            # only the two separator lines are skipped for headings starting
            # with "By" and the heading text is always written -- confirm.
            if not row[1].startswith("By"):
                self.out.write('\n')
                self.out.write('\n')
            self.out.write(' %s\n' % row[1])

    def end_td(self):
        """Close the current cell and append its stripped text to the row."""
        self.row.append(self.col.strip())
        self.col = ""
        self.tdtag = False

    def handle_data(self, data):
        """Append text to the current cell; text outside a row's cell is ignored."""
        if self.trtag and self.tdtag:
            # Pieces are joined with a single space; end_td strips the ends.
            self.col = self.col + ' ' + data.strip()

    def handle_charref(self, name):
        """Ignore numeric character references; they add nothing to the cell."""
        pass

    def handle_entityref(self, name):
        """Ignore named entity references; they add nothing to the cell."""
        pass
def getpage(thefile, outfile):
    """Run one HTML page through Parser, writing its rows to *outfile*.

    thefile -- path of the HTML file to read.
    outfile -- open, writable file object handed to Parser.

    The page is fed to the parser in fixed-size chunks rather than being
    read into memory in one piece.
    """
    # The original never closed the input file; ``with`` releases the handle
    # even if the parser raises part-way through the page.
    with open(thefile) as thepage:
        parser = Parser(outfile)
        while True:
            chunk = thepage.read(5068)
            if not chunk:
                break
            parser.feed(chunk)
    # Tell the parser the input is complete so buffered data is processed.
    parser.close()
def test():
    """Convert every ``*.htm`` page under ``thepath`` into one output file.

    The output is ``outpath + 'buckybooks.xml'``; it is created (or
    truncated) and each page found by the glob is passed to ``getpage`` in
    the order ``glob.glob`` returns it.
    """
    # ``open`` replaces the Python-2-only ``file`` builtin, and ``with``
    # closes the output even when a page fails to parse.
    with open(outpath + 'buckybooks.xml', 'w') as outfile:
        for f in glob.glob(thepath + "*.htm"):
            getpage(f, outfile)