: I need to convert several files that consist of multiple line HTML records
: into files that are single line, pipe-sign delimited records.
: The records always (I think!) start with a common string and end with a common
: string, but may have varying numbers/names of fields in between.
: Here is a sample record:
: <h3><FONT COLOR=BLUE><A NAME=Dennis>James A Dennis</font></h3> E-mail
:>></b><br>
: Home Page Address:<a
: href="http://www.netset.com/~jdennis">http://www.netset.com /~jdennis</a><br>
: No Street address given:<b>City: Columbus, OH USA 43228</b><BR>
: Employer:<b>Podunk Systems Inc.</b><br> Occupation:<b>System
: Administrator</b><br> Comments:<b>Testing 1-2-3 blah blah blah </b><br>
: Visited here via the CHS Alumni Page on: Tue Jul 21 23:04:34 EDT 1998 <hr>
: I need this to come out without the titles, just the pure data, i.e.
: s|OH|43228|System Administrator|Alliance Data Systems|testing
: 1-2-3|jdennis|05-Jan-1999
I recently had to do the same thing for a web-based sysadm tool that my
company wants us to use. It runs a UNIX command on multiple hosts, then
makes one (ill-formatted) document with the output from each. I got
fed up with continually editing the output in vi, so I wrote a little
program in Python using the HTMLParser class from its htmllib module.
You can probably use it to help you make something more appropriate for you.
#!/usr/local/bin/python
import fileinput, getopt, os, string, sys
# base class for parsing HTML
from htmllib import HTMLParser
# base classes for writing parsed/formatted SGML/HTML documents
from formatter import AbstractFormatter, DumbWriter
# a memory-based file (native implementation is preferred)
# Prefer the C implementation of the in-memory file; fall back to the pure
# Python module when it is not available.
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# directory to write parsed data files to
dir = os.curdir
# write the data parsed, determine if the data is a system error or output
# from the program on that host
class TOSSWriter(DumbWriter):
    """DumbWriter that buffers one segment of text at a time and stores it.

    Text for the current segment accumulates in an in-memory file.  When the
    segment ends, the text is classified as program output ('.out') or as a
    system error ('.err') and is either written to '<filename><ext>' or, when
    a dictionary was supplied as single=..., stored in that dictionary as
    dictionary[filename] = (ext, data).
    """

    def __init__(self, file, **kw):
        """Set up the writer.

        file   -- initial output file object, handed to DumbWriter
        single -- optional keyword; if given, put the data in this dictionary
                  and do not write to files
        Any other keyword arguments are passed through to DumbWriter.
        """
        # 'single' belongs to this class; DumbWriter must not see it
        try:
            single = kw['single']
            del kw['single']
        except KeyError:
            single = None
        DumbWriter.__init__(self, file, **kw)
        self.dict = single
        # no segment is open yet, so there is nothing to flush
        self.flushed = 1
        self.filename = None

    def write_data(self):
        """Classify the buffered text and store it under self.filename."""
        data = self.file.getvalue()
        # a trailing blank line is reduced to a single newline
        if data[-2:] == '\n\n':
            data = data[:-1]
        # text that starts with ' \n' is program output (that prefix is
        # removed); anything else is an error and loses its first character
        if data[:2] == ' \n':
            data, ext = data[2:], '.out'
        else:
            data, ext = data[1:], '.err'
        if self.dict is not None:
            self.dict[self.filename] = (ext, data)
        else:
            out = open('%s%s' % (self.filename, ext), 'w')
            # close the file even if the write fails
            try:
                out.write(data)
            finally:
                out.close()

    # write data to a new file
    def new_file(self, name):
        """Start buffering a segment called name, storing any open one first."""
        if not self.flushed:
            self.write_data()
        self.flushed = 0
        self.file = StringIO()
        self.filename = name

    def close_file(self):
        """Store the open segment; do nothing when no segment is open."""
        if not self.flushed:
            self.write_data()
        self.flushed = 1
# how should the data be formatted after it is parsed
class TOSSFormatter(AbstractFormatter):
    """Formatter that forwards segment start/end events to its writer."""

    def new_file(self, name):
        """Begin a new segment; its file name is name inside the output dir."""
        # 'dir' is the module-level output directory, not the builtin
        fname = os.path.join(dir, name)
        self.writer.new_file(fname)

    def close_file(self):
        """Finish the current segment."""
        self.writer.close_file()
# subclass to separate into host-based output
class TOSSParser(HTMLParser):
    """HTML parser that splits the document into host-based segments.

    The text of each <B>...</B> element names the next segment, </PRE>
    finishes the last segment, and </HTML> marks the document as complete.
    """

    # handle </HTML> tags - finish processing the document
    def end_html(self):
        HTMLParser.end_html(self)
        # the attribute is only created here; the main program tests for its
        # existence with hasattr(), so it must not be pre-set elsewhere
        self.done = 1

    # handle <B> tags - start saving the text for later processing
    def start_b(self, attrs):
        HTMLParser.start_b(self, attrs)
        self.save_bgn()

    # handle </B> tags - send the saved text to the formatter
    def end_b(self):
        HTMLParser.end_b(self)
        data = self.save_end()
        # the last character of the saved text is not part of the name
        # (presumably a trailing separator -- TODO confirm against real input)
        self.formatter.new_file(data[:-1])

    # handle </PRE> tag - finish up the last host-based data segment
    def end_pre(self):
        self.formatter.close_file()
        # best effort: a failure in the base handler is ignored, but
        # KeyboardInterrupt and SystemExit are allowed to propagate
        try:
            HTMLParser.end_pre(self)
        except Exception:
            pass
def adjust_perms(perm):
    """Return the given permissions with the bits in the umask cleared.

    os.umask() can only report the current mask by installing a new one, so
    the mask is set to 0 to read it and is then restored straight away.
    """
    umask = os.umask(0)
    os.umask(umask)
    return perm & ~umask
if __name__ == '__main__':
singleoutput = None
# get command-line arguments and process the options
try:
opts, args = getopt.getopt(sys.argv[1:], 'hd:s')
except getopt.error, value:
raise SystemExit, "%s: %s" % (sys.argv[0], value)
for opt, val in opts:
if opt == '--':
break
elif opt == '-h':
print 'decodetoss [-h] [-s | -d <outdir>] <file> ...'
raise SystemExit
elif opt == '-d':
dir = val
# create the directory if it does not exist
if not os.path.isdir(dir):
os.mkdir(dir, adjust_perms(0777))
elif opt == '-s':
singleoutput = {}
# create HTML parser
writer = TOSSWriter(StringIO(), single=singleoutput)
parser = TOSSParser(TOSSFormatter(writer))
if not args:
args.insert(0, '-')
# parse the HTML document from standard input or filenames on command line
for line in fileinput.input(args):
parser.feed(line)
# did we finish?
if not hasattr(parser, 'done'):
raise SystemExit, 'not a valid TOSS (HTML) output file'
try:
# output the data in a formatted strings
if singleoutput:
hosts = singleoutput.keys()
hosts.sort()
for host in hosts:
type, data = singleoutput[host]
sys.stdout.write('%s\n' % host)
if type == '.out':
lines = string.joinfields(string.splitfields(data, '\n'), '\n ')
sys.stdout.write(' ' + lines + '\n')
else:
lines = string.joinfields(string.splitfields(data, '\n'), '\n !')
sys.stdout.write(' !' + lines + '\n')
# this is so the program exits gracefully on pipe breaks and Ctnl-C keys
except (KeyboardInterrupt, IOError):
raise SystemExit
# code end
You can get Python at http://www.python.org. It shouldn't be too hard
to modify the TOSSParser and TOSSFormatter classes to scan and read
fields instead of raw text. Based on what you have, you know that
records are enclosed in <H3>...</H3> tags, and each field is delimited
with a <B> (actually enclosed by them but...).
# Sketch of handlers to put in the HTMLParser subclass.  The formatter
# methods start_new_record(), end_existing_record() and add_field() are
# not defined anywhere yet; they still have to be written.

def start_h3(self, attrs):
    # <H3> opens a record
    HTMLParser.start_h3(self, attrs)
    self.formatter.start_new_record()

def end_h3(self):
    HTMLParser.end_h3(self)
    self.formatter.end_existing_record()

def start_b(self, attrs):
    # <B> starts a field: begin saving its text
    HTMLParser.start_b(self, attrs)
    self.save_bgn()

def end_b(self):
    # end-tag handlers are called without an attribute list
    HTMLParser.end_b(self)
    field_data = self.save_end()
    # field position would be determined by order of previous fields
    self.formatter.add_field(field_data)

def do_br(self, attrs):
    pass # just ignore line break tags (<BR>)
You might have to get some extra logic in there to handle the rest of
the parsing, but I hope this helps.
-Arcege