converting multi-line records to single line record

converting multi-line records to single line record

Post by Michael P. Reill » Thu, 07 Jan 1999 04:00:00



: I need to convert several files that consist of multiple line HTML records
: into files that are single line, pipe-sign delimited records.

: The records always (I think!) start with a common string and end with a common
: string, but may have varying numbers/names of fields in between.

: Here is a sample record:

: <h3><FONT COLOR=BLUE><A NAME=Dennis>James A Dennis</font></h3> E-mail

:>&gt;</b><br>

: Home Page Address:<a
: href="http://www.netset.com/~jdennis">http://www.netset.com /~jdennis</a><br>
: No Street address given:<b>City: Columbus, OH USA 43228</b><BR>
: Employer:<b>Podunk Systems Inc.</b><br> Occupation:<b>System
: Administrator</b><br> Comments:<b>Testing 1-2-3 blah blah blah </b><br>
: Visited here via the CHS Alumni Page on:  Tue Jul 21 23:04:34 EDT 1998 <hr>

: I need this to come out without the titles, just the pure data, i.e.


: s|OH|43228|System Administrator|Alliance Data Systems|testing
: 1-2-3|jdennis|05-Jan-1999

I recently had to do the same thing for a web-based sysadm tool that my
company wants us to use.  It runs a UNIX command on multiple hosts, then
makes one (ill-formatted) document with the output from each.  I got
fed up with continually editing the output in vi, so I wrote a little
program in Python using the HTMLParser class from its htmllib module.
You can probably use it to help you make something more appropriate for you.

#!/usr/local/bin/python

import fileinput, getopt, os, string, sys
# base class for parsing HTML
from htmllib import HTMLParser
# base classes for writing parsed/formatted SGML/HTML documents
from formatter import AbstractFormatter, DumbWriter
# a memory-based file (native implementation is preferred)
try:
  from cStringIO import StringIO
except ImportError:
  from StringIO import StringIO

# directory to write parsed data files to
dir = os.curdir

# write the data parsed, determine if the data is a system error or output
# from the program on that host
class TOSSWriter(DumbWriter):
  """Buffer the text of one named section at a time and store or save it.

  new_file() starts a fresh in-memory buffer under a new name.  When a
  section is finished, write_data() classifies its text as '.out' or
  '.err' and either records it in the dictionary given as single=...
  or writes it to the file '<name><ext>'.
  """
  # if single=.... is given, then put the data in the given dictionary
  # and do not write to files; any other keyword argument is handed on
  # to DumbWriter unchanged
  def __init__(self, file, **kwargs):
    # 'single' is our own option, so remove it before delegating (it is
    # removed even when its value is None)
    single = kwargs.pop('single', None)
    DumbWriter.__init__(self, file, **kwargs)
    # section name -> (extension, text), or None to write real files
    self.dict = single
    # true while there is no section waiting to be written
    self.flushed = 1

  # classify the buffered text and store it in the dictionary or on disk
  def write_data(self):
    data = self.file.getvalue()
    # collapse a doubled trailing newline into a single one
    if data[-2:] == '\n\n':
      data = data[:-1]
    # text that starts with ' \n' is kept as output and loses that
    # two-character prefix; anything else is kept as an error and loses
    # its first character
    # NOTE(review): the reason for dropping one character in the error
    # case is not visible here - confirm against real TOSS output
    if data[:2] != ' \n':
      data, ext = data[1:], '.err'
    else:
      data, ext = data[2:], '.out'
    if self.dict is not None:
      self.dict[self.filename] = (ext, data)
    else:
      out = open('%s%s' % (self.filename, ext), 'w')
      # close the file even when the write fails
      try:
        out.write(data)
      finally:
        out.close()

  # write data to a new file; a section that is still pending is
  # written out first
  def new_file(self, name):
    if not self.flushed:
      self.write_data()
    self.flushed = 0
    self.file = StringIO()
    self.filename = name

  # write out the pending section; does nothing when no section was
  # started or it has already been written, so a stray or repeated call
  # is harmless
  def close_file(self):
    if not self.flushed:
      self.write_data()
    self.flushed = 1

# how should the data be formatted after it is parsed
class TOSSFormatter(AbstractFormatter):
  """Formatter that passes section start and end requests to its writer."""
  # start a section; the name is joined onto the module-level output
  # directory 'dir' before it is handed to the writer
  def new_file(self, name):
    writer = self.writer
    writer.new_file(os.path.join(dir, name))
  # finish the section currently being collected by the writer
  def close_file(self):
    writer = self.writer
    writer.close_file()

# subclass to separate into host-based output
class TOSSParser(HTMLParser):
  """HTML parser that turns <B> text into section names for the formatter.

  The text inside each <B>...</B> pair starts a new section through
  formatter.new_file(); </PRE> closes the current section through
  formatter.close_file(); </HTML> sets self.done.
  """
  # handle </HTML> tags - finish processing the document
  def end_html(self):
    HTMLParser.end_html(self)
    # the attribute only exists once </HTML> has been seen
    self.done = 1

  # handle <B> tags - start saving the text for later processing
  def start_b(self, attrs):
    HTMLParser.start_b(self, attrs)
    self.save_bgn()

  # handle </B> tags - send the saved text to the formatter
  def end_b(self):
    HTMLParser.end_b(self)
    saved = self.save_end()
    # the last character of the saved text is not part of the name
    # NOTE(review): presumably a trailing separator - confirm against
    # real TOSS output
    self.formatter.new_file(saved[:-1])

  # handle </PRE> tag - finish up the last host-based data segment
  def end_pre(self):
    self.formatter.close_file()
    # the base-class handler is still best effort, but KeyboardInterrupt
    # and SystemExit are no longer swallowed along with real errors
    try:
      HTMLParser.end_pre(self)
    except Exception:
      pass

def adjust_perms(perm):
  """Return perm with every bit that the process umask masks cleared."""
  # os.umask() only reports the old mask when a new one is installed,
  # so set a throwaway value and put the real mask straight back
  mask = os.umask(0)
  os.umask(mask)
  allowed = ~mask
  return perm & allowed

if __name__ == '__main__':
  # read the command line; a bad option ends the program with a message
  try:
    opts, args = getopt.getopt(sys.argv[1:], 'hd:s')
  except getopt.error:
    raise SystemExit("%s: %s" % (sys.argv[0], sys.exc_info()[1]))

  # stays None unless -s asks for one combined report on standard output
  singleoutput = None
  for opt, val in opts:
    if opt == '--':
      break
    if opt == '-h':
      sys.stdout.write('decodetoss [-h] [-s | -d <outdir>] <file> ...\n')
      raise SystemExit
    if opt == '-s':
      singleoutput = {}
    elif opt == '-d':
      dir = val
      # create the output directory on demand (octal 777, cut down by
      # the umask)
      if not os.path.isdir(dir):
        os.mkdir(dir, adjust_perms(int('777', 8)))

  # chain the pieces together: parser -> formatter -> writer
  writer = TOSSWriter(StringIO(), single=singleoutput)
  parser = TOSSParser(TOSSFormatter(writer))
  # without file arguments the document is read from standard input
  if not args:
    args = ['-']
  for chunk in fileinput.input(args):
    parser.feed(chunk)
  # the parser only gains a 'done' attribute once it has seen </HTML>
  if not hasattr(parser, 'done'):
    raise SystemExit('not a valid TOSS (HTML) output file')

  # print the combined report: one heading per host in sorted order,
  # output indented by two spaces and error text also marked with '!'
  try:
    if singleoutput:
      names = list(singleoutput.keys())
      names.sort()
      for name in names:
        ext, text = singleoutput[name]
        if ext == '.out':
          margin = '  '
        else:
          margin = '  !'
        sys.stdout.write('%s\n' % name)
        sys.stdout.write(margin + text.replace('\n', '\n' + margin) + '\n')
  # leave quietly on a broken pipe or Ctrl-C
  except (KeyboardInterrupt, IOError):
    raise SystemExit
# code end

You can get Python at http://www.python.org.  It shouldn't be too hard
to modify the TOSSParser and TOSSFormatter classes to scan and read
fields instead of raw text.  Based on what you have, you know that
records are enclosed in <H3>...</H3> tags, and each field is delimited
with a <B> (actually enclosed by them but...).
  def start_h3(self, attrs):
    HTMLParser.start_h3(self, attrs)
    self.formatter.start_new_record()
  def end_h3(self):
    HTMLParser.end_h3(self)
    self.formatter.end_existing_record()
  def start_b(self, attrs):
    HTMLParser.start_b(self, attrs)
    self.save_bgn()
  def end_b(self, attrs):
    HTMLParser.env_b(self)
    field_data = self.save_end()
    # field position would be determined by order of previous fields
    self.formatter.add_field(field_data)
  def do_br(self, attrs):
    pass  # just ignore line break tags (<BR>)

You might have to get some extra logic in there to handle the rest of
the parsing, but I hope this helps.

  -Arcege

 
 
 

converting multi-line records to single line record

Post by Cal Duniga » Thu, 14 Jan 1999 04:00:00



Quote:> Hello gurus

> I need to convert several files that consist of multiple line HTML records
> into files that are single line, pipe-sign delimited records.

> The records always (I think!) start with a common string and end with a common
> string, but may have varying numbers/names of fields in between.

I'm not certain I have the problem right, but awk is great for such
conversions.  Create an awk program like this:

    BEGIN{ ORS="|" }
    /start_string/,/end_string/{print $0}

Invoke it from the shell like this:
  for fil in *.html; do
    awk -f awkfile $fil > $fil.1
  done

If you do this often put the whole thing in a shell script.  The shell
quoting rules are left as an exercise for the student.

\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

      Consulting                   wrong with a world in which Ken
      Modeling                     Thompson lives in obscurity and
      Training                     Bill Gates is a famous billionaire.
//////////////////////////////////////////////////////////////////////