The xmllib module provides a simple XML parser, using regular expressions to
pull the XML data apart, as shown in Example 5-1. The parser does basic checks on the
document, such as a check to see that there is only one top-level element
and a check to see that all tags are balanced.
You feed XML data to this parser piece by piece (as data arrives over a network, for example). The parser calls methods in itself for start tags, data sections, end tags, and entities, among other things.
If you’re only interested in a few tags, you can define special start_tag and end_tag methods, where tag is the tag name. The start functions are called with the attributes given as a dictionary.
Example 5-1. Using the xmllib Module to Extract Information from an Element
File: xmllib-example-1.py
import xmllib
class Parser(xmllib.XMLParser):
    """Report the ``id`` attribute of a ``quotation`` element.

    The base parser dispatches each ``<quotation>`` start tag to
    ``start_quotation``; this class prints the element's ``id`` and then
    aborts the parse by raising ``EOFError``.
    """
    # get quotation number

    def __init__(self, file=None):
        """Initialise the parser; if *file* is given, parse it immediately.

        *file* is any object with a ``read(size)`` method.
        """
        xmllib.XMLParser.__init__(self)
        if file:
            self.load(file)

    def load(self, file):
        """Feed *file* to the parser in 512-byte pieces, then close the parser.

        Raises EOFError if a ``quotation`` element is encountered (see
        ``start_quotation``).
        """
        while 1:
            s = file.read(512)
            if not s:
                break
            self.feed(s)
        self.close()

    def start_quotation(self, attrs):
        """Handle a ``<quotation>`` start tag.

        *attrs* is the tag's attributes as a dictionary.  Prints the ``id``
        attribute and raises EOFError so the caller can stop reading.
        """
        # Single-argument, parenthesised form: prints "id => <value>" exactly
        # like the Python 2 statement ``print "id =>", value``.
        print("id => %s" % attrs.get("id"))
        # Nothing else in the document is of interest; bail out of the parse.
        raise EOFError
# Parse the sample document; the handle is closed even when the parse is
# cut short.
source = open("samples/sample.xml")
try:
    c = Parser()
    c.load(source)
except EOFError:
    # Parser.start_quotation raises EOFError once it has printed the id;
    # that is the expected way for this example to finish.
    pass
finally:
    source.close()
id => 031
Example 5-2 contains a simple (and incomplete) rendering
engine. The parser maintains an element stack (__tags), which it passes to the renderer, together
with text fragments. The renderer looks up the current tag hierarchy
in a style dictionary, and if it isn’t already there, it creates a new
style descriptor by combining bits and pieces from the stylesheet.
Example 5-2. Using the xmllib Module
# File: xmllib-example-2.py

import xmllib
import string
import sys

STYLESHEET = {
    # each element can contribute one or more style elements
    "quotation": {"style": "italic"},
    "lang": {"weight": "bold"},
    "name": {"weight": "medium"},
}


class Parser(xmllib.XMLParser):
    """A simple styling engine.

    Keeps a stack of the currently open element names and hands each run
    of character data to the renderer together with that stack.
    """

    def __init__(self, renderer):
        """Create a parser that reports text to *renderer*.

        *renderer* must provide a ``text(tags, text)`` method.
        """
        xmllib.XMLParser.__init__(self)
        self.__data = []  # text fragments collected since the last tag
        self.__tags = []  # names of the currently open elements
        self.__renderer = renderer

    def load(self, file):
        """Feed *file* to the parser in 8192-byte pieces, then close the parser."""
        while 1:
            s = file.read(8192)
            if not s:
                break
            self.feed(s)
        self.close()

    def handle_data(self, data):
        """Buffer a fragment of character data until the next tag."""
        self.__data.append(data)

    def unknown_starttag(self, tag, attrs):
        """Render the text seen so far, then push *tag* on the element stack."""
        self.__flush()
        self.__tags.append(tag)

    def unknown_endtag(self, tag):
        """Render the element's trailing text, then pop the element stack."""
        # Flush BEFORE popping: the buffered text belongs to the element
        # that is being closed, so its tag must still be on the stack when
        # the renderer picks the style.
        self.__flush()
        self.__tags.pop()

    def __flush(self):
        """Send any buffered text to the renderer and empty the buffer."""
        if self.__data:
            text = string.join(self.__data, "")
            self.__renderer.text(self.__tags, text)
        self.__data = []


class DumbRenderer:
    """Write each text fragment to standard output with its combined style."""

    def __init__(self):
        # maps a tuple of open element names to the combined style dictionary
        self.cache = {}

    def text(self, tags, text):
        """Render *text* in the style given by the tag stack *tags*.

        The style is built by merging the STYLESHEET entries of every tag
        in *tags*, outermost first, and is cached per tag combination.
        """
        tags = tuple(tags)
        style = self.cache.get(tags)
        if style is None:
            # figure out a combined style
            style = {}
            for tag in tags:
                s = STYLESHEET.get(tag)
                if s:
                    style.update(s)
            self.cache[tags] = style  # update cache
        # write to standard output: the style on one line, the text on the next
        sys.stdout.write("%s =>\n" % style)
        sys.stdout.write(" " + repr(text) + "\n")


#
# try it out

r = DumbRenderer()
c = Parser(r)
source = open("samples/sample.xml")
try:
    c.load(source)
finally:
    source.close()

# Output (continues on the following lines):
# {'style': 'italic'} =>
'I\'ve had a lot of developers come up to me and\012say, "I haven\'t had this much fun in a long time. It sure beats\012writing '
{'style': 'italic', 'weight': 'bold'} =>
'Cobol'
{'style': 'italic'} =>
'" -- '
{'style': 'italic', 'weight': 'medium'} =>
'James Gosling'
{'style': 'italic'} =>
', on\012'
{'weight': 'bold'} =>
'Java'
{'style': 'italic'} =>
'.'
3.142.12.207