"""Text processing for ZTM.

Any text input to ZTM needs to be processed.  It is read as a file stream
from disk or sent as a stream from the browser.  ZTM could just store it
directly, but we often want to do some extra processing, both before saving
and before outputting:

1. Sanitize HTML to remove any scripts and other unwanted security issues.
2. Check for internal links and add the required associations.
3. Add section wrappers if we want to insert a table of contents.
4. Prepare for indexing.
"""

import cStringIO
import string

import BeautifulSoup

import ztm.htmlsanitizer

# Sample document used by the demo at the bottom of this module.
# NOTE(review): the heading/paragraph tags of this sample appear to have been
# stripped from the source; as it stands it contains no headings -- confirm
# and restore the intended markup.
markup = """\

Document title

text

subheading

paragraph

othersubheading

para

sub sub

empty

asdf

"""

# Intended processing pipeline.  This is a sketch only: ``input``,
# ``input_encoding`` and ``tidied`` are placeholders that are not defined
# anywhere in this module, so the code is deliberately not executed.
#
#     sanitizer = ztm.htmlsanitizer.sanitizer.Sanitizer()
#     normalized = sanitizer.normalize(input, input_encoding)  # libtidy or BeautifulSoup
#     tree = sanitizer.parse(tidied)
#     sanitizer.whitelist(tree)
#     sectionwrap()

# Characters that survive in a generated anchor id.  The text is lowercased
# before filtering, so upper-case letters never need to be listed.
_ANCHOR_CHARS = frozenset(string.ascii_lowercase + string.digits + "_")

# Tag names that start a new section.
_HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')


def anchorid(text):
    r"""Convert the text of a header to an anchor.

    The text is lowercased, surrounding whitespace is removed, spaces become
    underscores and every character other than an ASCII letter, a digit or
    an underscore is dropped.

    >>> anchorid('\tSome heading to reduce \xF8 ')
    'some_heading_to_reduce_'

    :param text: Header text.
    :type text: string
    :returns: the anchor name derived from ``text``.
    """
    text = text.lower().strip().replace(" ", "_")
    return ''.join([char for char in text if char in _ANCHOR_CHARS])


def _heading_text(element):
    """Return the text of a heading, including text inside nested tags."""
    # ``.string`` is None as soon as the heading holds anything other than a
    # single text node (for example emphasised text), so fall back to joining
    # all of its text nodes.
    if element.string is not None:
        return element.string
    return u''.join(element.findAll(text=True))


def sectionwrap(soup):
    """Divide an HTML text into logical sections by headers.

    Every heading (``h1`` .. ``h6``) found among the top-level nodes of the
    document opens a new wrapper element that carries a unique anchor id
    derived from the heading text.  A heading first closes every open
    section of the same or a deeper level, so the wrappers nest according to
    the heading hierarchy.  Sections still open at the end of the document
    are closed there.

    :param soup: Markup to be wrapped in sections.
    :type soup: string or :class:BeautifulSoup.BeautifulSoup
    :returns: markup with wrapped sections.
    :rtype: :class:BeautifulSoup.BeautifulSoup
    """
    if not isinstance(soup, BeautifulSoup.BeautifulSoup):
        # Parse the argument itself (not the module-level sample document).
        soup = BeautifulSoup.BeautifulSoup(soup)

    # NOTE(review): the wrapper markup was lost from the source (the format
    # string had no ``%s`` placeholder left and the closing tag was empty).
    # A plain ``div`` carrying the anchor id is assumed -- confirm the
    # intended tag and attributes.
    section_open = u'<div id="%s">'
    section_close = u'</div>'

    text = []
    stack = []      # Heading levels of the sections that are currently open.
    anchors = set()  # Anchor ids handed out so far.
    for element in soup:
        name = getattr(element, 'name', None)
        if name in _HEADINGS:
            # Close every open section of the same or a deeper level.  The
            # names differ only in their single digit, so plain string
            # comparison orders them by level.
            while stack and stack[-1] >= name:
                stack.pop()
                text.append(section_close)
            stack.append(name)

            # Headings without any usable character still need a valid id.
            anchor = anchorid(_heading_text(element)) or u'section'
            while anchor in anchors:
                anchor += '_'
            anchors.add(anchor)
            text.append(section_open % anchor)
        # ``unicode`` rather than ``str``: ``str`` yields UTF-8 encoded bytes
        # which cannot be joined with the unicode wrapper markup once the
        # document contains non-ASCII characters.
        text.append(unicode(element))

    # Close whatever is still open at the end of the document.
    text.extend([section_close] * len(stack))
    return BeautifulSoup.BeautifulSoup(u''.join(text))


if __name__ == "__main__":
    print(sectionwrap(markup).prettify())