import BeautifulSoup
import cStringIO
Any text input to ZTM needs to be processed. It is read as a file stream from
disk or sent as a stream from the browser. ZTM could just store it directly, but
we often want to do some extra processing, both before saving and before
outputting.
1. Sanitize HTML to remove any scripts and other potential security issues.
2. Check for internal links and add the required associations.
3. Add section wrappers if we want to insert a table of contents.
4. Prepare for indexing.
# Sample input for the section-wrapping demo that is printed further down.
# NOTE(review): this looks like heading/paragraph text whose HTML tags were
# lost somewhere along the way -- confirm against the original document.
markup = """\
text
subheading
paragraph
othersubheading
sub sub
empty
asdf
"""
import ztm.htmlsanitizer
# Outline of the processing pipeline: normalize the input, parse it, apply the
# whitelist and finally wrap the sections.
# NOTE(review): `imput`, `input_encoding` and `tidied` are not defined in the
# visible source; `tidied` presumably should be `normalized` -- confirm.
sanitizer = ztm.htmlsanitizer.sanitizer.Sanitizer()
normalized = sanitizer.normalize(imput, input_encoding) # libtidy or BeautifulSoup
tree = sanitizer.parse(tidied)
sanitizer.whitelist(tree)
# NOTE(review): called without the argument its definition requires -- confirm
# what should be passed here.
sectionwrap()
import string
def anchorid(text):
    r"""Convert the text of a header to an anchor id.

    The text is lower-cased, stripped of surrounding whitespace, has its
    spaces replaced by underscores and finally loses every character that is
    not an ASCII letter, a digit or an underscore.

    :param text: Header text; ``None`` is treated as an empty string.
    :type text: string or None
    :returns: the anchor id (possibly empty).

    >>> anchorid('\tSome heading to reduce \xF8 ')
    'some_heading_to_reduce_'
    >>> anchorid(None)
    ''
    """
    if text is None:
        # A caller may have no plain text to offer (e.g. a parser attribute
        # that is None for a header with nested markup); yield an empty id
        # instead of failing on ``None.lower()``.
        return ''
    # Built as a set so each membership test below is a constant-time lookup.
    allowedchars = frozenset(string.ascii_letters + string.digits + "_")
    normalized = text.lower().strip().replace(" ", "_")
    return ''.join([char for char in normalized if char in allowedchars])
def sectionwrap(soup):
    """Divide an HTML text into logical sections by headers.

    Every top-level ``h1``..``h6`` element opens a new section wrapper that
    carries an anchor id derived from the heading text. Before it opens, a
    heading closes every section that is still open at the same or a deeper
    level; sections still open after the last element are closed at the end.
    All elements, headings included, are copied into the output unchanged.

    :param soup: Markup to be wrapped in sections.
    :type soup: string or :class:BeautifulSoup.BeautifulSoup
    :returns: markup with wrapped sections.
    :rtype: :class:BeautifulSoup.BeautifulSoup
    """
    if not isinstance(soup, BeautifulSoup.BeautifulSoup):
        # Parse the argument itself rather than any module-level sample text.
        soup = BeautifulSoup.BeautifulSoup(soup)

    # NOTE(review): the wrapper markup had been reduced to empty/broken string
    # literals (the opening one failed with TypeError when formatted). A plain
    # <div id="..."> wrapper is a reconstruction -- confirm the intended tags.
    section_open = u'<div id="%s">'
    section_close = '</div>'
    heading_names = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    text = []
    stack = []       # heading names of the sections that are currently open
    anchors = set()  # anchor ids already handed out, to keep them unique
    for element in soup:
        name = getattr(element, 'name', None)
        if name in heading_names:
            # 'h1' < 'h2' < ... < 'h6' also holds for plain string comparison,
            # so this closes every open section of the same or a deeper level.
            while stack and stack[-1] >= name:
                stack.pop()
                text.append(section_close)
            stack.append(name)
            # A heading without a single text child has no ``string``; fall
            # back to an empty text so the id is still generated.
            anchor = anchorid(element.string or '')
            while anchor in anchors:
                anchor += '_'
            anchors.add(anchor)
            text.append(section_open % (anchor,))
        # NOTE(review): str() output is joined with a unicode literal; under
        # Python 2 that implies an implicit ASCII decode -- verify with
        # non-ASCII content.
        text.append(str(element))
    # Close whatever is still open at the end of the document.
    text.extend([section_close] * len(stack))
    return BeautifulSoup.BeautifulSoup(''.join(text))
# Demo: wrap the sample markup in sections and pretty-print the result.
# The function defined in this file is ``sectionwrap`` (``sectionwrapper`` does
# not exist), and it already returns a BeautifulSoup object, so the result is
# printed directly instead of being parsed a second time.
print(sectionwrap(markup).prettify())