#!/usr/bin/env python
#
# HtmlTableOfContents.py
# (c) 2006-1007 Gene Thomas
# http://genethomas.com
# http://genethomas.com/software/HtmlTableOfContents.py
#
# v1.3
#
# o Updates a html file inserting a table of contents between the comments
# and
# o Adds numbering to the headers. Removing number put there previously be this program.
# o Can be re-run on html file again.
# o Saves original as .bak
# o If fails, e.g. no does not overwrite file
# o Paranoidly checks all HTML tags match up and will not overwrite an invlid html file
#
# Version History:
#
# 1.3
# o Add as non closing tag
# o Handle and manually inserted in headings
# Stripped in heading in table of contents
import sys
import HTMLParser
import os
import re
import cgi # for escape()
import string
numbering = True
debugFlag = os.getenv("DEBUG") != None
def debug(text):
if debugFlag:
print text
debug("DEBUG set, writing debug information")
def comment(text):
return ""
def quote(text):
return cgi.escape(text, 1)
def decl(text):
return ""
def indent(num):
return " " * num
def entity(text):
return "&" + text + ";"
def charref(text):
return "" + text + ";"
def stripATags(text):
debug("stripATags(" + text + ")")
text = re.sub("]*>", "", text)
text = re.sub("", "", text)
debug(" " + text)
return text
class TOCParser(HTMLParser.HTMLParser):
def __init__(self, filename):
# parsing constants
self.HEADERS = ["", "h1", "h2", "h3", "h4", "h5", "h6"] # index is number
self.START_ENDS = ["hr", "br", "meta", "link"] # tags that don't has close tags
self.OPTIONAL_ENDS = ["p", "img", "frame"] # tags that don't have to have close tags
HTMLParser.HTMLParser.__init__(self)
self.filename = filename
self.prelude = ""
self.toc = ""
self.rest = ""
self.TOC = "toc"
self.END_TOC = "end toc"
self.path = [] # list of tags, e.g. ["html", "body"]
self.tagLines = [] # list of (int line, int column) positionsself.path[x] tags appeared on
# STATES
self.sPRELUDE = "prelude"
self.sTOC = "toc"
self.sREST = "rest"
self.sHEADER = "header"
self.state = self.sPRELUDE
self.topHeader = None # int level
self.lastHeader = None # last header written to TOC, int level form tag
self.currentHeaderText = "" # in sHEADER: include
self.lastHeaderNumber = [] # list, e.g. [1, 2, 1] for 1.2.1
def getHeaderAnchor(self):
# [1, 2, 3] -> "1.2.3"
return string.join([str(n) for n in self.lastHeaderNumber], ".")
def close(self):
if self.lastHeader != None:
while self.topHeader <= self.lastHeader:
self.lastHeader = self.lastHeader - 1
self.toc = self.toc + indent(self.lastHeader-self.topHeader+1) + "\n"
HTMLParser.HTMLParser.close(self)
def debug(self, text):
line, col = self.getpos()
debug(self.state + " " + str(line) + " " + str(self.path) + " " + text)
def read(self):
self.close()
if self.state == self.sPRELUDE:
self.fail("Missing tags")
elif self.state == self.sTOC:
self.fail("Missing tag")
elif self.state == self.sHEADER:
self.fail("Missing tag")
elif self.state == self.sREST:
if self.path != []:
self.fail("Missing close tags for open tags: " + str(self.path));
else:
self.fail("bad state " + self.state)
# don't add \ns
return self.prelude + comment(self.TOC) + self.toc + comment(self.END_TOC) + self.rest
def fail(self, reason):
line, col = self.getpos()
raise Exception(filename + ":" + str(line) + ":" + str(col) + " " + reason)
def handle_starttag(self, tag, attrs):
self.debug("handle_starttag(" + tag + ")")
if tag in self.START_ENDS:
# tags that aren't ended as per XML
self.handle_startendtag(tag, attrs)
return
self.path.append(tag)
self.tagLines.append(self.getpos())
if self.state == self.sPRELUDE:
self.prelude = self.prelude + self.get_starttag_text()
elif self.state == self.sTOC:
pass
elif self.state == self.sREST:
if tag in self.HEADERS:
while self.path[:-1] != ["html", "body"] :
if len(self.path) < 2:
self.fail("<" + tag + "> outside ")
# e.g. ["html", "body", "p", "h3"]
if self.path[-2] in self.OPTIONAL_ENDS:
del self.path[-2]
del self.tagLines[-2]
else:
self.fail("<" + tag + "> outside , is in " + string.join(["<" + t + ">" for t in self.path[:-1]], "") + " from lines " + \
string.join([str(line) + ":" + str(col) for (line, col) in self.tagLines[:-1]], ", "))
self.state = self.sHEADER
num = self.HEADERS.index(tag)
if self.topHeader != None and num < self.topHeader:
self.fail("first header is not to level header, e.g. followed by ")
self.currentHeaderText = ""
else:
self.rest = self.rest + self.get_starttag_text()
elif self.state == self.sHEADER:
if tag == "a":
for (at, av) in attrs:
if at == "name" and av.startswith("part"):
# ignore we put in titles
return
if tag in self.HEADERS:
self.fail("nested headers")
self.currentHeaderText = self.currentHeaderText + self.get_starttag_text()
else:
self.fail("bad state " + self.state)
def handle_startendtag(self, tag, attrs):
self.debug("handle_startendtag(" + tag + ")")
if self.state == self.sPRELUDE:
self.prelude = self.prelude + self.get_starttag_text()
elif self.state == self.sTOC:
pass
elif self.state == self.sREST:
self.rest = self.rest + self.get_starttag_text()
elif self.state == self.sHEADER:
self.currentHeaderText = self.currentHeaderText + self.get_starttag_text()
else:
self.fail("bad state " + self.state)
def handle_endtag(self, tag):
self.debug("handle_endtag(" + tag + ")")
if self.path == []:
self.fail("<" + tag + "> missing")
while self.path[-1] != tag:
if self.path[-1] in self.OPTIONAL_ENDS:
del self.path[-1]
del self.tagLines[-1]
if self.path == []:
self.fail("<" + tag + "> missing")
else:
self.fail("" + tag + "> does not match <" + self.path[-1] + ">")
del self.path[-1]
del self.tagLines[-1]
if self.state == self.sPRELUDE:
self.prelude = self.prelude + "" + tag + ">"
elif self.state == self.sTOC:
pass
elif self.state == self.sHEADER:
if tag == "a":
if self.currentHeaderText.count(" self.currentHeaderText.count(" we put in titles
return
if tag in self.HEADERS:
self.debug("END HEADER " + self.currentHeaderText)
if self.path != ["html", "body"] :
self.fail("" + tag + "> outside , is in " + string.join(["<" + t + ">" for t in self.path[:-1]], "") + " from lines " + \
string.join([str(line) + ":" + str(col) for (line, col) in self.tagLines[:-1]], ", "))
self.state = self.sREST
num = self.HEADERS.index(tag)
if self.lastHeader == None:
self.topHeader = num
self.toc = "\nContents\n" + \
"\n" + \
"\n"
self.lastHeaderNumber = [1]
else:
while num > self.lastHeader:
self.toc = self.toc + indent(self.lastHeader-self.topHeader+1) + "\n"
self.lastHeader = self.lastHeader + 1
self.lastHeaderNumber.append(0)
while num < self.lastHeader:
self.lastHeader = self.lastHeader - 1
self.toc = self.toc + indent(self.lastHeader-self.topHeader+1) + "
\n"
del self.lastHeaderNumber[-1]
self.lastHeaderNumber[-1] = self.lastHeaderNumber[-1] + 1
if numbering:
id = self.getHeaderAnchor() + ". "
else:
id = ""
# HTML names must begin with a letter
self.toc = self.toc + indent(num-self.topHeader+1) + "" + id + stripATags(self.currentHeaderText) + "
\n"
self.rest = self.rest + "" + id + "" + self.currentHeaderText + "\n"
self.lastHeader = num
else:
self.currentHeaderText = self.currentHeaderText + "" + tag + ">"
elif self.state == self.sREST:
self.rest = self.rest + "" + tag + ">"
else:
self.fail("bad state " + self.state)
def handle_data(self, text):
self.debug("handle_data(\"" + text.replace("\n", "\\n") + "\")")
if self.state == self.sPRELUDE:
self.prelude = self.prelude + text
elif self.state == self.sTOC:
pass
elif self.state == self.sHEADER:
if self.currentHeaderText == "" and numbering:
# remove numbering, e.g. "1.2.3 "
# we write "1.2.3 title
pos = text.find(" ")
if pos == len(text)-1 and pos > 0:
id = text[0:pos]
isId = True
for c in id:
if not c.isdigit() and c != ".":
# not id
isId = False
break
if isId:
# ignore numbering we put in last time
return
self.currentHeaderText = self.currentHeaderText + text
elif self.state == self.sREST:
self.rest = self.rest + text
else:
self.fail("bad state " + self.state)
def handle_comment(self, text):
self.debug("handle_comment(\"" + text + "\")")
if self.state == self.sPRELUDE:
if text == self.TOC:
self.state = self.sTOC
else:
self.prelude = self.prelude + comment(text)
elif self.state == self.sTOC:
if text == self.END_TOC:
self.state = self.sREST
elif self.state == self.sHEADER:
self.currentHeaderText = self.currentHeaderText + comment(text)
elif self.state == self.sREST:
self.rest = self.rest + comment(text)
else:
self.fail("bad state " + self.state)
def handle_charref(self, text):
self.debug("handle_charref(" + text + ")")
self.handle_data(charref(text))
def handle_entityref(self, text):
self.debug("handle_entiryref(" + text + ")")
self.handle_data(entity(text))
def handle_decl(self, text):
self.debug("handle_decl(" + text + ")")
if self.state == self.sPRELUDE:
self.prelude = self.prelude + decl(text)
elif self.state == self.sTOC:
pass
elif self.state == self.sHEADER:
self.fail("SGML declaratin inside header")
elif self.state == self.sREST:
self.rest = self.rest + decl(text)
else:
self.fail("bad state " + self.state)
filename = sys.argv[1]
output = filename + ".out"
parser = TOCParser(output)
data = file(filename).read()
#print "data = "+ data
parser.feed(data)
out = file(output, "w")
out.write(parser.read())
out.close()
backup = filename + ".bak"
try:
os.remove(backup)
except:
pass
os.rename(filename, backup)
os.rename(output, filename)