"""REX/Python

Based on Robert D. Cameron's REX/Perl 1.0 with the following changes:
  * Added named matching groups for each piece

Original copyright notice follows:

REX/Perl 1.0

Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
Technical Report TR 1998-17, School of Computing Science, Simon Fraser 
University, November, 1998.
Copyright (c) 1998, Robert D. Cameron. 
The following code may be freely used and distributed provided that
this copyright and citation notice remains intact and that modifications
or additions are clearly identified.

"""

import re

# Building blocks of the shallow-parsing expression.  Every piece is wrapped
# in a named group and larger pieces are assembled by string concatenation,
# so a piece that is reused (Name, S, UntilHyphen, ...) contributes the same
# group name more than once to the assembled expression.
TextSE = "(?P<TextSE>[^<]+)"
UntilHyphen = "(?P<UntilHyphen>[^-]*-)"
Until2Hyphens = "(?P<Until2Hyphens>" + UntilHyphen + "(?:[^-]" + UntilHyphen + ")*-)"
CommentCE = "(?P<CommentCE>" + Until2Hyphens + ">?)"
UntilRSBs = "(?P<UntilRSBs>[^\\]]*](?:[^\\]]+])*]+)"
CDATA_CE = "(?P<CDATA_CE>" + UntilRSBs + "(?:[^\\]>]" + UntilRSBs + ")*>)"
S = "(?P<S>[ \\n\\t\\r]+)"
NameStrt = "(?P<NameStrt>[A-Za-z_:]|[^\\x00-\\x7F])"
NameChar = "(?P<NameChar>[A-Za-z0-9_:.-]|[^\\x00-\\x7F])"
Name = "(?P<Name>(?:" + NameStrt + ")(?:" + NameChar + ")*)"
QuoteSE = "(?P<QuoteSE>\"[^\"]*\"|'[^']*')"
DT_IdentSE = "(?P<DT_IdentSE>" + S + Name + "(?:" + S + "(?:" + Name + "|" + QuoteSE + "))*)"
MarkupDeclCE = "(?P<MarkupDeclCE>(?:[^\\]\"'><]+|" + QuoteSE + ")*>)"
S1 = "(?P<S1>[\\n\\r\\t ])"
UntilQMs = "(?P<UntilQMs>[^?]*\\?+)"
PI_Tail = "(?P<PI_Tail>\\?>|" + S1 + UntilQMs + "(?:[^>?]" + UntilQMs + ")*>)"
DT_ItemSE = "(?P<DT_ItemSE><(?:!(?:--" + Until2Hyphens + ">|[^-]" + MarkupDeclCE + ")|\\?" + Name + "(?:" + PI_Tail + "))|%" + Name + ";|" + S + ")"
DocTypeCE = "(?P<DocTypeCE>" + DT_IdentSE + "(?:" + S + ")?(?:\\[(?:" + DT_ItemSE + ")*](?:" + S + ")?)?>?)"
DeclCE = "(?P<DeclCE>--(?:" + CommentCE + ")?|\\[CDATA\\[(?:" + CDATA_CE + ")?|DOCTYPE(?:" + DocTypeCE + ")?)"
PI_CE = "(?P<PI_CE>" + Name + "(?:" + PI_Tail + ")?)"
EndTagCE = "(?P<EndTagCE>" + Name + "(?:" + S + ")?>?)"
AttValSE = "(?P<AttValSE>\"[^<\"]*\"|'[^<']*')"
ElemTagCE = "(?P<ElemTagCE>" + Name + "(?:" + S + Name + "(?:" + S + ")?=(?:" + S + ")?(?:" + AttValSE + "))*(?:" + S + ")?/?>?)"
MarkupSPE = "(?P<MarkupSPE><(?:!(?:" + DeclCE + ")?|\\?(?:" + PI_CE + ")?|/(?:" + EndTagCE + ")?|(?:" + ElemTagCE + ")?))"


def _uniquify_group_names(pattern):
	"""Return pattern with every repeated group name made unique.

	The first occurrence of a name is kept unchanged; the n-th occurrence
	(n >= 2) is renamed to "<name>_<n>".  None of the piece names above
	ends in "_<digits>", so the generated names cannot clash with them.
	What the expression matches is unaffected; only group names change.
	"""
	seen = {}

	def rename(match):
		name = match.group(1)
		seen[name] = seen.get(name, 0) + 1
		if seen[name] == 1:
			return match.group(0)
		return "(?P<%s_%d>" % (name, seen[name])

	return re.sub(r"\(\?P<([A-Za-z_][A-Za-z0-9_]*)>", rename, pattern)


# re.compile() rejects a pattern that defines the same group name twice
# ("redefinition of group name"), so the assembled expression is
# de-duplicated before it is published and compiled.
XML_SPE = _uniquify_group_names(TextSE + "|" + MarkupSPE)

XML_SPE_ = re.compile(XML_SPE)

def shallow_parse(XML_document):
	"""Split XML_document into its shallow-parse tokens.

	Returns a list of strings, one per match of the XML_SPE expression
	(each match is either a text run or a markup item), in document order.
	"""
	# re.findall() returns tuples of group captures when the pattern contains
	# groups, which this one does; group(0) is the complete matched token.
	return [m.group(0) for m in XML_SPE_.finditer(XML_document)]

def descriptive_parse(XML_document):
	"""Describe every shallow-parse token of XML_document by its named groups.

	Returns a list with one entry per match of XML_SPE_, in document order.
	Each entry is a list of (group index, group name, matched text) tuples
	covering only the named groups that took part in that match.  The
	tuples within an entry are not sorted.
	"""
	# The name -> index mapping belongs to the pattern, not to a match,
	# so it is looked up once.
	group_index = XML_SPE_.groupindex
	result = []
	for m in XML_SPE_.finditer(XML_document):
		# list.append() accepts exactly one argument, so each triple is
		# stored as a tuple.
		entry = [
			(group_index[name], name, value)
			for name, value in m.groupdict().items()
			if value is not None
		]
		result.append(entry)
	return result

# Sample documents for manual testing.

# Nested elements with text content only.
_test0 = '<MYTAG>This is <I>shallow parsing</I>.</MYTAG>'

# Attributes, an entity reference and nested markup.
_test1 = '<tag1 att="123" att2="456">my &first; <i>shallow parse</i></tag1>'

# A larger document with an XML declaration, a DOCTYPE, a comment and a
# processing instruction.  The first </TITLE has no closing '>'.
# Backslashes are written as "\\" so the literal contains no invalid
# escape sequences; the resulting text holds single backslashes.
_test2 = """\
<?xml version="1.0"?>
<!DOCTYPE PLAY SYSTEM "D:\\DOM_tut\\Samples\\play.dtd">
<PLAY type='special' value='12345'>
	<TITLE CODE="1" PATH="c:\\abc">The &amp; Comedy of Errors</TITLE
	<FM>
		<P>FM Text.</P>
		<!-- my comment -->
		<?mypi mypi value?>
	</FM>
	<PERSONAE>
		<TITLE>Dramatis Personae</TITLE>
		<PERSONA>SOLINUS, Duke of Ephesus.</PERSONA>
	
		<PGROUP>
		<PERSONA>ANTIPHOLUS OF EPHESUS</PERSONA>
		<GRPDESCR>twin brothers, and sons to Aegeon and Aemilia.</GRPDESCR>
		</PGROUP>
	</PERSONAE>
	<SCNDESCR>SCENE  Ephesus.</SCNDESCR>
	<PLAYSUBT>THE COMEDY OF ERRORS</PLAYSUBT>
	<ACT><TITLE>ACT I</TITLE>
		<SCENE>
			<TITLE>SCENE I.  A hall in DUKE SOLINUS'S palace.</TITLE>
			<STAGEDIR>Enter DUKE</STAGEDIR>
			<SPEECH>
				<SPEAKER>AEGEON</SPEAKER>
					<LINE>Proceed, Solinus, to procure my fall</LINE>
			</SPEECH>
		</SCENE>
	</ACT>
</PLAY>
"""

def _test():
	"""Print the named groups of every token found in _test1.

	For each token, one "name: repr(value)" line is printed per group that
	took part in the match, followed by an empty line.
	"""
	for entry in descriptive_parse(_test1):
		# Sorting on group index (just to make it easier to read).
		for index, name, value in sorted(entry):
			# A parenthesised single-argument print works both as the
			# Python 2 statement and as the Python 3 function.
			print("%10s: %s" % (name, repr(value)))
		print("")

# Run the demonstration only when the module is executed as a script.
if __name__ == "__main__":
	_test()
