# (original) (raw)

#! /usr/bin/python
"""A parser for RDF/XML built on the xmllib XML parser.

The parser walks an RDF/XML document and reports every statement it finds to
a "sink" object (for example ``notation3.ToN3``) by calling
``sink.makeStatement((context, predicate, subject, object))``, where each of
the four items is a ``(RESOURCE | LITERAL, value)`` pair.

To do:
  - Passing on namespace bindings!
  - Add parsing of RDF bags

Parses DAML_ONT_NS or DPO_NS lists, generates DPO_NS.

This module targets Python 2: xmllib and urlparse do not exist in Python 3.
"""

import string
import sys        # stdin in load(), diagnostics in test()
import urllib     # Opening resources in load()
import urlparse   # Comes with python 1.6, lacks file<->url mapping
import xmllib     # Comes with python 1.5 and greater

import notation3  # http://www.w3.org/2000/10/swap/notation3.py

# States of the parser (one per open element, see RDFXMLParser._stack):
STATE_NOT_RDF = "not RDF"                          # Before <rdf:RDF>
STATE_NO_SUBJECT = "no context"                    # @@@@@@@@@ use numbers for speed
STATE_DESCRIPTION = "Description (have subject)"
STATE_LITERAL = "within literal"
STATE_VALUE = "plain value"
STATE_NOVALUE = "no value"
STATE_LIST = "within list"

RESOURCE = notation3.RESOURCE
LITERAL = notation3.LITERAL

RDF_NS_URI = notation3.RDF_NS_URI  # As per the spec
RDF_Specification = "http://www.w3.org/TR/REC-rdf-syntax/"  # Must come in useful :-)
DAML_ONT_NS = "http://www.daml.org/2000/10/daml-ont#"  # DAML early version
DPO_NS = "http://www.daml.org/2001/03/daml+oil#"       # DAML plus oil

# Namespace tested for parseType="<prefix>:quote".  The original code used
# this name without ever defining it.
# NOTE(review): presumably notation3 exports Logic_NS; the fallback is the
# SWAP "log" namespace -- confirm against notation3.py.
Logic_NS = getattr(notation3, "Logic_NS", "http://www.w3.org/2000/10/swap/log#")

chatty = 0  # Set non-zero to trace every start tag on stdout

RDF_IS = RESOURCE, RDF_NS_URI + "is"  # Used with quoting


class RDFParseError(Exception):
    """Base class for every error raised by RDFXMLParser."""


class RDFSyntaxError(RDFParseError):
    """The document is not acceptable RDF/XML for this parser."""


class RDFUnimplementedError(RDFParseError):
    """The document uses an RDF feature this parser does not handle."""


class RDFInternalError(RDFParseError):
    """The parser reached a state that should be impossible."""


class RDFXMLParser(xmllib.XMLParser):
    """Event-driven RDF/XML parser feeding statements to a sink.

    No element handlers are registered with xmllib, so every tag arrives in
    unknown_starttag() / unknown_endtag().  Those two methods drive a small
    state machine (the STATE_* constants); the state, context, predicate and
    subject in force when an element opens are pushed on self._stack and
    restored when it closes.
    """

    def __init__(self, sink, thisURI, **kw):
        """Create a parser.

        sink    -- object receiving startDoc(), makeComment(), bind(),
                   makeStatement() and endDoc() calls.
        thisURI -- URI of the document being parsed; relative references and
                   the default context are resolved against it.
        kw      -- passed through unchanged to xmllib.XMLParser.
        """
        self.testdata = ""  # Set first, as in the original, before the base initializer runs
        xmllib.XMLParser.__init__(self, **kw)
        self._stack = []                # Stack of [state, context, predicate, subject]
        self.sink = sink
        self._thisURI = thisURI
        self._state = STATE_NOT_RDF     # Maybe should ignore RDF outside <rdf:RDF> ??
        self._context = thisURI + "#_formula"  # Context of current statements, change in bags
        self._formula = RESOURCE, self._context
        self._subject = None
        self._predicate = None
        self._genPrefix = "#_g"         # @@@ allow parameter override
        self._nextId = 0                # For generation of arbitrary names for anonymous nodes
        self.sink.startDoc()
        version = "$Id: xml2rdf.py,v 1.20 2002/12/25 20:01:32 timbl Exp $"
        self.sink.makeComment("RDF parsed by " + version[1:-1])

    def load(self, uri, _baseURI=""):
        """Read and parse a whole document, then close the parser.

        uri      -- resource to fetch, resolved against _baseURI; a false
                    value means "read standard input".
        _baseURI -- base for resolving a relative uri.

        The whole document is buffered in memory before it is fed in.
        """
        if uri:
            _inputURI = urlparse.urljoin(_baseURI, uri)  # Make abs from relative
            netStream = urllib.urlopen(_inputURI)
            try:
                data = netStream.read()  # @ May be big - buffered in memory!
            finally:
                netStream.close()        # Do not leak the connection
        else:
            data = sys.stdin.read()      # May be big - buffered in memory!
        self.feed(data)
        self.close()

    def handle_xml(self, encoding, standalone):
        """Handle the <?xml ...?> declaration: only drops pending text."""
        self.flush()

    def handle_doctype(self, tag, pubid, syslit, data):
        """Handle a <!DOCTYPE ...> declaration: only drops pending text."""
        self.flush()

    def handle_data(self, data):
        """Accumulate character data while a literal property value is expected."""
        if self._state == STATE_VALUE:
            self.testdata = self.testdata + data

    def flush(self):
        """Discard any accumulated character data."""
        if self.testdata:
            self.testdata = ""

    def handle_cdata(self, data):
        """Report a CDATA section on stdout; it is not added to any value."""
        self.flush()
        print('cdata: ' + repr(data))

    def handle_proc(self, name, data):
        """Report a processing instruction on stdout."""
        self.flush()
        print('processing: %s %s' % (name, repr(data)))

    def handle_comment(self, data):
        """Pass an XML comment through to the sink."""
        self.flush()
        self.sink.makeComment(data)

    def syntax_error(self, message):
        """Report an XML-level syntax error from xmllib on stdout and carry on."""
        print('error at line %d: %s' % (self.lineno, message))

    def tag2uri(self, str):
        """Generate URI from tagname.

        Namespaced names arrive as two parts separated by a space; the first
        space is removed so the parts join into one URI.  A name without a
        space is returned unchanged.  (The parameter keeps its original name,
        which shadows the builtin, so keyword callers still work.)
        """
        x = string.find(str, " ")
        if x < 0:
            return str
        return str[:x] + str[x + 1:]

    def uriref(self, str):
        """Generate uri from uriref in this document."""
        return urlparse.urljoin(self._thisURI, str)

    def _namespaceBindings(self):
        """Return every (prefix, namespaceURI) pair currently known to xmllib.

        HACK: this reaches into xmllib's private namespace table and its
        element stack -- Yuk@@.
        """
        nslist = list(self._XMLParser__namespaces.items())
        for t, d, nst in self.stack:
            nslist = nslist + list(d.items())
        return nslist

    def idAboutAttr(self, attrs):  # 6.5 also propAttr 6.10
        """Set up subject and maybe context from attributes.

        Resets the subject, moves to STATE_DESCRIPTION, then inspects every
        attribute: ID / about fix the subject, bagid changes the context,
        parseType / value / resource are left for the caller, and anything
        else is a property attribute asserted as a literal-valued statement
        about the subject.  If no subject was given a fresh anonymous node is
        generated.

        Raises RDFSyntaxError for a second subject or a property attribute
        without a namespace, and RDFUnimplementedError for a real
        aboutEachPrefix.
        """
        self._state = STATE_DESCRIPTION
        self._subject = None
        properties = []

        for name, value in attrs.items():
            x = string.find(name, " ")
            if x >= 0:
                ns = name[:x]
                ln = name[x + 1:]  # Strip any namespace on attributes!!! @@@@
                # NOTE(review): this searches for the full "namespace local"
                # name, so it looks like it can practically never match; kept
                # as written rather than guessing the intended test.
                if string.find("ID ambout AboutEachPrefix bagid type", name) > 0:
                    if ns != RDF_NS_URI:
                        # The original applied % to one argument only, which
                        # would itself raise TypeError.
                        print("# Warning -- %s attribute in %s namespace not RDF NS."
                              % (ln, ns))
                        ns = RDF_NS_URI  # @@@@@@@@@@@@@@@@
                uri = ns + ln
            else:
                ln = name
                ns = None   # @@@ Actually, XML spec says we should get these: parser is wrong
                uri = None

            # Opinions vary: sometimes no namespace, but RDF_NS is common :-(
            if ns == RDF_NS_URI or ns is None:
                if ln == "ID":
                    if self._subject:
                        print("# oops - subject already %s" % (self._subject,))
                        raise RDFSyntaxError(">1 subject")
                    self._subject = self.uriref("#" + value)
                elif ln == "about":
                    if self._subject:
                        raise RDFSyntaxError(">1 subject")
                    self._subject = self.uriref(value)
                elif ln == "aboutEachPrefix":
                    if value == " ":  # OK - a trick to make NO subject
                        self._subject = None
                    else:
                        raise RDFUnimplementedError("can't do aboutEachPrefix yet")
                elif ln == "bagid":
                    self._context = self.uriref("#" + value)
                elif ln in ("parseType", "value", "resource"):
                    pass  # later - object-related, handled by the caller
                else:
                    if not ns:
                        print("#@@@@@@@@@@@@ No namespace on property attribute %s" % (ln,))
                        raise RDFSyntaxError(
                            "No namespace on property attribute " + ln)
                    properties.append((uri, value))
            else:  # Property attribute propAttr #6.10
                properties.append((uri, value))

        if self._subject is None:
            self._subject = self._generate()
        for pred, obj in properties:
            self.sink.makeStatement(((RESOURCE, self._context),
                                     (RESOURCE, pred),
                                     (RESOURCE, self._subject),
                                     (LITERAL, obj)))

    def _generate(self):
        """Invent an identifier for an anonymous node and declare it.

        The new name is announced to the sink as existentially quantified
        (N3 "forSome") in the current context, then returned.
        """
        generatedId = self._genPrefix + repr(self._nextId)
        # NOTE(review): in the collapsed source this increment directly follows
        # a bare "#", so it may have been commented out; without it every
        # anonymous node would share one name, so it is kept live here.
        self._nextId = self._nextId + 1
        self.sink.makeStatement(((RESOURCE, self._context),
                                 (RESOURCE, notation3.N3_forSome_URI),
                                 (RESOURCE, self._context),
                                 (RESOURCE, generatedId)))  # Note this is anonymous node
        return generatedId

    def _obj(self, tagURI, attrs):  # 6.2
        """Handle an element standing for a node (obj :: description | container).

        rdf:Description just sets the subject; any other tag is a typed node
        (6.13) and additionally yields an rdf:type statement.  Leaves the
        parser in STATE_DESCRIPTION.

        Raises RDFUnimplementedError for rdf:Bag / rdf:Alt / rdf:Seq.
        """
        if tagURI == RDF_NS_URI + "Description":
            self.idAboutAttr(attrs)  # Set up subject and context
        elif tagURI in (RDF_NS_URI + "Bag",   # 6.4 container :: bag | sequence | alternative
                        RDF_NS_URI + "Alt",
                        RDF_NS_URI + "Seq"):
            raise RDFUnimplementedError("Don't parse bags yet sorry")
        else:  # Unknown tag within STATE_NO_SUBJECT: typedNode #6.13
            c = self._context  # (Might be changed in idAboutAttr)
            self.idAboutAttr(attrs)
            if c is None:
                raise RDFInternalError("typed node with no context")
            if self._subject is None:
                raise RDFInternalError("typed node with no subject")
            self.sink.makeStatement(((RESOURCE, c),
                                     (RESOURCE, RDF_NS_URI + "type"),
                                     (RESOURCE, self._subject),
                                     (RESOURCE, tagURI)))
        self._state = STATE_DESCRIPTION

    def _parseType(self, value, attrs):
        """Act on a parseType attribute of a property element."""
        nslist = self._namespaceBindings()  # Get namespaces (value may be a qname)
        if value == "Literal":
            self._state = STATE_LITERAL  # That's an XML subtree not a string
        elif value == "Resource":
            c = self._context
            s = self._subject
            self.idAboutAttr(attrs)  # set subject and context for nested description
            self.sink.makeStatement(((RESOURCE, c),
                                     (RESOURCE, self._predicate),
                                     (RESOURCE, s),
                                     (RESOURCE, self._subject)))
            self._state = STATE_DESCRIPTION  # Nest description
        elif value[-6:] == ":quote":
            # The original never assigned pref in this branch (NameError).
            # NOTE(review): how much of this body sat inside the namespace
            # test is a reconstruction -- confirm against an unmangled copy.
            pref = value[:-6]
            for p, nsURI in nslist:
                if p == pref and nsURI == Logic_NS:
                    c = self._context
                    s = self._subject
                    self.idAboutAttr(attrs)  # set subject and context for nested description
                    if self._predicate == RDF_NS_URI + "is":  # magic :-(
                        self._subject = s  # Forget anonymous genid - context is subject
                        print("#@@@@@@@@@@@@@ decided subject is  " + repr(s)[-10:-1])
                    else:
                        self.sink.makeStatement(((RESOURCE, c),
                                                 (RESOURCE, self._predicate),
                                                 (RESOURCE, s),
                                                 (RESOURCE, self._subject)))
                    self._context = self._subject
                    self._subject = None
                    self._state = STATE_NO_SUBJECT  # Nest context
                    break  # One matching binding is enough; do not nest twice
        elif value[-11:] == ":collection":  # Is this a daml:collection qname?
            pref = value[:-11]
            for p, nsURI in nslist:
                if p == pref and (nsURI == DAML_ONT_NS or nsURI == DPO_NS):
                    self._state = STATE_LIST  # Linked list of obj's
                    break

    def _startProperty(self, tagURI, attrs):
        """Handle a property element inside a description (propertyElt 6.12)."""
        self._predicate = tagURI
        self._state = STATE_VALUE  # May be looking for value but see parseType
        self.testdata = ""         # Flush value data
        for name, value in attrs.items():
            x = string.find(name, " ")
            if x >= 0:
                name = name[x + 1:]  # Strip any namespace on attributes!!! @@@@
            if name == "ID":
                print("# Warning: ID=%s on statement ignored" % (value,))  # I consider these a bug
            elif name == "parseType":
                self._parseType(value, attrs)
            elif name == "resource":
                self.sink.makeStatement(((RESOURCE, self._context),
                                         (RESOURCE, self._predicate),
                                         (RESOURCE, self._subject),
                                         (RESOURCE, self.uriref(value))))
                self._state = STATE_NOVALUE  # NOT looking for value
            elif name == "value":
                self.sink.makeStatement(((RESOURCE, self._context),
                                         (RESOURCE, self._predicate),
                                         (RESOURCE, self._subject),
                                         (LITERAL, value)))
                self._state = STATE_NOVALUE  # NOT looking for value
            else:
                self.sink.makeComment("# Warning: Ignored attribute %s on %s" % (
                    name, tagURI))

    def unknown_starttag(self, tag, attrs):
        """Handle start tag. We register none so all are unknown.

        Pushes the current [state, context, predicate, subject] and then acts
        according to the state the parser was in when the element opened.

        Raises RDFSyntaxError when an element appears where none is allowed
        and RDFInternalError for a state with no start-tag handling.
        """
        self.flush()
        tagURI = self.tag2uri(tag)

        if chatty:
            if not attrs:
                print('# State = %s start tag: <%s>' % (self._state, tagURI))
            else:
                pieces = ['# state = %s start tag: <%s' % (self._state, tagURI)]
                for name, value in attrs.items():
                    pieces.append(' ' + name + '="' + value + '"')
                pieces.append('>')
                print(' '.join(pieces))

        self._stack.append([self._state, self._context, self._predicate, self._subject])

        if self._state == STATE_NOT_RDF:
            if tagURI == RDF_NS_URI + "RDF":
                self._state = STATE_NO_SUBJECT
                # HACK @@ to grab prefixes
                for prefix, nsURI in self._namespaceBindings():
                    if nsURI:
                        self.sink.bind(prefix, nsURI)
            # else: some random XML, ignored

        elif self._state == STATE_NO_SUBJECT:  # 6.2 obj :: description | container
            self._obj(tagURI, attrs)

        elif self._state == STATE_DESCRIPTION:  # Expect predicate (property) PropertyElt
            self._startProperty(tagURI, attrs)

        elif self._state == STATE_LIST:  # damlCollection :: objs - make list
            # Subject and predicate are set and dangling.
            c = self._context
            s = self._subject  # The tail of the list so far
            p = self._predicate
            pair = self._generate()  # The new pair
            self.sink.makeStatement(((RESOURCE, c),  # Link in new pair
                                     (RESOURCE, p),
                                     (RESOURCE, s),
                                     (RESOURCE, pair)))
            self.idAboutAttr(attrs)  # set subject (the next item) and context
            self.sink.makeStatement(((RESOURCE, c),
                                     (RESOURCE, DPO_NS + "first"),
                                     (RESOURCE, pair),
                                     (RESOURCE, self._subject)))  # new item
            self._stack[-1][2] = DPO_NS + "rest"  # Leave dangling link
            self._stack[-1][3] = pair  # Underlying state tracks tail of growing list

        elif self._state == STATE_VALUE:  # Value :: Obj in this case # 6.17 6.2
            c = self._context
            p = self._predicate
            s = self._subject
            self._obj(tagURI, attrs)  # Parse the object thing's attributes
            self.sink.makeStatement(((RESOURCE, c),  # Link to new object
                                     (RESOURCE, p),
                                     (RESOURCE, s),
                                     (RESOURCE, self._subject)))
            self._stack[-1][0] = STATE_NOVALUE  # When we return, cannot have literal now

        elif self._state == STATE_NOVALUE:
            print("\n@@ Expected no value, found  %s %s \n Stack:  %s"
                  % (tag, attrs, self._stack))
            raise RDFSyntaxError("Found tag %s, expected empty" % (tag,))

        else:
            raise RDFInternalError("Unknown state %r" % (self._state,))

    # aboutEachprefix { <#> forall r . { r startsWith ppp } l:implies ( zzz } )
    # aboutEach { <#> forall r . { ppp rdf:li r } l:implies ( zzz } )

    def unknown_endtag(self, tag):
        """Handle end tag: emit any pending statement, then pop the saved state.

        In STATE_VALUE the accumulated text (carriage returns removed) becomes
        the literal object of the current property.  In STATE_LIST the list
        built so far is terminated with a "rest nil" link.
        """
        if self._state == STATE_VALUE:
            buf = self.testdata.replace("\r", "")  # Strip CRs
            self.sink.makeStatement(((RESOURCE, self._context),
                                     (RESOURCE, self._predicate),
                                     (RESOURCE, self._subject),
                                     (LITERAL, buf)))
            self.testdata = ""
        elif self._state == STATE_LIST:
            self.sink.makeStatement(((RESOURCE, self._context),
                                     (RESOURCE, DPO_NS + "rest"),
                                     (RESOURCE, self._subject),
                                     (RESOURCE, DPO_NS + "nil")))

        # Restore what was in force when the matching start tag was seen
        self._state, self._context, self._predicate, self._subject = self._stack.pop()
        self.flush()

    def unknown_entityref(self, ref):
        """Report an entity reference xmllib could not resolve."""
        self.flush()
        print('#*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Report a character reference xmllib could not resolve."""
        self.flush()
        print('#*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing and tell the sink the document has ended."""
        xmllib.XMLParser.close(self)
        self.flush()
        self.sink.endDoc(self._formula)


def test(args=None):
    """Command-line driver: parse a file and write N3 to stdout.

    args -- argument list (defaults to sys.argv[1:]):
            -s   use the plain xmllib.XMLParser instead of RDFXMLParser
            -t   feed the document in one piece and print the elapsed time
                 (otherwise it is fed one character at a time)
            file name to read, "-" for stdin, default "test.xml"

    Exits with status 1 if the file cannot be opened or parsing raises
    RuntimeError.
    """
    import getopt
    from time import time

    if not args:
        args = sys.argv[1:]

    opts, args = getopt.getopt(args, 'st')
    klass = RDFXMLParser
    do_time = 0
    for o, a in opts:
        if o == '-s':
            klass = xmllib.XMLParser
        elif o == '-t':
            do_time = 1

    if args:
        file = args[0]
    else:
        file = 'test.xml'

    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError:
            msg = sys.exc_info()[1]
            print("%s : %s" % (file, msg))
            sys.exit(1)

    data = f.read()
    if f is not sys.stdin:
        f.close()

    if klass is RDFXMLParser:
        x = klass(notation3.ToN3(sys.stdout.write), "file:/test.rdf")  # test only!
    else:
        # The plain xmllib parser takes no sink or URI; passing them, as the
        # original did, raises TypeError.
        x = klass()

    t0 = time()
    try:
        if do_time:
            x.feed(data)
            x.close()
        else:
            for c in data:
                x.feed(c)
            x.close()
    except RuntimeError:
        msg = sys.exc_info()[1]
        t1 = time()
        print(msg)
        if do_time:
            print('total time: %g' % (t1 - t0))
        sys.exit(1)
    t1 = time()
    if do_time:
        print('total time: %g' % (t1 - t0))


if __name__ == '__main__':
    test()

# References:
#
# How to on xmllib:
# http://www.python.org/doc/howto/xml/node7.html

###################################### SAX pointers
# First hit on Python SAX parser
# http://www.gca.org/papers/xmleurope2000/papers/s28-04.html#N84395

# Howto use SAX in python:
# http://www.python.org/doc/howto/xml/SAX.html