1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 """
28 While the ligolw module provides classes and parser support for reading and
29 writing LIGO Light Weight XML documents, this module supplements that code
30 with classes and parsers that add intelligence to the in-RAM document
31 representation.
32
33 In particular, the document tree associated with a Table element is
34 enhanced. During parsing, the Stream element in this module converts the
35 character data contained within it into a list of objects. The list
36 contains one object for each row of the table, and the objects' attributes
37 are the names of the table's columns. When the document is written out
38 again, the Stream element serializes the row objects back into character
39 data.
40
41 The Table element exports a list-like interface to the rows. The Column
42 elements also provide list-like access to the values in the corresponding
43 columns of the table.
44 """
45
46
47 import copy
48 import itertools
49 import re
50 import sys
51 from xml.sax.saxutils import escape as xmlescape
52 from xml.sax.xmlreader import AttributesImpl
53
54
55 from glue import git_version
56 from . import ligolw
57 from . import tokenizer
58 from . import types as ligolwtypes
59
60
61 __author__ = "Kipp Cannon <kipp.cannon@ligo.org>"
62 __version__ = "git id %s" % git_version.id
63 __date__ = git_version.date
64
65
66
67
68
69
70
71
72
73
74
def get_table(xmldoc, name):
    """
    Return the one Table element called name found in xmldoc.

    The look-up is delegated to Table.getTablesByName().  ValueError
    is raised unless exactly one matching table is found.

    NOTE:  Table sub-classes that set a .tableName attribute provide a
    .get_table() class method that can be used instead of this
    function.  That is the case for every Table class in the
    glue.ligolw.lsctables module, and the class method is the
    recommended way to retrieve those standard tables rather than
    passing their .tableName attribute to this function.  Both
    techniques are shown below.

    Example:

    >>> import ligolw
    >>> import lsctables
    >>> xmldoc = ligolw.Document()
    >>> xmldoc.appendChild(ligolw.LIGO_LW()).appendChild(lsctables.New(lsctables.SnglInspiralTable))
    []
    >>> # find table with this function
    >>> sngl_inspiral_table = get_table(xmldoc, lsctables.SnglInspiralTable.tableName)
    >>> # find table with .get_table() class method (preferred)
    >>> sngl_inspiral_table = lsctables.SnglInspiralTable.get_table(xmldoc)

    See also the .get_table() class method of the Table class.
    """
    matches = Table.getTablesByName(xmldoc, name)
    if len(matches) == 1:
        return matches[0]
    # zero matches and multiple matches are both errors
    raise ValueError("document must contain exactly one %s table" % Table.TableName(name))
106
def reassign_ids(elem):
    """
    Assign new IDs to the rows of the Table elements below elem and
    repair the cross references between them.

    The work is done in two passes.  First, every Table element below
    elem whose next_id attribute is not None has its
    .updateKeyMapping() method called, which generates new IDs for the
    table's rows and records the old --> new substitutions in a shared
    mapping.  Second, every Table element below elem has its
    .applyKeyMapping() method called so that ID references to the
    modified rows are updated.

    This function is used by ligolw_add to assign new IDs to rows when
    merging documents in order to make sure there are no ID
    collisions.  For that to work the .get_next_id() methods of all
    Table elements must yield unused IDs, otherwise collisions will
    result anyway.  See the .sync_next_id() method of the Table class
    for a way to initialize the .next_id attributes so that collisions
    will not occur.

    Example:

    >>> import ligolw
    >>> import lsctables
    >>> xmldoc = ligolw.Document()
    >>> xmldoc.appendChild(ligolw.LIGO_LW()).appendChild(lsctables.New(lsctables.SnglInspiralTable))
    []
    >>> reassign_ids(xmldoc)
    """
    table_tag = ligolw.Table.tagName
    old_to_new = {}
    # pass 1:  generate new IDs, only for tables with an ID generator
    for table in elem.getElementsByTagName(table_tag):
        if table.next_id is None:
            continue
        table.updateKeyMapping(old_to_new)
    # pass 2:  must follow pass 1 so the mapping is complete before
    # any references are rewritten
    for table in elem.getElementsByTagName(table_tag):
        table.applyKeyMapping(old_to_new)
140
141
142
143
144
145
146
147
148
149
150
151 -class Column(ligolw.Column):
152 """
153 High-level column element that provides list-like access to the
154 values in a column.
155
156 Example:
157
158 >>> from xml.sax.xmlreader import AttributesImpl
159 >>> import sys
160 >>> tbl = Table(AttributesImpl({u"Name": u"test"}))
161 >>> col = tbl.appendChild(Column(AttributesImpl({u"Name": u"test:snr", u"Type": u"real_8"})))
162 >>> tbl.appendChild(TableStream(AttributesImpl({u"Name": u"test"}))) # doctest: +ELLIPSIS
163 <glue.ligolw.table.TableStream object at ...>
164 >>> tbl._update_column_info()
165 >>> col.Name
166 u'snr'
167 >>> col.Type
168 u'real_8'
169 >>> # append 3 rows (with nothing in them)
170 >>> tbl.append(tbl.RowType())
171 >>> tbl.append(tbl.RowType())
172 >>> tbl.append(tbl.RowType())
173 >>> # assign values to the rows, in order, in this column
174 >>> col[:] = [8.0, 10.0, 12.0]
175 >>> col[:]
176 [8.0, 10.0, 12.0]
177 >>> col.asarray()
178 array([ 8., 10., 12.])
179 >>> tbl.write(sys.stdout) # doctest: +NORMALIZE_WHITESPACE
180 <Table Name="test">
181 <Column Type="real_8" Name="test:snr"/>
182 <Stream Name="test">
183 8,
184 10,
185 12
186 </Stream>
187 </Table>
188 >>> col.index(10)
189 1
190 >>> 12 in col
191 True
192 >>> col[0] = 9.
193 >>> col[1] = 9.
194 >>> col[2] = 9.
195 >>> tbl.write(sys.stdout) # doctest: +NORMALIZE_WHITESPACE
196 <Table Name="test">
197 <Column Type="real_8" Name="test:snr"/>
198 <Stream Name="test">
199 9,
200 9,
201 9
202 </Stream>
203 </Table>
204 >>> col.count(9)
205 3
206
207 NOTE: the .Name attribute returns the stripped "Name" attribute of
208 the element, e.g. with the table suffix removed, but when assigning
209 to the .Name attribute the value provided is stored without
210 modification, i.e. there is no attempt to reattach the table's name
211 to the string. The calling code is responsible for doing the
212 correct manipulations. Therefore, the assignment operation below
213
214 >>> col.Name, col.getAttribute("Name")
215 (u'snr', u'test:snr')
216 >>> col.Name = col.Name
217 >>> col.Name, col.getAttribute("Name")
218 (u'snr', u'snr')
219
220 does not preserve the value of the "Name" attribute (though it does
221 preserve the stripped form reported by the .Name property). This
222 asymmetry is necessary because the correct table name string to
223 reattach to the attribute's value cannot always be known, e.g., if
224 the Column object is not part of an XML tree and does not have a
225 parent node.
226 """
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
247 dec_pattern = re.compile(r"(?:\A\w+:|\A)(?P<FullName>(?:(?P<Table>\w+):|\A)(?P<Name>\w+))\Z")
248 enc_pattern = u"%s"
249
250 Name = ligolw.attributeproxy(u"Name", enc = ColumnName.enc, dec = ColumnName)
251
def __len__(self):
    """
    Return the number of values in this column, which is the length
    of the parent node.
    """
    rows = self.parentNode
    return len(rows)
257
def __getitem__(self, i):
    """
    Return this column's value in row i.  If i is a slice, a list of
    the values in the selected rows is returned instead.
    """
    selected = self.parentNode[i]
    if not isinstance(i, slice):
        # a single row
        return getattr(selected, self.Name)
    return [getattr(row, self.Name) for row in selected]
266
def __setitem__(self, i, value):
    """
    Set the value in this column in row i.  i may be a slice, in
    which case value is an iterable providing one item for each of
    the selected rows, in order.

    NOTE:  Unlike normal Python lists, the length of the Column
    cannot be changed as it is tied to the number of rows in the
    Table.  Therefore, if i is a slice, value should be an iterable
    with exactly the correct number of items.  No check is performed
    to ensure that this is true:  if value contains too many items
    the extras will be ignored, and if value contains too few items
    only as many rows will be updated as there are items.
    """
    if isinstance(i, slice):
        rows = self.parentNode[i]
        values = iter(value)
        # the rows and values are paired by hand instead of with
        # itertools.izip(), which does not exist on Python 3.  the
        # values are still consumed lazily, so over-long (even
        # unbounded) iterables are truncated to the number of rows
        for row in rows:
            try:
                val = next(values)
            except StopIteration:
                # too few values:  leave the remaining rows alone
                break
            setattr(row, self.Name, val)
    else:
        setattr(self.parentNode[i], self.Name, value)
285
287 raise NotImplementedError
288
def __iter__(self):
    """
    Generate this column's values, one for each row of the parent
    node, in order.
    """
    for each_row in self.parentNode:
        yield getattr(each_row, self.Name)
296
def count(self, value):
    """
    Count the rows whose value in this column compares equal to
    value.
    """
    matches = 0
    for item in self:
        matches = matches + (item == value)
    return matches
302
def index(self, value):
    """
    Return the index of the first row whose value in this column
    compares equal to value.  Raises ValueError if there is no such
    row.
    """
    for position, item in enumerate(self):
        if not item == value:
            continue
        return position
    raise ValueError(value)
312
def __contains__(self, value):
    """
    Return True if at least one row has value in this column,
    otherwise return False.
    """
    values = iter(self)
    return value in values
319
def asarray(self):
    """
    Return a numpy array holding the values in this column.  The
    array is a copy of the data:  changes made to the array are
    *not* reflected in the original document.

    The array's dtype is looked up from this column's .Type in
    ligolwtypes.ToNumPyType.  TypeError is raised if the table has
    no entry for the type.
    """
    # numpy is imported here, at the point of use, rather than at
    # module scope
    import numpy
    try:
        numpy_type = ligolwtypes.ToNumPyType[self.Type]
    except KeyError as e:
        raise TypeError("cannot determine numpy dtype for Column '%s': %s" % (self.getAttribute("Name"), e))
    else:
        return numpy.fromiter(self, dtype = numpy_type)
335
336 @classmethod
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359 -class InterningRowBuilder(tokenizer.RowBuilder):
360 """
361 This subclass of the tokenizer.RowBuilder class respects the
362 "interning" hints provided by table definitions, and attempts to
363 replace the values of row attributes associated with interned
364 columns with references to shared instances of those values. This
365 results in a reduction in memory use which is small for most
366 documents, but can be substantial when dealing with tables
367 containing large volumes of repeated information.
368
369 Example:
370
371 >>> class Row(object):
372 ... pass
373 ...
374 >>> # 3rd arg is optional list of attributes to intern
375 >>> rows = InterningRowBuilder(Row, ["name", "age"], ("name",))
376 >>> l = list(rows.append(["Dick", 20., "Jane", 75., "Dick", 22.]))
377 >>> l[0].name
378 'Dick'
379 >>> l[2].name
380 'Dick'
381 >>> l[2].name is l[0].name
382 True
383
384 Note that Python naturally interns short strings, so this example
385 would return True regardless; it is intended only to demonstrate
386 the use of the class.
387
388 The values are stored in a dictionary that is shared between all
389 instances of this class, and which survives forever. Nothing is
390 ever naturally "uninterned", so the string dictionary grows without
391 bound as more documents are processed. This can be a problem in
392 some use cases, and the work-around is to run
393
394 >>> InterningRowBuilder.strings.clear()
395
396 to reset the dictionary at appropriate points in the application.
397 Typically this would be done immediately after each document is
398 loaded.
399 """
400 strings = {}
409
410
411
412
413
414
415
416 -class TableStream(ligolw.Stream):
417 """
418 High-level Stream element for use inside Tables. This element
419 knows how to parse the delimited character stream into row objects
420 that it appends into the list-like parent element, and knows how to
421 turn the parent's rows back into a character stream.
422 """
423
424
425
426
427 RowBuilder = tokenizer.RowBuilder
428
429 - def config(self, parentNode):
446
448
449
450 appendfunc = self.parentNode.append
451 for row in self._rowbuilder.append(self._tokenizer.append(content)):
452 appendfunc(row)
453
455
456
457
458
459
460
461
462 if not self._tokenizer.data.isspace():
463 self.appendData(self.Delimiter)
464
465 del self._tokenizer
466 del self._rowbuilder
467
468 self.parentNode._end_of_rows()
469
def write(self, fileobj = sys.stdout, indent = u""):
    """
    Write this element to fileobj:  the start tag, the rows of the
    parent node serialized as delimited text, then the end tag.

    fileobj is any object providing a .write() method.  indent is
    the string passed to .start_tag() and .end_tag(), and it is also
    used, followed by ligolw.Indent, to prefix each line of row
    text.
    """
    # bind the .write() method once so that the attribute look-up is
    # not repeated inside the loop
    w = fileobj.write

    # the delimiter is written *between* lines of text, never after
    # the last one, unless the final token is empty (see below)
    w(self.start_tag(indent))
    rowdumper = tokenizer.RowDumper(self.parentNode.columnnames, [ligolwtypes.FormatFunc[coltype] for coltype in self.parentNode.columntypes], self.Delimiter)
    rowdumper.dump(self.parentNode)
    try:
        line = next(rowdumper)
    except StopIteration:
        # nothing to write (presumably the table has no rows)
        pass
    else:
        # the first line is preceded by a line break and the
        # indentation only
        newline = u"\n" + indent + ligolw.Indent
        w(newline)
        # xmlescape() is xml.sax.saxutils.escape():  it replaces
        # the characters that are special in XML so that the text
        # can be embedded in the document safely
        w(xmlescape(line))
        # every subsequent line is preceded by the delimiter as well
        newline = rowdumper.delimiter + newline
        for line in rowdumper:
            w(newline)
            w(xmlescape(line))
        if rowdumper.tokens and rowdumper.tokens[-1] == u"":
            # the last token is an empty string:  write one more
            # delimiter, presumably so that the empty token is not
            # lost when the stream is parsed again
            w(rowdumper.delimiter)
    w(u"\n" + self.end_tag(indent) + u"\n")
507
508
509
510
511
512
513
514
515
516
517
518 -class Table(ligolw.Table, list):
519 """
520 High-level Table element that knows about its columns and rows.
521 """
523 dec_pattern = re.compile(r"(?:\A[a-z0-9_]+:|\A)(?P<Name>[a-z0-9_]+):table\Z")
524 enc_pattern = u"%s:table"
525
526 Name = ligolw.attributeproxy(u"Name", enc = TableName.enc, dec = TableName)
527
528 validcolumns = None
529 loadcolumns = None
530 interncolumns = None
531 constraints = None
532 how_to_index = None
533 next_id = None
534
536 """
537 Helpful parent class for row objects. Also used as the
538 default row class by Table instances. Provides an
539 __init__() method that accepts keyword arguments from which
540 the object's attributes are initialized.
541
542 Example:
543
544 >>> x = Table.RowType(a = 0.0, b = "test", c = True)
545 >>> x.a
546 0.0
547 >>> x.b
548 'test'
549 >>> x.c
550 True
551
552 Also provides .__getstate__() and .__setstate__() methods
553 to allow row objects to be pickled (otherwise, because they
554 all use __slots__ to reduce their memory footprint, they
555 aren't pickleable).
556 """
558 for key, value in kwargs.items():
559 setattr(self, key, value)
560
562 return dict((key, getattr(self, key)) for key in self.__slots__ if hasattr(self, key))
563
566
567
def __init__(self, *args):
    """
    Initialize the Table.  All positional arguments are forwarded to
    the parent class initializer, and the column name, column type
    and column Python type lists all start out empty.
    """
    super(Table, self).__init__(*args)
    self.columnnames, self.columntypes, self.columnpytypes = [], [], []
576
577
578
579
580
581
582
583 @classmethod
590
@classmethod
def get_table(cls, xmldoc):
    """
    Find and return the table in xmldoc whose name is given by this
    class' .tableName attribute.  This is the module-level function
    get_table() with the name filled in from the class.

    The Table parent class does not provide a .tableName attribute,
    but sub-classes, especially those in lsctables.py, do provide a
    value for that attribute, and for them this class method is a
    cleaner way to retrieve the table.

    Example:

    >>> import ligolw
    >>> import lsctables
    >>> xmldoc = ligolw.Document()
    >>> xmldoc.appendChild(ligolw.LIGO_LW()).appendChild(lsctables.New(lsctables.SnglInspiralTable))
    []
    >>> sngl_inspiral_table = lsctables.SnglInspiralTable.get_table(xmldoc)
    """
    table_name = cls.tableName
    return get_table(xmldoc, table_name)
612
614 """
615 Construct and return a new Table document subtree whose
616 structure is the same as this table, that is it has the
617 same columns etc. The rows are not copied. Note that a
618 fair amount of metadata is shared between the original and
619 new tables. In particular, a copy of the Table object
620 itself is created (but with no rows), and copies of the
621 child nodes are created. All other object references are
622 shared between the two instances, such as the RowType
623 attribute on the Table object.
624 """
625 new = copy.copy(self)
626 new.childNodes = []
627 for elem in self.childNodes:
628 new.appendChild(copy.copy(elem))
629 del new[:]
630 new._end_of_columns()
631 new._end_of_rows()
632 return new
633
634
@classmethod
def CheckElement(cls, elem):
    """
    Return True if elem is a Table element whose Name attribute
    matches the .tableName attribute of this class;  return False
    otherwise.  The test is performed by .CheckProperties() on the
    element's tag name and attributes.
    """
    tagname, attrs = elem.tagName, elem.attributes
    return cls.CheckProperties(tagname, attrs)
643
644
@classmethod
def CheckProperties(cls, tagname, attrs):
    """
    Test an XML tag name and set of element attributes against this
    class.  Returns True if tagname is this class' tag name and the
    "Name" attribute in attrs, once stripped with .TableName(),
    equals the .tableName attribute of this class;  returns False
    otherwise.

    The Table parent class does not provide a .tableName attribute,
    but sub-classes, especially those in lsctables.py, do provide a
    value for that attribute.  See also .CheckElement().

    Example:

    >>> import lsctables
    >>> lsctables.ProcessTable.CheckProperties(u"Table", {u"Name": u"process:table"})
    True
    """
    if not tagname == cls.tagName:
        # wrong kind of element:  the attributes are not examined
        return False
    return cls.TableName(attrs[u"Name"]) == cls.tableName
663
664
665
666
667
668
def getColumnByName(self, name):
    """
    Return the Column child element called name.  The names are
    compared in their stripped forms.  KeyError is raised if this
    table has no column by that name.

    Example:

    >>> import lsctables
    >>> tbl = lsctables.New(lsctables.SnglInspiralTable)
    >>> col = tbl.getColumnByName("mass1")
    """
    try:
        [column] = Column.getColumnsByName(self, name)
    except ValueError:
        # the unpacking fails unless there is exactly one match
        raise KeyError(name)
    else:
        return column
687
688
def appendColumn(self, name):
    """
    Add a new Column element called "name" to the table and return
    it.  ValueError is raised if the table already has a column by
    that name, and KeyError is raised if the validcolumns attribute
    of this table has no entry for a column by that name.

    The name string is assumed to be "pre-stripped", that is it is
    the significant portion of the element's Name attribute.  The
    Column element's Name attribute is constructed by pre-pending
    the stripped Table element's name and a colon.

    If the table has Stream children, the new Column is inserted
    before the first of them, otherwise it is appended to the table.

    Example:

    >>> import lsctables
    >>> process_table = lsctables.New(lsctables.ProcessTable, [])
    >>> col = process_table.appendColumn("program")
    >>> col.getAttribute("Name")
    u'process:program'
    >>> col.Name
    u'program'
    """
    try:
        self.getColumnByName(name)
    except KeyError:
        # no column by that name yet, OK to proceed
        pass
    else:
        raise ValueError("duplicate Column '%s'" % name)
    attrs = AttributesImpl({u"Name": "%s:%s" % (self.Name, name), u"Type": self.validcolumns[name]})
    column = Column(attrs)
    streams = self.getElementsByTagName(ligolw.Stream.tagName)
    if not streams:
        self.appendChild(column)
    else:
        self.insertBefore(column, streams[0])
    return column
726
727
728
729
730
731
733 """
734 Create and append a new row to this table, then return it
735
736 All positional and keyword arguments are passed to the RowType
737 constructor for this table.
738 """
739 row = self.RowType(*args, **kwargs)
740 self.append(row)
741 return row
742
743
744
745
746
747
771
785
def _end_of_columns(self):
    """
    Hook invoked during parsing once the last Column child element
    has been added.  The default implementation does nothing;
    subclasses can override this to perform any special action that
    should follow the addition of the last Column element.
    """
    return None
794
def _end_of_rows(self):
    """
    Hook invoked during parsing once the last row has been added.
    The default implementation does nothing;  subclasses can
    override this to perform any special action that should follow
    the addition of the last row.
    """
    return None
803
813
def unlink(self):
    """
    Promote garbage collection by breaking the internal references
    within the document tree rooted on this element:  the parent
    class' .unlink() is invoked and then all rows are removed from
    the table.
    """
    parent = super(Table, self)
    parent.unlink()
    del self[:]
821
829
830
831
832
833
@classmethod
def get_next_id(cls):
    """
    Returns the current value of the next_id class attribute, and
    increments the next_id class attribute by 1.  Raises ValueError
    if the table does not have an ID generator associated with it,
    that is if the next_id class attribute is None.
    """
    next_id = cls.next_id
    if next_id is None:
        # without this check the increment below fails with a
        # TypeError instead of the documented ValueError
        raise ValueError("%s has no ID generator associated with it" % cls.__name__)
    cls.next_id += 1
    return next_id
846
@classmethod
def set_next_id(cls, id):
    """
    Store id in the next_id class attribute.  Going through this
    class method helps prevent accidentally assigning the value to
    an instance attribute instead of the class attribute.
    """
    setattr(cls, "next_id", id)
856
@classmethod
def reset_next_id(cls):
    """
    Reset the next_id class attribute to 0, using the type of its
    current value to construct the new one.  If the attribute is
    None it is left unmodified.

    Example:

    >>> import lsctables
    >>> for cls in lsctables.TableByName.values(): cls.reset_next_id()
    """
    current = cls.next_id
    if current is None:
        return
    cls.set_next_id(type(current)(0))
870
872 """
873 Determines the highest-numbered ID in this table, and sets
874 the table's .next_id attribute to the next highest ID in
875 sequence. If the .next_id attribute is already set to a
876 value greater than the highest value found, then it is left
877 unmodified. The return value is the ID identified by this
878 method. If the table's .next_id attribute is None, then
879 this function is a no-op.
880
881 Note that tables of the same name typically share a common
882 .next_id attribute (it is a class attribute, not an
883 attribute of each instance) so that IDs can be generated
884 that are unique across all tables in the document. Running
885 sync_next_id() on all the tables in a document that are of
886 the same type will have the effect of setting the ID to the
887 next ID higher than any ID in any of those tables.
888
889 Example:
890
891 >>> import lsctables
892 >>> tbl = lsctables.New(lsctables.ProcessTable)
893 >>> print(tbl.sync_next_id())
894 process:process_id:0
895 """
896 if self.next_id is not None:
897 if len(self):
898 n = max(self.getColumnByName(self.next_id.column_name)) + 1
899 else:
900 n = type(self.next_id)(0)
901 if n > self.next_id:
902 self.set_next_id(n)
903 return self.next_id
904
def updateKeyMapping(self, mapping):
    """
    First half of the row key reassignment algorithm.  mapping is a
    dictionary of old key --> new key substitutions.  Each row's ID
    is replaced with the new key already recorded for it in mapping
    or, if there is none, with a new ID obtained from
    .get_next_id(), which is then recorded in mapping.  The mapping
    is returned.

    The ID column is the one named by the table's next_id attribute;
    if the table has no such column the mapping is returned
    unmodified.  Raises ValueError if the table's next_id attribute
    is None, or if a row's ID is None.
    """
    if self.next_id is None:
        raise ValueError(self)
    try:
        id_column = self.getColumnByName(self.next_id.column_name)
    except KeyError:
        # no ID column in this table, nothing to do
        return mapping
    for row_number, old_id in enumerate(id_column):
        if old_id is None:
            raise ValueError("null row ID encountered in Table '%s', row %d" % (self.getAttribute("Name"), row_number))
        already_mapped = old_id in mapping
        new_id = mapping[old_id] if already_mapped else self.get_next_id()
        id_column[row_number] = new_id
        if not already_mapped:
            mapping[old_id] = new_id
    return mapping
929
def applyKeyMapping(self, mapping):
    """
    Second half of the key reassignment algorithm.  Every column
    whose type is one of ligolwtypes.IDTypes, other than the column
    named by the table's own next_id attribute (if it has one), is
    scanned row by row, and each value that is a key in mapping is
    replaced with the corresponding new value.  Values with no entry
    in mapping are left as they are.
    """
    for coltype, colname in zip(self.columntypes, self.columnnames):
        if coltype not in ligolwtypes.IDTypes:
            continue
        if not (self.next_id is None or colname != self.next_id.column_name):
            # this is the table's own ID column, which is not
            # rewritten here
            continue
        id_column = self.getColumnByName(colname)
        for row_number, old_id in enumerate(id_column):
            try:
                id_column[row_number] = mapping[old_id]
            except KeyError:
                # no substitution recorded for this value
                pass
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
def use_in(ContentHandler):
    """
    Install the Table, Column, and Stream classes defined in this
    module into ContentHandler, a sub-class of
    glue.ligolw.LIGOLWContentHandler, so that they are used when
    parsing XML documents.  The class is modified in place, and is
    also returned.

    Example:

    >>> from glue.ligolw import ligolw
    >>> class LIGOLWContentHandler(ligolw.LIGOLWContentHandler):
    ...	pass
    ...
    >>> use_in(LIGOLWContentHandler)
    <class 'glue.ligolw.table.LIGOLWContentHandler'>
    """
    def startColumn(self, parent, attrs):
        return Column(attrs)

    # the handler being replaced is captured as a default argument so
    # that Stream elements that are not children of a Table can still
    # be passed to it
    def startStream(self, parent, attrs, __orig_startStream = ContentHandler.startStream):
        if parent.tagName != ligolw.Table.tagName:
            return __orig_startStream(self, parent, attrs)
        # a Stream inside a Table:  there are no more columns to come
        parent._end_of_columns()
        return TableStream(attrs).config(parent)

    def startTable(self, parent, attrs):
        return Table(attrs)

    for handler in (startColumn, startStream, startTable):
        setattr(ContentHandler, handler.__name__, handler)

    return ContentHandler
993