glue.ligolw.table

1 # Copyright (C) 2006--2016 Kipp Cannon 2 # 3 # This program is free software; you can redistribute it and/or modify it 4 # under the terms of the GNU General Public License as published by the 5 # Free Software Foundation; either version 3 of the License, or (at your 6 # option) any later version. 7 # 8 # This program is distributed in the hope that it will be useful, but 9 # WITHOUT ANY WARRANTY; without even the implied warranty of 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 11 # Public License for more details. 12 # 13 # You should have received a copy of the GNU General Public License along 14 # with this program; if not, write to the Free Software Foundation, Inc., 15 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 16 17 18 # 19 # ============================================================================= 20 # 21 # Preamble 22 # 23 # ============================================================================= 24 # 25 26 27 """ 28 While the ligolw module provides classes and parser support for reading and 29 writing LIGO Light Weight XML documents, this module supplements that code 30 with classes and parsers that add intelligence to the in-RAM document 31 representation. 32 33 In particular, the document tree associated with a Table element is 34 enhanced. During parsing, the Stream element in this module converts the 35 character data contained within it into a list of objects. The list 36 contains one object for each row of the table, and the objects' attributes 37 are the names of the table's columns. When the document is written out 38 again, the Stream element serializes the row objects back into character 39 data. 40 41 The Table element exports a list-like interface to the rows. The Column 42 elements also provide list-like access to the values in the corresponding 43 columns of the table. 
44 """ 45 46 47 import copy 48 import itertools 49 import re 50 import sys 51 from xml.sax.saxutils import escape as xmlescape 52 from xml.sax.xmlreader import AttributesImpl 53 54 55 from glue import git_version 56 from . import ligolw 57 from . import tokenizer 58 from . import types as ligolwtypes 59 60 61 __author__ = "Kipp Cannon <kipp.cannon@ligo.org>" 62 __version__ = "git id %s" % git_version.id 63 __date__ = git_version.date

#
# =============================================================================
#
#                                  Utilities
#
# =============================================================================
#


def get_table(xmldoc, name):
    """
    Locate the single Table element called name below xmldoc and
    return it.  ValueError is raised unless exactly one such table is
    present.

    NOTE:  Table sub-classes that set a .tableName attribute offer a
    .get_table() class method that performs the same search without
    the name having to be passed in.  All Table classes in the
    glue.ligolw.lsctables module have that attribute, and for those
    standard tables the class method is the recommended way to
    retrieve them.  Both techniques are shown in the example below.

    Example:

    >>> import ligolw
    >>> import lsctables
    >>> xmldoc = ligolw.Document()
    >>> xmldoc.appendChild(ligolw.LIGO_LW()).appendChild(lsctables.New(lsctables.SnglInspiralTable))
    []
    >>> # find table with this function
    >>> sngl_inspiral_table = get_table(xmldoc, lsctables.SnglInspiralTable.tableName)
    >>> # find table with .get_table() class method (preferred)
    >>> sngl_inspiral_table = lsctables.SnglInspiralTable.get_table(xmldoc)

    See also the .get_table() class method of the Table class.
    """
    matches = Table.getTablesByName(xmldoc, name)
    if len(matches) == 1:
        return matches[0]
    # zero matches and multiple matches are both reported the same way
    raise ValueError("document must contain exactly one %s table" % Table.TableName(name))

106

def reassign_ids(elem):
    """
    Give new IDs to the rows of every Table element below elem whose
    next_id attribute is not None, then repair the cross references.

    The work is done in two passes.  In the first pass each table
    with an ID generator has its .updateKeyMapping() method called,
    which assigns new row IDs and records the old --> new
    substitutions in a shared dictionary.  In the second pass the
    .applyKeyMapping() method of every table (with or without an ID
    generator) is called so that ID columns referring to the modified
    rows are updated to match.

    This function is used by ligolw_add to assign new IDs to rows
    when merging documents so that there are no ID collisions.  That
    only works if the .get_next_id() methods of the Table elements
    yield unused IDs;  see the .sync_next_id() method of the Table
    class for a way to initialize the .next_id attributes so that
    collisions will not occur.

    Example:

    >>> import ligolw
    >>> import lsctables
    >>> xmldoc = ligolw.Document()
    >>> xmldoc.appendChild(ligolw.LIGO_LW()).appendChild(lsctables.New(lsctables.SnglInspiralTable))
    []
    >>> reassign_ids(xmldoc)
    """
    table_tag = ligolw.Table.tagName
    old_to_new = {}

    # pass 1:  hand out new IDs, remembering what each old ID became
    for table_elem in elem.getElementsByTagName(table_tag):
        if table_elem.next_id is None:
            continue
        table_elem.updateKeyMapping(old_to_new)

    # pass 2:  rewrite references to the IDs that were changed
    for table_elem in elem.getElementsByTagName(table_tag):
        table_elem.applyKeyMapping(old_to_new)

140

#
# =============================================================================
#
#                               Column Element
#
# =============================================================================
#


class Column(ligolw.Column):
    """
    High-level column element that provides list-like access to the
    values in a column.

    Example:

    >>> from xml.sax.xmlreader import AttributesImpl
    >>> import sys
    >>> tbl = Table(AttributesImpl({u"Name": u"test"}))
    >>> col = tbl.appendChild(Column(AttributesImpl({u"Name": u"test:snr", u"Type": u"real_8"})))
    >>> tbl.appendChild(TableStream(AttributesImpl({u"Name": u"test"})))	# doctest: +ELLIPSIS
    <glue.ligolw.table.TableStream object at ...>
    >>> tbl._update_column_info()
    >>> col.Name
    u'snr'
    >>> col.Type
    u'real_8'
    >>> # append 3 rows (with nothing in them)
    >>> tbl.append(tbl.RowType())
    >>> tbl.append(tbl.RowType())
    >>> tbl.append(tbl.RowType())
    >>> # assign values to the rows, in order, in this column
    >>> col[:] = [8.0, 10.0, 12.0]
    >>> col[:]
    [8.0, 10.0, 12.0]
    >>> col.asarray()
    array([  8.,  10.,  12.])
    >>> tbl.write(sys.stdout)	# doctest: +NORMALIZE_WHITESPACE
    <Table Name="test">
        <Column Type="real_8" Name="test:snr"/>
        <Stream Name="test">
            8,
            10,
            12
        </Stream>
    </Table>
    >>> col.index(10)
    1
    >>> 12 in col
    True
    >>> col[0] = 9.
    >>> col[1] = 9.
    >>> col[2] = 9.
    >>> tbl.write(sys.stdout)	# doctest: +NORMALIZE_WHITESPACE
    <Table Name="test">
        <Column Type="real_8" Name="test:snr"/>
        <Stream Name="test">
            9,
            9,
            9
        </Stream>
    </Table>
    >>> col.count(9)
    3

    NOTE:  the .Name attribute returns the stripped "Name" attribute of
    the element, e.g. with the table suffix removed, but when assigning
    to the .Name attribute the value provided is stored without
    modification, i.e. there is no attempt to reattach the table's name
    to the string.  The calling code is responsible for doing the
    correct manipulations.  Therefore, the assignment operation below

    >>> col.Name, col.getAttribute("Name")
    (u'snr', u'test:snr')
    >>> col.Name = col.Name
    >>> col.Name, col.getAttribute("Name")
    (u'snr', u'snr')

    does not preserve the value of the "Name" attribute (though it does
    preserve the stripped form reported by the .Name property).  This
    asymmetry is necessary because the correct table name string to
    reattach to the attribute's value cannot always be known, e.g., if
    the Column object is not part of an XML tree and does not have a
    parent node.
    """
    # FIXME: the pattern should be
    #
    # r"(?:\A[a-z0-9_]+:|\A)(?P<FullName>(?:[a-z0-9_]+:|\A)(?P<Name>[a-z0-9_]+))\Z"
    #
    # but people are putting upper case letters in names!!!!!  Someone
    # is going to get the beats.  There is a reason for requiring names
    # to be all lower case:  SQL table and column names are case
    # insensitive, therefore (i) when converting a document to SQL the
    # columns "Rho" and "rho" would become indistinguishable and so it
    # would be impossible to convert a document with columns having
    # names like this into an SQL database;  and (ii) even if that
    # degeneracy is not encountered the case cannot be preserved and so
    # when converting back to XML the correct capitalization is lost.
    # Requiring names to be all lower-case creates the same
    # degeneracies in XML representations that exist in SQL
    # representations ensuring compatibility and defines the correct
    # case to restore the names to when converting to XML.  Other rules
    # can be imagined that would work as well, this is the one that got
    # chosen.
    class ColumnName(ligolw.LLWNameAttr):
        dec_pattern = re.compile(r"(?:\A\w+:|\A)(?P<FullName>(?:(?P<Table>\w+):|\A)(?P<Name>\w+))\Z")
        enc_pattern = u"%s"

    Name = ligolw.attributeproxy(u"Name", enc = ColumnName.enc, dec = ColumnName)

    def __len__(self):
        """
        The number of values in this column.  This is the length of
        the parent node, i.e., the number of rows in the table.
        """
        return len(self.parentNode)

    def __getitem__(self, i):
        """
        Retrieve the value in this column in row i.  If i is a slice
        a list of the values from the selected rows is returned.
        """
        if isinstance(i, slice):
            return [getattr(r, self.Name) for r in self.parentNode[i]]
        else:
            return getattr(self.parentNode[i], self.Name)

    def __setitem__(self, i, value):
        """
        Set the value in this column in row i.  i may be a slice.

        NOTE:  Unlike normal Python lists, the length of the Column
        cannot be changed as it is tied to the number of rows in
        the Table.  Therefore, if i is a slice, value should be an
        iterable with exactly the correct number of items.  No
        check is performed to ensure that this is true:  if value
        contains too many items the extras will be ignored, and if
        value contains too few items only as many rows will be
        updated as there are items.
        """
        if isinstance(i, slice):
            rows = self.parentNode[i]
            values = iter(value)
            # pair rows with values lazily, stopping at whichever
            # runs out first.  this is what itertools.izip() did,
            # but izip() does not exist on Python 3 so the pairing
            # is spelled out to work on both major versions.
            for r in rows:
                try:
                    val = next(values)
                except StopIteration:
                    break
                setattr(r, self.Name, val)
        else:
            setattr(self.parentNode[i], self.Name, value)

    def __delitem__(self, *args):
        # values cannot be removed from a column:  the number of
        # values is fixed by the number of rows in the table
        raise NotImplementedError

    def __iter__(self):
        """
        Return an iterator object for iterating over values in this
        column.
        """
        for row in self.parentNode:
            yield getattr(row, self.Name)

    def count(self, value):
        """
        Return the number of rows with this column equal to value.
        """
        return sum(x == value for x in self)

    def index(self, value):
        """
        Return the smallest index of the row(s) with this column
        equal to value.  Raises ValueError if no row matches.
        """
        for i, x in enumerate(self):
            if x == value:
                return i
        raise ValueError(value)

    def __contains__(self, value):
        """
        Returns True or False if there is or is not, respectively,
        a row containing value in this column.
        """
        return value in iter(self)

    def asarray(self):
        """
        Construct a numpy array from this column.  Note that this
        creates a copy of the data, so modifications made to the
        array will *not* be recorded in the original document.
        Raises TypeError if the column's Type has no entry in the
        ToNumPyType look-up table.
        """
        # most codes don't use this feature, this is the only place
        # numpy is used here, and importing numpy can be
        # time-consuming, so we defer the import until needed.
        import numpy
        try:
            dtype = ligolwtypes.ToNumPyType[self.Type]
        except KeyError as e:
            raise TypeError("cannot determine numpy dtype for Column '%s': %s" % (self.getAttribute("Name"), e))
        return numpy.fromiter(self, dtype = dtype)

    @classmethod
    def getColumnsByName(cls, elem, name):
        """
        Return a list of Column elements named name under elem.
        The comparison is done on the names as decoded by the
        ColumnName class.
        """
        name = cls.ColumnName(name)
        return elem.getElements(lambda e: (e.tagName == cls.tagName) and (e.Name == name))

343

#
# =============================================================================
#
#                               Stream Element
#
# =============================================================================
#


#
# A subclass of tokenizer.RowBuilder that interns strings.
#


class InterningRowBuilder(tokenizer.RowBuilder):
    """
    A tokenizer.RowBuilder that honours the "interning" hints
    provided by table definitions.  For every column listed as
    interned, the value placed on each row is swapped for a reference
    to one shared instance of that value.  The memory saved is small
    for most documents, but can be substantial when dealing with
    tables containing large volumes of repeated information.

    Example:

    >>> class Row(object):
    ...	pass
    ...
    >>> # 3rd arg is optional list of attributes to intern
    >>> rows = InterningRowBuilder(Row, ["name", "age"], ("name",))
    >>> l = list(rows.append(["Dick", 20., "Jane", 75., "Dick", 22.]))
    >>> l[0].name
    'Dick'
    >>> l[2].name
    'Dick'
    >>> l[2].name is l[0].name
    True

    Note that Python naturally interns short strings, so this example
    would return True regardless;  it is intended only to demonstrate
    the use of the class.

    The values are stored in a dictionary that is shared between all
    instances of this class, and which survives forever.  Nothing is
    ever naturally "uninterned", so the string dictionary grows without
    bound as more documents are processed.  This can be a problem in
    some use cases, and the work-around is to run

    >>> InterningRowBuilder.strings.clear()

    to reset the dictionary at appropriate points in the application.
    Typically this would be done immediately after each document is
    loaded.
    """
    # class-level store of shared values:  maps each value to itself
    strings = {}

    def append(self, tokens):
        """
        Generator yielding the row objects the parent class builds
        from tokens, after replacing the values of the interned
        columns with their shared instances.
        """
        intern_columns = self.interns
        # setdefault(v, v) returns the instance of v already in the
        # dictionary, adding v itself if it was not there yet
        shared = self.strings.setdefault
        for new_row in super(InterningRowBuilder, self).append(tokens):
            for attr in intern_columns:
                original = getattr(new_row, attr)
                setattr(new_row, attr, shared(original, original))
            yield new_row

409

#
# Stream class
#


class TableStream(ligolw.Stream):
    """
    High-level Stream element for use inside Tables.  This element
    knows how to parse the delimited character stream into row objects
    that it appends into the list-like parent element, and knows how to
    turn the parent's rows back into a character stream.
    """
    #
    # Select the RowBuilder class to use when parsing tables.
    #

    RowBuilder = tokenizer.RowBuilder

    def config(self, parentNode):
        """
        Prepare the tokenizer and row builder used while parsing.
        This needs information from parentNode (column names and
        types, the row class, the load and intern hints) and so
        cannot be done inside __init__().  Returns self.
        """
        # the columns to load are the table's columns, restricted to
        # the table's loadcolumns if that has been set.
        # FIXME:  convert loadcolumns attributes to sets to avoid
        # the conversion.
        wanted = set(parentNode.columnnames)
        if parentNode.loadcolumns is not None:
            wanted &= set(parentNode.loadcolumns)

        # a type of None is given to the tokenizer for each column
        # that is not being loaded
        self._tokenizer = tokenizer.Tokenizer(self.Delimiter)
        token_types = [
            (pytype if colname in wanted else None)
            for pytype, colname in zip(parentNode.columnpytypes, parentNode.columnnames)
        ]
        self._tokenizer.set_types(token_types)

        # names of the loaded columns, in document order
        loaded_names = [colname for colname in parentNode.columnnames if colname in wanted]
        # FIXME:  convert interncolumns attributes to sets to
        # simplify computing the intersection
        interned_names = [colname for colname in (parentNode.interncolumns or set()) if colname in loaded_names]
        self._rowbuilder = self.RowBuilder(parentNode.RowType, loaded_names, interned_names)
        return self

    def appendData(self, content):
        """
        Tokenize content, pack the tokens into row objects, and
        append the rows to the parent table.
        """
        add_row = self.parentNode.append
        tokens = self._tokenizer.append(content)
        for new_row in self._rowbuilder.append(tokens):
            add_row(new_row)

    def endElement(self):
        """
        Flush the final token, discard the parsing helpers, and call
        the parent's _end_of_rows() hook.
        """
        # stream tokenizer uses delimiter to identify end of each
        # token, so add a final delimiter to induce the last token
        # to get parsed but only if there's something other than
        # whitespace left in the tokenizer's buffer.  the writing
        # code will have put a final delimiter into the stream if
        # the final token was pure whitespace in order to
        # unambiguously indicate that token's presence
        leftover = self._tokenizer.data
        if not leftover.isspace():
            self.appendData(self.Delimiter)
        # now we're done with these
        del self._tokenizer
        del self._rowbuilder
        # call parent's _end_of_rows() hook.
        self.parentNode._end_of_rows()

    def write(self, fileobj = sys.stdout, indent = u""):
        """
        Serialize the parent table's rows to fileobj as the
        character data of this Stream element.  indent is the
        string placed in front of the start and end tags.
        """
        # retrieve the .write() method of the file object to avoid
        # doing the attribute lookup in loops
        emit = fileobj.write
        emit(self.start_tag(indent))

        table = self.parentNode
        formatters = [ligolwtypes.FormatFunc[coltype] for coltype in table.columntypes]
        dumper = tokenizer.RowDumper(table.columnnames, formatters, self.Delimiter)
        dumper.dump(table)

        # loop over parent's rows.  This is complicated because we
        # need to not put a delimiter at the end of the last row
        # unless it ends with a null token
        try:
            first_line = next(dumper)
        except StopIteration:
            # table is empty
            pass
        else:
            # write first row.  the xmlescape() call replaces
            # things like "<" with "&lt;" so that the string will
            # not confuse an XML parser when the file is read.
            # turning "&lt;" back into "<" during file reading is
            # handled by the XML parser, so there is no code in
            # Glue related to that.
            row_prefix = u"\n" + indent + ligolw.Indent
            emit(row_prefix)
            emit(xmlescape(first_line))
            # every following row is preceded by a delimiter
            # closing the row before it
            row_prefix = dumper.delimiter + row_prefix
            for later_line in dumper:
                emit(row_prefix)
                emit(xmlescape(later_line))
            if dumper.tokens and dumper.tokens[-1] == u"":
                # the last token of the last row was null:
                # add a final delimiter to indicate that a
                # token is present
                emit(dumper.delimiter)
        emit(u"\n" + self.end_tag(indent) + u"\n")

507

#
# =============================================================================
#
#                                Table Element
#
# =============================================================================
#


class Table(ligolw.Table, list):
    """
    High-level Table element that knows about its columns and rows.
    The rows are stored in the element itself, which is a list.
    """
    class TableName(ligolw.LLWNameAttr):
        dec_pattern = re.compile(r"(?:\A[a-z0-9_]+:|\A)(?P<Name>[a-z0-9_]+):table\Z")
        enc_pattern = u"%s:table"

    Name = ligolw.attributeproxy(u"Name", enc = TableName.enc, dec = TableName)

    # hints and metadata that sub-classes may override.  None means
    # "not provided".
    validcolumns = None
    loadcolumns = None
    interncolumns = None
    constraints = None
    how_to_index = None
    next_id = None

    class RowType(object):
        """
        Helpful parent class for row objects.  Also used as the
        default row class by Table instances.  Provides an
        __init__() method that accepts keyword arguments from which
        the object's attributes are initialized.

        Example:

        >>> x = Table.RowType(a = 0.0, b = "test", c = True)
        >>> x.a
        0.0
        >>> x.b
        'test'
        >>> x.c
        True

        Also provides .__getstate__() and .__setstate__() methods
        to allow row objects to be pickled (otherwise, because they
        all use __slots__ to reduce their memory footprint, they
        aren't pickleable).
        """
        def __init__(self, **kwargs):
            for key, value in kwargs.items():
                setattr(self, key, value)

        def __getstate__(self):
            """
            Return a dictionary of the attributes that are set on
            this row.
            """
            try:
                names = self.__slots__
            except AttributeError:
                # this class itself does not declare __slots__, so
                # instances of it (as opposed to sub-classes that
                # do) keep their attributes in __dict__
                return dict(self.__dict__)
            return dict((key, getattr(self, key)) for key in names if hasattr(self, key))

        def __setstate__(self, state):
            self.__init__(**state)

    def __init__(self, *args):
        """
        Initialize.  All arguments are passed to the parent class'
        initializer.  The column book-keeping lists start out
        empty.
        """
        super(Table, self).__init__(*args)
        self.columnnames = []
        self.columntypes = []
        self.columnpytypes = []

    #
    # Table retrieval
    #

    @classmethod
    def getTablesByName(cls, elem, name):
        """
        Return a list of Table elements named name under elem.
        """
        name = cls.TableName(name)
        return elem.getElements(lambda e: (e.tagName == cls.tagName) and (e.Name == name))

    @classmethod
    def get_table(cls, xmldoc):
        """
        Equivalent to the module-level function get_table(), but
        uses the .tableName attribute of this class to provide the
        name of the table to search for.  The Table parent class
        does not provide a .tableName attribute, but sub-classes,
        especially those in lsctables.py, do provide a value for
        that attribute, and in those cases this class method
        provides a cleaner way to retrieve them.

        Example:

        >>> import ligolw
        >>> import lsctables
        >>> xmldoc = ligolw.Document()
        >>> xmldoc.appendChild(ligolw.LIGO_LW()).appendChild(lsctables.New(lsctables.SnglInspiralTable))
        []
        >>> sngl_inspiral_table = lsctables.SnglInspiralTable.get_table(xmldoc)
        """
        return get_table(xmldoc, cls.tableName)

    def copy(self):
        """
        Construct and return a new Table document subtree whose
        structure is the same as this table, that is it has the
        same columns etc..  The rows are not copied.  Note that a
        fair amount of metadata is shared between the original and
        new tables.  In particular, a copy of the Table object
        itself is created (but with no rows), and copies of the
        child nodes are created.  All other object references are
        shared between the two instances, such as the RowType
        attribute on the Table object.
        """
        new = copy.copy(self)
        new.childNodes = []	# got reference to original list
        for elem in self.childNodes:
            new.appendChild(copy.copy(elem))
        del new[:]
        new._end_of_columns()
        new._end_of_rows()
        return new

    @classmethod
    def CheckElement(cls, elem):
        """
        Return True if element is a Table element whose Name
        attribute matches the .tableName attribute of this class ;
        return False otherwise.  See also .CheckProperties().
        """
        return cls.CheckProperties(elem.tagName, elem.attributes)

    @classmethod
    def CheckProperties(cls, tagname, attrs):
        """
        Return True if tagname and attrs are the XML tag name and
        element attributes, respectively, of a Table element whose
        Name attribute matches the .tableName attribute of this
        class;  return False otherwise.  The Table parent class
        does not provide a .tableName attribute, but sub-classes,
        especially those in lsctables.py, do provide a value for
        that attribute.  See also .CheckElement()

        Example:

        >>> import lsctables
        >>> lsctables.ProcessTable.CheckProperties(u"Table", {u"Name": u"process:table"})
        True
        """
        return tagname == cls.tagName and cls.TableName(attrs[u"Name"]) == cls.tableName

    #
    # Column access
    #

    def getColumnByName(self, name):
        """
        Retrieve and return the Column child element named name.
        The comparison is done using the stripped names.  Raises
        KeyError if this table has no column by that name.

        Example:

        >>> import lsctables
        >>> tbl = lsctables.New(lsctables.SnglInspiralTable)
        >>> col = tbl.getColumnByName("mass1")
        """
        try:
            col, = Column.getColumnsByName(self, name)
        except ValueError:
            # did not find exactly 1 matching child
            raise KeyError(name)
        return col

    def appendColumn(self, name):
        """
        Append a Column element named "name" to the table.  Returns
        the new child.  Raises ValueError if the table already has
        a column by that name, and KeyError if the validcolumns
        attribute of this table does not contain an entry for a
        column by that name.

        Note that the name string is assumed to be "pre-stripped",
        that is it is the significant portion of the elements Name
        attribute.  The Column element's Name attribute will be
        constructed by pre-pending the stripped Table element's
        name and a colon.

        Example:

        >>> import lsctables
        >>> process_table = lsctables.New(lsctables.ProcessTable, [])
        >>> col = process_table.appendColumn("program")
        >>> col.getAttribute("Name")
        u'process:program'
        >>> col.Name
        u'program'
        """
        try:
            self.getColumnByName(name)
        except KeyError:
            # no such column yet, OK to add it
            pass
        else:
            # the table already has that column
            raise ValueError("duplicate Column '%s'" % name)
        column = Column(AttributesImpl({u"Name": "%s:%s" % (self.Name, name), u"Type": self.validcolumns[name]}))
        # Column elements are placed ahead of the Stream element if
        # there is one
        streams = self.getElementsByTagName(ligolw.Stream.tagName)
        if streams:
            self.insertBefore(column, streams[0])
        else:
            self.appendChild(column)
        return column

    #
    # Row access
    #

    def appendRow(self, *args, **kwargs):
        """
        Create and append a new row to this table, then return it

        All positional and keyword arguments are passed to the RowType
        constructor for this table.
        """
        row = self.RowType(*args, **kwargs)
        self.append(row)
        return row

    #
    # Element methods
    #

    def _update_column_info(self):
        """
        Used for validation during parsing, and additional
        book-keeping.  For internal use only.

        Rebuilds the columnnames, columntypes and columnpytypes
        lists from the Column children.  Raises ligolw.ElementError
        if a column is not listed in validcolumns (when that is
        set), has a type different from the one listed there, is a
        duplicate, or has an unrecognized type.
        """
        # the lists are emptied in place so that other references
        # to them see the update
        del self.columnnames[:]
        del self.columntypes[:]
        del self.columnpytypes[:]
        for child in self.getElementsByTagName(ligolw.Column.tagName):
            if self.validcolumns is not None:
                try:
                    if self.validcolumns[child.Name] != child.Type:
                        raise ligolw.ElementError("invalid type '%s' for Column '%s' in Table '%s', expected type '%s'" % (child.Type, child.getAttribute("Name"), self.getAttribute("Name"), self.validcolumns[child.Name]))
                except KeyError:
                    raise ligolw.ElementError("invalid Column '%s' for Table '%s'" % (child.getAttribute("Name"), self.getAttribute("Name")))
            if child.Name in self.columnnames:
                raise ligolw.ElementError("duplicate Column '%s' in Table '%s'" % (child.getAttribute("Name"), self.getAttribute("Name")))
            self.columnnames.append(child.Name)
            self.columntypes.append(child.Type)
            try:
                self.columnpytypes.append(ligolwtypes.ToPyType[child.Type])
            except KeyError:
                raise ligolw.ElementError("unrecognized Type '%s' for Column '%s' in Table '%s'" % (child.Type, child.getAttribute("Name"), self.getAttribute("Name")))

    def _verifyChildren(self, i):
        """
        Used for validation during parsing, and additional
        book-keeping.  For internal use only.

        i is the index in childNodes of the child to check.  A
        Column child triggers a refresh of the column information;
        a Stream child must carry the same (unstripped) Name
        attribute as this table or ligolw.ElementError is raised.
        """
        super(Table, self)._verifyChildren(i)
        child = self.childNodes[i]
        if child.tagName == ligolw.Column.tagName:
            self._update_column_info()
        elif child.tagName == ligolw.Stream.tagName:
            # require agreement of non-stripped strings
            if child.getAttribute("Name") != self.getAttribute("Name"):
                raise ligolw.ElementError("Stream name '%s' does not match Table name '%s'" % (child.getAttribute("Name"), self.getAttribute("Name")))

    def _end_of_columns(self):
        """
        Called during parsing to indicate that the last Column
        child element has been added.  Subclasses can override this
        to perform any special action that should occur following
        the addition of the last Column element.
        """
        pass

    def _end_of_rows(self):
        """
        Called during parsing to indicate that the last row has
        been added.  Subclasses can override this to perform any
        special action that should occur following the addition of
        the last row.
        """
        pass

    def removeChild(self, child):
        """
        Remove a child from this element.  The child element is
        returned, and its parentNode element is reset.  If the
        child is a Column the column information is refreshed.
        """
        super(Table, self).removeChild(child)
        if child.tagName == ligolw.Column.tagName:
            self._update_column_info()
        return child

    def unlink(self):
        """
        Break internal references within the document tree rooted
        on this element to promote garbage collection.  The rows
        are also removed.
        """
        super(Table, self).unlink()
        del self[:]

    def endElement(self):
        """
        Invoke the _end_of_columns() and _end_of_rows() hooks if no
        Stream child has been seen to trigger them.
        """
        # Table elements are allowed to contain 0 Stream children,
        # but _end_of_columns() and _end_of_rows() hooks must be
        # called regardless, so we do that here if needed.  a Table
        # with no children at all has no Stream child either, and
        # must not be allowed to fail on the [-1] look-up.
        if not self.childNodes or self.childNodes[-1].tagName != ligolw.Stream.tagName:
            self._end_of_columns()
            self._end_of_rows()

    #
    # Row ID manipulation
    #

    @classmethod
    def get_next_id(cls):
        """
        Returns the current value of the next_id class attribute,
        and increments the next_id class attribute by 1.  Raises
        ValueError if the table does not have an ID generator
        associated with it.
        """
        current = cls.next_id
        # = None if no ID generator
        if current is None:
            raise ValueError("Table class '%s' has no ID generator" % cls.__name__)
        cls.next_id = current + 1
        return current

    @classmethod
    def set_next_id(cls, id):
        """
        Sets the value of the next_id class attribute.  This is a
        convenience function to help prevent accidentally assigning
        a value to an instance attribute instead of the class
        attribute.
        """
        cls.next_id = id

    @classmethod
    def reset_next_id(cls):
        """
        If the current value of the next_id class attribute is not
        None then set it to 0, otherwise it is left unmodified.

        Example:

        >>> import lsctables
        >>> for cls in lsctables.TableByName.values(): cls.reset_next_id()
        """
        if cls.next_id is not None:
            # construct the 0 from the same type as the current ID
            cls.set_next_id(type(cls.next_id)(0))

    def sync_next_id(self):
        """
        Determines the highest-numbered ID in this table, and sets
        the table's .next_id attribute to the next highest ID in
        sequence.  If the .next_id attribute is already set to a
        value greater than the highest value found, then it is left
        unmodified.  The return value is the ID identified by this
        method.  If the table's .next_id attribute is None, then
        this function is a no-op.

        Note that tables of the same name typically share a common
        .next_id attribute (it is a class attribute, not an
        attribute of each instance) so that IDs can be generated
        that are unique across all tables in the document.  Running
        sync_next_id() on all the tables in a document that are of
        the same type will have the effect of setting the ID to the
        next ID higher than any ID in any of those tables.

        Example:

        >>> import lsctables
        >>> tbl = lsctables.New(lsctables.ProcessTable)
        >>> print(tbl.sync_next_id())
        process:process_id:0
        """
        if self.next_id is not None:
            if len(self):
                n = max(self.getColumnByName(self.next_id.column_name)) + 1
            else:
                # no rows:  the candidate is the 0 ID
                n = type(self.next_id)(0)
            if n > self.next_id:
                self.set_next_id(n)
        return self.next_id

    def updateKeyMapping(self, mapping):
        """
        Used as the first half of the row key reassignment
        algorithm.  Accepts a dictionary mapping old key --> new
        key.  Iterates over the rows in this table, using the
        table's next_id attribute to assign a new ID to each row,
        recording the changes in the mapping.  Returns the mapping.
        Raises ValueError if the table's next_id attribute is None,
        or if a row's ID is None.
        """
        if self.next_id is None:
            raise ValueError(self)
        try:
            column = self.getColumnByName(self.next_id.column_name)
        except KeyError:
            # table is missing its ID column, this is a no-op
            return mapping
        for i, old in enumerate(column):
            if old is None:
                raise ValueError("null row ID encountered in Table '%s', row %d" % (self.getAttribute("Name"), i))
            if old in mapping:
                # this old ID has already been given a new ID,
                # use the same one again
                column[i] = mapping[old]
            else:
                column[i] = mapping[old] = self.get_next_id()
        return mapping

    def applyKeyMapping(self, mapping):
        """
        Used as the second half of the key reassignment algorithm.
        Loops over each row in the table, replacing references to
        old row keys with the new values from the mapping.  The
        table's own ID column (the one named by next_id) is left
        alone, and values with no entry in the mapping are left
        unchanged.
        """
        for coltype, colname in zip(self.columntypes, self.columnnames):
            if coltype in ligolwtypes.IDTypes and (self.next_id is None or colname != self.next_id.column_name):
                column = self.getColumnByName(colname)
                for i, old in enumerate(column):
                    try:
                        column[i] = mapping[old]
                    except KeyError:
                        pass

944

#
# =============================================================================
#
#                               Content Handler
#
# =============================================================================
#


#
# Override portions of a ligolw.LIGOLWContentHandler class
#


def use_in(ContentHandler):
    """
    Modify ContentHandler, a sub-class of
    glue.ligolw.LIGOLWContentHandler, to cause it to use the Table,
    Column, and Stream classes defined in this module when parsing XML
    documents.  The class is modified in place and also returned.

    Example:

    >>> from glue.ligolw import ligolw
    >>> class LIGOLWContentHandler(ligolw.LIGOLWContentHandler):
    ...	pass
    ...
    >>> use_in(LIGOLWContentHandler)
    <class 'glue.ligolw.table.LIGOLWContentHandler'>
    """
    def startColumn(self, parent, attrs):
        return Column(attrs)

    # the handler's existing startStream() is captured as a default
    # argument value so that Streams outside of Tables can still be
    # passed to it after the attribute has been replaced below
    def startStream(self, parent, attrs, __orig_startStream = ContentHandler.startStream):
        if parent.tagName != ligolw.Table.tagName:
            return __orig_startStream(self, parent, attrs)
        # a Stream inside a Table:  the Table has seen the last of
        # its Column children
        parent._end_of_columns()
        return TableStream(attrs).config(parent)

    def startTable(self, parent, attrs):
        return Table(attrs)

    # install each hook under its own name
    for hook in (startColumn, startStream, startTable):
        setattr(ContentHandler, hook.__name__, hook)

    return ContentHandler

Source Code for Module glue.ligolw.table