glue.ligolw.utils

1 # Copyright (C) 2006--2014 Kipp Cannon 2 # 3 # This program is free software; you can redistribute it and/or modify it 4 # under the terms of the GNU General Public License as published by the 5 # Free Software Foundation; either version 3 of the License, or (at your 6 # option) any later version. 7 # 8 # This program is distributed in the hope that it will be useful, but 9 # WITHOUT ANY WARRANTY; without even the implied warranty of 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 11 # Public License for more details. 12 # 13 # You should have received a copy of the GNU General Public License along 14 # with this program; if not, write to the Free Software Foundation, Inc., 15 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 16 17 18 # 19 # ============================================================================= 20 # 21 # Preamble 22 # 23 # ============================================================================= 24 # 25 26 27 """ 28 Library of utility code for LIGO Light Weight XML applications. 29 """ 30 31 32 import codecs 33 import gzip 34 from hashlib import md5 35 import warnings 36 import os 37 from six.moves import urllib 38 import signal 39 import stat 40 import sys 41 42 43 from glue import git_version 44 from .. import ligolw 45 46 47 __author__ = "Kipp Cannon <kipp.cannon@ligo.org>" 48 __version__ = "git id %s" % git_version.id 49 __date__ = git_version.date 50 51 52 __all__ = ["sort_files_by_size", "local_path_from_url", "load_fileobj", "load_filename", "load_url", "write_fileobj", "write_filename", "write_url"] 53 54 55 # 56 # ============================================================================= 57 # 58 # Input/Output 59 # 60 # ============================================================================= 61 # 62 63

def sort_files_by_size(filenames, verbose = False, reverse = False):
    """
    Return a new list holding the given filenames ordered by on-disk
    file size.

    The default order is smallest file first;  set reverse to True to
    get largest first.  A filename of None (the convention used by
    glue.ligolw based codes for stdin) is assigned a size of 0.  Any
    iterable of filenames is accepted, including generator
    expressions.  If verbose is True a progress message is written to
    stderr before sorting.
    """
    if verbose:
        if reverse:
            message = "sorting files from largest to smallest ...\n"
        else:
            message = "sorting files from smallest to largest ...\n"
        sys.stderr.write(message)

    def size_of(filename):
        # None stands in for stdin, which is given a size of 0
        if filename is None:
            return 0
        return os.stat(filename).st_size

    return sorted(filenames, key = size_of, reverse = reverse)

78 79

def local_path_from_url(url):
    """
    Extract the filesystem path from a URL that refers to a file on
    the local filesystem.

    None is passed through unchanged.  A URL counts as local when its
    scheme is empty or "file" and its host is empty or "localhost"
    (both compared case-insensitively);  for any other URL ValueError
    is raised.

    Example:

    >>> print(local_path_from_url(None))
    None
    >>> local_path_from_url("file:///home/me/somefile.xml.gz")
    '/home/me/somefile.xml.gz'
    """
    if url is None:
        return None
    parts = urllib.parse.urlparse(url)
    scheme, host, path = parts[0], parts[1], parts[2]
    is_local = scheme.lower() in ("", "file") and host.lower() in ("", "localhost")
    if not is_local:
        raise ValueError("%s is not a local file" % repr(url))
    return path

101 102

class RewindableInputFile(object):
    """
    Read-only wrapper that adds a small rewind buffer, and emulated
    .seek() and .tell() methods, to a file object that might not
    support seeking.

    The wrapper remembers the most recent buffer_size octets read from
    the underlying file so that short backwards seeks can be satisfied
    from memory.  Seeks that cannot be satisfied raise IOError.
    """
    # The GzipFile class in Python's standard library is, in my
    # opinion, somewhat weak.  Instead of relying on the return values
    # from the file object's .read() method, GzipFile checks for EOF
    # using calls to .seek().  Furthermore, it uses .seek() instead of
    # buffering data internally as required.  This makes GzipFile
    # gratuitously unable to work with pipes, urlfile objects, and
    # anything else that does not support seeking (including the
    # MD5File class in this module).  To hack around this, this class
    # provides the buffering needed by GzipFile.  It also does proper
    # EOF checking, and uses the results to emulate the results of
    # GzipFile's .seek() games.
    #
    # By wrapping your file object in this class before passing it to
    # GzipFile, you can use GzipFile to read from non-seekable files.
    #
    # How GzipFile checks for EOF == call .tell() to get current
    # position, seek to end of file with .seek(0, 2), call .tell()
    # again and check if the number has changed from before, if it has
    # then we weren't at EOF so call .seek() with original position and
    # keep going.  ?!

    def __init__(self, fileobj, buffer_size = 1024):
        """
        Wrap fileobj, which must provide .read() and .close().
        buffer_size is the number of octets of history retained for
        backwards seeks.
        """
        # the real source of data
        self.fileobj = fileobj
        # where the application thinks it is in the file.  this is
        # used to fake .tell() because file objects that don't
        # support seeking, like stdin, report IOError, and the
        # things returned by urllib don't have a .tell() method at
        # all.
        self.pos = 0
        # how many octets of the internal buffer to return before
        # getting more data.  a negative value records a forward
        # seek:  that many octets must be read and discarded.
        self.reuse = 0
        # the internal buffer
        self.buf = b' ' * buffer_size
        # flag indicating a .seek()-based EOF test is in progress
        self.gzip_hack_pretend_to_be_at_eof = False
        # avoid attribute look-ups.  Python 2 iterators provide
        # .next(), Python 3 iterators provide .__next__();  accept
        # either, and defer the failure to the first call if the
        # object provides neither.
        self._next = getattr(fileobj, "next", None) or getattr(fileobj, "__next__", None)
        if self._next is None:
            self._next = lambda: next(fileobj)
        self._read = self.fileobj.read
        self.close = self.fileobj.close

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next chunk of data:  the unread tail of the rewind
        buffer if a backwards seek is pending, otherwise the next item
        from the underlying file's iterator.
        """
        # NOTE:  a pending forward seek (negative .reuse) is not
        # given any special treatment here.
        if self.gzip_hack_pretend_to_be_at_eof:
            return b''
        if self.reuse:
            buf = self.buf[-self.reuse:]
            self.reuse = 0
        else:
            buf = self._next()
            self.buf = (self.buf + buf)[-len(self.buf):]
        self.pos += len(buf)
        return buf

    # Python 3 spelling of the iterator protocol
    __next__ = next

    def read(self, size = None):
        """
        Read and return up to size octets.  If size is None or
        negative everything up to EOF is returned.  Returns b'' while
        a .seek()-based EOF test is in progress.
        """
        if self.gzip_hack_pretend_to_be_at_eof:
            return b''
        read_all = size is None or size < 0
        if self.reuse < 0:
            # a forward seek is pending:  read the skipped octets
            # along with the requested ones, then drop them
            skip = -self.reuse
            buf = self._read() if read_all else self._read(size + skip)
            self.buf = (self.buf + buf)[-len(self.buf):]
            buf = buf[skip:]
            self.reuse = 0
        elif self.reuse > 0:
            if not read_all and size < self.reuse:
                # the request is satisfied entirely from the
                # rewind buffer
                buf = self.buf[-self.reuse:-self.reuse + size]
                self.reuse -= size
            else:
                # drain the rewind buffer, then top up from the
                # real file.  this is done inline rather than by
                # calling .read() again so that .pos is advanced
                # exactly once for each octet returned.
                buf = self.buf[-self.reuse:]
                self.reuse = 0
                if read_all:
                    fresh = self._read()
                elif len(buf) < size:
                    fresh = self._read(size - len(buf))
                else:
                    fresh = b''
                self.buf = (self.buf + fresh)[-len(self.buf):]
                buf += fresh
        else:
            buf = self._read() if read_all else self._read(size)
            self.buf = (self.buf + buf)[-len(self.buf):]
        self.pos += len(buf)
        return buf

    def seek(self, offset, whence = os.SEEK_SET):
        """
        Emulate a seek.  Only positions that can be served from the
        rewind buffer, forward seeks relative to the current position,
        and .seek(0, os.SEEK_END) are accepted;  anything else raises
        IOError.  The last form does not move anything, it only arms
        the EOF test performed by the next call to .tell().
        """
        self.gzip_hack_pretend_to_be_at_eof = False
        if whence == os.SEEK_SET:
            if offset >= 0 and 0 <= self.pos + self.reuse - offset < len(self.buf):
                self.reuse += self.pos - offset
                self.pos = offset
            else:
                raise IOError("seek out of range")
        elif whence == os.SEEK_CUR:
            if self.reuse - len(self.buf) <= offset:
                self.reuse -= offset
                self.pos += offset
            else:
                raise IOError("seek out of range")
        elif whence == os.SEEK_END:
            if offset == 0:
                self.gzip_hack_pretend_to_be_at_eof = True
            else:
                raise IOError("seek out of range")

    def tell(self):
        """
        Return the emulated file position.  Immediately after
        .seek(0, os.SEEK_END) the value returned differs from the
        current position if and only if more data remains to be read.
        """
        if self.gzip_hack_pretend_to_be_at_eof:
            # check to see if we are at EOF by seeing if we can
            # read 1 character.  save it in the internal buffer
            # to not lose it.
            c = self._read(1)
            self.buf = (self.buf + c)[-len(self.buf):]
            self.reuse += len(c)
            if c:
                # this will not return the same answer as
                # when GzipFile called it before seeking to
                # EOF
                return self.pos + 1
        return self.pos

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
        return False

227 228

class MD5File(object):
    """
    File-like wrapper that updates an MD5 hash object with every
    chunk of data that passes through its .next(), .read() and
    .write() methods.  The hash object is available as the .md5obj
    attribute.
    """
    def __init__(self, fileobj, md5obj = None, closable = True):
        """
        Wrap fileobj.  md5obj is the hash object to update;  a new
        one is created if it is None.  If closable is False then
        .close() flushes the wrapped file instead of closing it.
        """
        self.fileobj = fileobj
        if md5obj is None:
            self.md5obj = md5()
        else:
            self.md5obj = md5obj
        self.closable = closable
        # avoid attribute look-ups
        self._update = self.md5obj.update
        # Python 2 iterators provide .next(), Python 3 iterators
        # provide .__next__();  accept either
        _next = getattr(fileobj, "next", None) or getattr(fileobj, "__next__", None)
        if _next is not None:
            self._next = _next
        else:
            # replace our .next() method with something that
            # will raise a more meaningful exception if
            # attempted
            self.next = lambda *args, **kwargs: fileobj.next(*args, **kwargs)
        try:
            self._read = self.fileobj.read
        except AttributeError:
            # replace our .read() method with something that
            # will raise a more meaningful exception if
            # attempted
            self.read = lambda *args, **kwargs: fileobj.read(*args, **kwargs)
        try:
            self._write = self.fileobj.write
        except AttributeError:
            # replace our .write() method with something that
            # will raise a more meaningful exception if
            # attempted
            self.write = lambda *args, **kwargs: fileobj.write(*args, **kwargs)
        try:
            self.tell = self.fileobj.tell
        except AttributeError:
            self.tell = lambda *args, **kwargs: fileobj.tell(*args, **kwargs)
        try:
            self.flush = self.fileobj.flush
        except AttributeError:
            self.flush = lambda *args, **kwargs: fileobj.flush(*args, **kwargs)

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next item from the wrapped file's iterator after
        adding it to the hash.
        """
        buf = self._next()
        self._update(buf)
        return buf

    def __next__(self):
        # Python 3 spelling of the iterator protocol.  delegate
        # through the instance so that a per-instance replacement
        # of .next() installed by __init__() is honoured.
        return self.next()

    def read(self, *args):
        """
        Read from the wrapped file, passing all arguments through,
        and add the data to the hash before returning it.
        """
        buf = self._read(*args)
        self._update(buf)
        return buf

    def write(self, buf):
        """
        Add buf to the hash, then write it to the wrapped file.
        Returns whatever the wrapped file's .write() returns.
        """
        self._update(buf)
        return self._write(buf)

    def close(self):
        """
        Close the wrapped file if this wrapper is closable, otherwise
        just flush it.
        """
        if self.closable:
            return self.fileobj.close()
        else:
            # at least make sure we're flushed
            self.flush()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
        return False

299 300

class SignalsTrap(object):
    """
    Context manager that defers delivery of a set of signals.

    On entry a handler that merely records the signal number is
    installed for each signal in trap_signals (nothing is installed
    if trap_signals is None).  On exit the previous handlers are put
    back and the recorded signals are re-sent to this process in the
    order in which they were received.
    """
    default_signals = (signal.SIGTERM, signal.SIGTSTP)

    def __init__(self, trap_signals = default_signals):
        self.trap_signals = trap_signals

    def handler(self, signum, frame):
        # remember the signal for re-delivery by __exit__()
        self.deferred_signals.append(signum)

    def __enter__(self):
        self.oldhandlers = {}
        self.deferred_signals = []
        if self.trap_signals is not None:
            for signum in self.trap_signals:
                self.oldhandlers[signum] = signal.getsignal(signum)
                signal.signal(signum, self.handler)
        return self

    def __exit__(self, *args):
        # put the handlers that were in place on entry back
        for signum, previous in self.oldhandlers.items():
            signal.signal(signum, previous)
        # now replay whatever arrived in the meantime, oldest first
        pid = os.getpid()
        pending = self.deferred_signals
        while pending:
            os.kill(pid, pending.pop(0))
        return False

328 329

def load_fileobj(fileobj, gz = None, xmldoc = None, contenthandler = None):
    """
    Parse a LIGO Light Weight XML document from the file object
    fileobj, which is not required to be seekable.

    gz selects how gzip compression is handled:  None (the default)
    means detect gzip data automatically and decompress it, True
    forces decompression, False disables it.

    If xmldoc is not None the parsed tree is appended to that
    document, otherwise a new document is created.

    contenthandler is the SAX content handler class used to parse the
    document, and is a required argument.  See the glue.ligolw package
    documentation for a typical parsing scenario involving a custom
    content handler, and see
    glue.ligolw.ligolw.PartialLIGOLWContentHandler and
    glue.ligolw.ligolw.FilteringLIGOLWContentHandler for examples of
    custom content handlers used to load subsets of documents into
    memory.

    The return value is a two-element tuple:  the XML document, and a
    string containing the hex digits of the MD5 digest of the
    bytestream that was read from fileobj.

    Example:

    >>> from glue.ligolw import ligolw
    >>> import StringIO
    >>> f = StringIO.StringIO('<?xml version="1.0" encoding="utf-8" ?><!DOCTYPE LIGO_LW SYSTEM "http://ldas-sw.ligo.caltech.edu/doc/ligolwAPI/html/ligolw_dtd.txt"><LIGO_LW><Table Name="demo:table"><Column Name="name" Type="lstring"/><Column Name="value" Type="real8"/><Stream Name="demo:table" Type="Local" Delimiter=",">"mass",0.5,"velocity",34</Stream></Table></LIGO_LW>')
    >>> xmldoc, digest = load_fileobj(f, contenthandler = ligolw.LIGOLWContentHandler)
    >>> digest
    '6bdcc4726b892aad913531684024ed8e'
    """
    # everything read from here on is hashed on its way through
    stream = MD5File(fileobj)
    md5obj = stream.md5obj
    if gz is None or gz:
        # peek at the first two octets, then rewind so the parser
        # (or gzip) sees the stream from the start
        stream = RewindableInputFile(stream)
        magic = stream.read(2)
        stream.seek(0, os.SEEK_SET)
        if gz or magic == b'\037\213':
            stream = gzip.GzipFile(mode = "rb", fileobj = stream)
    if xmldoc is None:
        xmldoc = ligolw.Document()
    parser = ligolw.make_parser(contenthandler(xmldoc))
    parser.parse(stream)
    return xmldoc, md5obj.hexdigest()

378 379

def load_filename(filename, verbose = False, **kwargs):
    """
    Parse the contents of the file identified by filename, and return
    the contents as a LIGO Light Weight document tree.  stdin is parsed
    if filename is None.  Helpful verbosity messages are printed to
    stderr if verbose is True.  All other keyword arguments are passed
    to load_fileobj(), see that function for more information.  In
    particular note that a content handler must be specified.

    A file opened by this function is closed again before the function
    returns, whether or not parsing succeeded.

    Example:

    >>> from glue.ligolw import ligolw
    >>> xmldoc = load_filename("demo.xml", contenthandler = ligolw.LIGOLWContentHandler, verbose = True)
    """
    if verbose:
        sys.stderr.write("reading %s ...\n" % (("'%s'" % filename) if filename is not None else "stdin"))
    if filename is not None:
        # the with statement guarantees the file is closed even if
        # the parser raises
        with open(filename, "rb") as fileobj:
            xmldoc, hexdigest = load_fileobj(fileobj, **kwargs)
    else:
        # on Python 3 sys.stdin is a text stream;  the document must
        # be read as bytes, so use the underlying binary buffer when
        # there is one
        xmldoc, hexdigest = load_fileobj(getattr(sys.stdin, "buffer", sys.stdin), **kwargs)
    if verbose:
        sys.stderr.write("md5sum: %s %s\n" % (hexdigest, (filename if filename is not None else "")))
    return xmldoc

404 405

def load_url(url, verbose = False, **kwargs):
    """
    Parse the contents of file at the given URL and return the contents
    as a LIGO Light Weight document tree.  Any source from which
    Python's urllib library can read data is acceptable.  stdin is
    parsed if url is None.  Helpful verbosity messages are printed to
    stderr if verbose is True.  All other keyword arguments are passed
    to load_fileobj(), see that function for more information.  In
    particular note that a content handler must be specified.

    URLs whose scheme is empty or "file" and whose host is empty or
    "localhost" are opened directly as local files;  everything else
    is opened with urllib.  Whatever this function opens is closed
    again before it returns, whether or not parsing succeeded.

    Example:

    >>> from os import getcwd
    >>> from glue.ligolw import ligolw
    >>> xmldoc = load_url("file://localhost/%s/demo.xml" % getcwd(), contenthandler = ligolw.LIGOLWContentHandler, verbose = True)
    """
    if verbose:
        sys.stderr.write("reading %s ...\n" % (("'%s'" % url) if url is not None else "stdin"))
    if url is not None:
        scheme, host, path = urllib.parse.urlparse(url)[:3]
        if scheme.lower() in ("", "file") and host.lower() in ("", "localhost"):
            fileobj = open(path, "rb")
        else:
            fileobj = urllib.request.urlopen(url)
        # release the file or connection even if the parser raises
        try:
            xmldoc, hexdigest = load_fileobj(fileobj, **kwargs)
        finally:
            fileobj.close()
    else:
        # on Python 3 sys.stdin is a text stream;  the document must
        # be read as bytes, so use the underlying binary buffer when
        # there is one
        xmldoc, hexdigest = load_fileobj(getattr(sys.stdin, "buffer", sys.stdin), **kwargs)
    if verbose:
        sys.stderr.write("md5sum: %s %s\n" % (hexdigest, (url if url is not None else "")))
    return xmldoc

436 437

def write_fileobj(xmldoc, fileobj, gz = False, compresslevel = 3, **kwargs):
    """
    Serialize the LIGO Light Weight document tree rooted at xmldoc to
    the file object fileobj, which need not be seekable.

    The document's own .write() method does the work, and receives
    any additional keyword arguments.  The text it produces is
    encoded as UTF-8.  If gz is True the output is gzip compressed on
    the fly using compression level compresslevel (default 3).  The
    file object is flushed but not closed.

    The return value is a string containing the hex digits of the MD5
    digest of the output bytestream.

    Example:

    >>> import sys
    >>> from glue.ligolw import ligolw
    >>> xmldoc = load_filename("demo.xml", contenthandler = ligolw.LIGOLWContentHandler)
    >>> digest = write_fileobj(xmldoc, sys.stdout) # doctest: +NORMALIZE_WHITESPACE
    <?xml version='1.0' encoding='utf-8'?>
    <!DOCTYPE LIGO_LW SYSTEM "http://ldas-sw.ligo.caltech.edu/doc/ligolwAPI/html/ligolw_dtd.txt">
    <LIGO_LW>
        <Table Name="demo:table">
            <Column Type="lstring" Name="name"/>
            <Column Type="real8" Name="value"/>
            <Stream Delimiter="," Type="Local" Name="demo:table">
    "mass",0.5,"velocity",34
            </Stream>
        </Table>
    </LIGO_LW>
    >>> digest
    '37044d979a79409b3d782da126636f53'
    """
    # the hashing layer sits directly on top of the caller's file so
    # that the digest describes the octets actually written to it
    with MD5File(fileobj, closable = False) as hashed:
        md5obj = hashed.md5obj
        if gz:
            sink = gzip.GzipFile(mode = "wb", fileobj = hashed, compresslevel = compresslevel)
        else:
            sink = hashed
        with sink:
            with codecs.getwriter("utf_8")(sink) as writer:
                xmldoc.write(writer, **kwargs)
    # every layer has been closed by now, so the digest is complete
    return md5obj.hexdigest()

475 476

class tildefile(object):
    """
    Context manager for writing a file by way of a temporary file
    whose name is the target filename with "~" appended.

    Entering the context returns a file object opened for binary
    writing.  If the "~" file cannot be opened (IOError) the target
    file itself is opened instead.  On exit the file object is closed
    and, provided no exception occurred inside the context, the "~"
    file is renamed to the target filename.
    """
    def __init__(self, filename):
        # reject None and the empty string
        if not filename:
            raise ValueError(filename)
        self.filename = filename

    def __enter__(self):
        self.tildefilename = self.filename + "~"
        try:
            self.fobj = open(self.tildefilename, "wb")
        except IOError:
            # can't create the "~" file;  write straight to the
            # destination, and record that there is nothing to
            # rename afterwards
            self.tildefilename = None
            self.fobj = open(self.filename, "wb")
        return self.fobj

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.fobj.close()
        del self.fobj

        #
        # the "~" version is moved to the final destination only
        # if the body of the with statement completed without an
        # exception.
        #

        succeeded = exc_type is None
        if succeeded and self.tildefilename is not None:
            os.rename(self.tildefilename, self.filename)

        return False

505 506

def write_filename(xmldoc, filename, verbose = False, gz = False, with_mv = True, trap_signals = SignalsTrap.default_signals, **kwargs):
    """
    Writes the LIGO Light Weight document tree rooted at xmldoc to the
    file name filename.  If filename is None the file is written to
    stdout, otherwise it is written to the named file.  Friendly
    verbosity messages are printed while writing the file if verbose is
    True.  The output data is gzip compressed on the fly if gz is True.
    If with_mv is True and filename is not None the filename has a "~"
    appended to it and the file is written to that name then moved to
    the requested name once the write has completed successfully.

    A UserWarning is issued if filename ends in ".gz" but gz is not
    True.

    Internally, write_fileobj() is used to perform the write.  All
    additional keyword arguments are passed to write_fileobj().

    This function traps the signals in the trap_signals iterable during
    the write process (see SignalsTrap for the default signals), and it
    does this by temporarily installing its own signal handlers in
    place of the current handlers.  This is done to prevent Condor
    eviction during the write process.  When the file write is
    concluded the original signal handlers are restored.  Then, if
    signals were trapped during the write process, the signals are then
    resent to the current process in the order in which they were
    received.  The signal.signal() system call cannot be invoked from
    threads, and trap_signals must be set to None or an empty sequence
    if this function is used from a thread.

    Example:

    >>> write_filename(xmldoc, "demo.xml") # doctest: +SKIP
    >>> write_filename(xmldoc, "demo.xml.gz", gz = True) # doctest: +SKIP
    """
    if verbose:
        sys.stderr.write("writing %s ...\n" % (("'%s'" % filename) if filename is not None else "stdout"))
    with SignalsTrap(trap_signals):
        if filename is None:
            # write_fileobj() emits encoded bytes.  on Python 3
            # sys.stdout is a text stream that rejects bytes, so use
            # the underlying binary buffer when there is one
            stdout = getattr(sys.stdout, "buffer", sys.stdout)
            hexdigest = write_fileobj(xmldoc, stdout, gz = gz, **kwargs)
        else:
            if not gz and filename.endswith(".gz"):
                warnings.warn("filename '%s' ends in '.gz' but file is not being gzip-compressed" % filename, UserWarning)
            # both choices are context managers that yield a binary
            # file object and close it on exit
            if with_mv:
                target = tildefile(filename)
            else:
                target = open(filename, "wb")
            with target as fileobj:
                hexdigest = write_fileobj(xmldoc, fileobj, gz = gz, **kwargs)
    if verbose:
        sys.stderr.write("md5sum: %s %s\n" % (hexdigest, (filename if filename is not None else "")))

551 552

def write_url(xmldoc, url, **kwargs):
    """
    Write the LIGO Light Weight document tree rooted at xmldoc to the
    location named by url.

    NOTE:  at present the URL must point to a local file.  The URL is
    converted to a filesystem path with local_path_from_url() and the
    write is carried out by write_filename(), which receives all
    additional keyword arguments.  The implementation might change in
    the future, especially if support for other types of URLs is ever
    added.

    Example:

    >>> write_url(xmldoc, "file:///data.xml") # doctest: +SKIP
    >>> write_url(xmldoc, "file:///data.xml.gz", gz = True) # doctest: +SKIP
    """
    filename = local_path_from_url(url)
    return write_filename(xmldoc, filename, **kwargs)

570

Source Code for Package glue.ligolw.utils