1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 """
28 Library of utility code for LIGO Light Weight XML applications.
29 """
30
31
32 import codecs
33 import gzip
34 from hashlib import md5
35 import warnings
36 import os
37 from six.moves import urllib
38 import signal
39 import stat
40 import sys
41
42
43 from glue import git_version
44 from .. import ligolw
45
46
47 __author__ = "Kipp Cannon <kipp.cannon@ligo.org>"
48 __version__ = "git id %s" % git_version.id
49 __date__ = git_version.date
50
51
52 __all__ = ["sort_files_by_size", "local_path_from_url", "load_fileobj", "load_filename", "load_url", "write_fileobj", "write_filename", "write_url"]
53
54
55
56
57
58
59
60
61
62
63
65 """
66 Return a list of the filenames sorted in order from smallest file
67 to largest file (or largest to smallest if reverse is set to True).
68 If a filename in the list is None (used by many glue.ligolw based
69 codes to indicate stdin), its size is treated as 0. The filenames
70 may be any sequence, including generator expressions.
71 """
72 if verbose:
73 if reverse:
74 sys.stderr.write("sorting files from largest to smallest ...\n")
75 else:
76 sys.stderr.write("sorting files from smallest to largest ...\n")
77 return sorted(filenames, key = (lambda filename: os.stat(filename)[stat.ST_SIZE] if filename is not None else 0), reverse = reverse)
78
79
81 """
82 For URLs that point to locations in the local filesystem, extract
83 and return the filesystem path of the object to which they point.
84 As a special case pass-through, if the URL is None, the return
85 value is None. Raises ValueError if the URL is not None and does
86 not point to a local file.
87
88 Example:
89
90 >>> print(local_path_from_url(None))
91 None
92 >>> local_path_from_url("file:///home/me/somefile.xml.gz")
93 '/home/me/somefile.xml.gz'
94 """
95 if url is None:
96 return None
97 scheme, host, path = urllib.parse.urlparse(url)[:3]
98 if scheme.lower() not in ("", "file") or host.lower() not in ("", "localhost"):
99 raise ValueError("%s is not a local file" % repr(url))
100 return path
101
102
227
228
230 - def __init__(self, fileobj, md5obj = None, closable = True):
231 self.fileobj = fileobj
232 if md5obj is None:
233 self.md5obj = md5()
234 else:
235 self.md5obj = md5obj
236 self.closable = closable
237
238 self._update = self.md5obj.update
239 try:
240 self._next = self.fileobj.next
241 except AttributeError:
242
243
244
245 self.next = lambda *args, **kwargs: fileobj.next(*args, **kwargs)
246 try:
247 self._read = self.fileobj.read
248 except AttributeError:
249
250
251
252 self.read = lambda *args, **kwargs: fileobj.read(*args, **kwargs)
253 try:
254 self._write = self.fileobj.write
255 except AttributeError:
256
257
258
259 self.write = lambda *args, **kwargs: fileobj.write(*args, **kwargs)
260 try:
261 self.tell = self.fileobj.tell
262 except AttributeError:
263 self.tell = lambda *args, **kwargs: fileobj.tell(*args, **kwargs)
264 try:
265 self.flush = self.fileobj.flush
266 except AttributeError:
267 self.flush = lambda *args, **kwargs: fileobj.flush(*args, **kwargs)
268
271
273 buf = self._next()
274 self._update(buf)
275 return buf
276
def read(self, *args):
    """
    Read from the wrapped file object, passing all positional
    arguments through to its .read() method, feed the data obtained
    to the MD5 hash, and return the data to the caller.
    """
    chunk = self._read(*args)
    # account for the bytes in the running digest before handing
    # them back
    self._update(chunk)
    return chunk
281
283 self._update(buf)
284 return self._write(buf)
285
287 if self.closable:
288 return self.fileobj.close()
289 else:
290
291 self.flush()
292
295
297 self.close()
298 return False
299
300
302 default_signals = (signal.SIGTERM, signal.SIGTSTP)
303
305 self.trap_signals = trap_signals
306
308 self.deferred_signals.append(signum)
309
311 self.oldhandlers = {}
312 self.deferred_signals = []
313 if self.trap_signals is None:
314 return self
315 for sig in self.trap_signals:
316 self.oldhandlers[sig] = signal.getsignal(sig)
317 signal.signal(sig, self.handler)
318 return self
319
321
322 for sig, oldhandler in self.oldhandlers.items():
323 signal.signal(sig, oldhandler)
324
325 while self.deferred_signals:
326 os.kill(os.getpid(), self.deferred_signals.pop(0))
327 return False
328
329
def load_fileobj(fileobj, gz = None, xmldoc = None, contenthandler = None):
    """
    Parse the contents of the file object fileobj, and return the
    contents as a LIGO Light Weight document tree.  The file object
    does not need to be seekable.  The file object must yield bytes,
    not text.

    If the gz parameter is None (the default) then gzip compressed data
    will be automatically detected and decompressed, otherwise
    decompression can be forced on or off by setting gz to True or
    False respectively.

    If the optional xmldoc argument is provided and not None, the
    parsed XML tree will be appended to that document, otherwise a new
    document will be created.  The return value is a tuple, the first
    element of the tuple is the XML document and the second is a string
    containing the MD5 digest in hex digits of the bytestream that was
    parsed.

    Example:

    >>> from glue.ligolw import ligolw
    >>> import io
    >>> f = io.BytesIO(b'<?xml version="1.0" encoding="utf-8" ?><!DOCTYPE LIGO_LW SYSTEM "http://ldas-sw.ligo.caltech.edu/doc/ligolwAPI/html/ligolw_dtd.txt"><LIGO_LW><Table Name="demo:table"><Column Name="name" Type="lstring"/><Column Name="value" Type="real8"/><Stream Name="demo:table" Type="Local" Delimiter=",">"mass",0.5,"velocity",34</Stream></Table></LIGO_LW>')
    >>> xmldoc, digest = load_fileobj(f, contenthandler = ligolw.LIGOLWContentHandler)
    >>> digest
    '6bdcc4726b892aad913531684024ed8e'

    The contenthandler argument specifies the SAX content handler to
    use when parsing the document.  The contenthandler is a required
    argument;  TypeError is raised, before any data is read from
    fileobj, if it is omitted or None.  See the glue.ligolw package
    documentation for typical parsing scenario involving a custom
    content handler.  See
    glue.ligolw.ligolw.PartialLIGOLWContentHandler and
    glue.ligolw.ligolw.FilteringLIGOLWContentHandler for examples of
    custom content handlers used to load subsets of documents into
    memory.
    """
    if contenthandler is None:
        # without this check the failure would surface further down
        # as "'NoneType' object is not callable", and only after
        # bytes had already been consumed from the caller's stream
        raise TypeError("load_fileobj() requires a contenthandler, e.g. contenthandler = ligolw.LIGOLWContentHandler")
    # everything read from here on is accounted for in the digest
    fileobj = MD5File(fileobj)
    md5obj = fileobj.md5obj
    if gz or gz is None:
        # peek at the first two bytes to look for the gzip magic
        # number, then rewind so the parser (or the decompressor)
        # sees the stream from the beginning
        fileobj = RewindableInputFile(fileobj)
        magic = fileobj.read(2)
        fileobj.seek(0, os.SEEK_SET)
        if gz or magic == b'\037\213':
            fileobj = gzip.GzipFile(mode = "rb", fileobj = fileobj)
    if xmldoc is None:
        xmldoc = ligolw.Document()
    ligolw.make_parser(contenthandler(xmldoc)).parse(fileobj)
    return xmldoc, md5obj.hexdigest()
378
379
381 """
382 Parse the contents of the file identified by filename, and return
383 the contents as a LIGO Light Weight document tree. stdin is parsed
384 if filename is None. Helpful verbosity messages are printed to
385 stderr if verbose is True. All other keyword arguments are passed
386 to load_fileobj(), see that function for more information. In
387 particular note that a content handler must be specified.
388
389 Example:
390
391 >>> from glue.ligolw import ligolw
392 >>> xmldoc = load_filename("demo.xml", contenthandler = ligolw.LIGOLWContentHandler, verbose = True)
393 """
394 if verbose:
395 sys.stderr.write("reading %s ...\n" % (("'%s'" % filename) if filename is not None else "stdin"))
396 if filename is not None:
397 fileobj = open(filename, "rb")
398 else:
399 fileobj = sys.stdin
400 xmldoc, hexdigest = load_fileobj(fileobj, **kwargs)
401 if verbose:
402 sys.stderr.write("md5sum: %s %s\n" % (hexdigest, (filename if filename is not None else "")))
403 return xmldoc
404
405
def load_url(url, verbose = False, **kwargs):
    """
    Parse the contents of file at the given URL and return the contents
    as a LIGO Light Weight document tree.  Any source from which
    Python's urllib library can read data is acceptable.  stdin is
    parsed if url is None.  Helpful verbosity messages are printed to
    stderr if verbose is True.  All other keyword arguments are passed
    to load_fileobj(), see that function for more information.  In
    particular note that a content handler must be specified.

    The file or network handle opened by this function is closed
    before returning, whether or not parsing succeeded.  stdin is
    never closed.

    Example:

    >>> from os import getcwd
    >>> from glue.ligolw import ligolw
    >>> xmldoc = load_url("file://localhost/%s/demo.xml" % getcwd(), contenthandler = ligolw.LIGOLWContentHandler, verbose = True)
    """
    if verbose:
        sys.stderr.write("reading %s ...\n" % (("'%s'" % url) if url is not None else "stdin"))
    if url is None:
        # on Python 3 sys.stdin is a text stream;  the parser stack
        # (md5 + optional gzip) needs bytes, which are available from
        # the underlying .buffer.  Python 2's stdin has no .buffer
        # and is already a byte stream.
        fileobj = getattr(sys.stdin, "buffer", sys.stdin)
        opened_here = False
    else:
        scheme, host, path = urllib.parse.urlparse(url)[:3]
        if scheme.lower() in ("", "file") and host.lower() in ("", "localhost"):
            # local file:  open directly
            fileobj = open(path, "rb")
        else:
            fileobj = urllib.request.urlopen(url)
        opened_here = True
    try:
        xmldoc, hexdigest = load_fileobj(fileobj, **kwargs)
    finally:
        # don't leak the handle we created;  leave stdin alone
        if opened_here:
            fileobj.close()
    if verbose:
        sys.stderr.write("md5sum: %s %s\n" % (hexdigest, (url if url is not None else "")))
    return xmldoc
436
437
def write_fileobj(xmldoc, fileobj, gz = False, compresslevel = 3, **kwargs):
    """
    Serialize the LIGO Light Weight document tree rooted at xmldoc to
    the file object fileobj by calling the document's .write() method;
    any extra keyword arguments are forwarded to that method.  The
    file object need not be seekable.  When gz is True the output is
    gzip compressed on the fly, with compresslevel selecting the gzip
    compression level (default 3).  Returns a string holding the hex
    digits of the MD5 digest of the bytestream delivered to fileobj.

    Example:

    >>> import sys
    >>> from glue.ligolw import ligolw
    >>> xmldoc = load_filename("demo.xml", contenthandler = ligolw.LIGOLWContentHandler)
    >>> digest = write_fileobj(xmldoc, sys.stdout) # doctest: +NORMALIZE_WHITESPACE
    <?xml version='1.0' encoding='utf-8'?>
    <!DOCTYPE LIGO_LW SYSTEM "http://ldas-sw.ligo.caltech.edu/doc/ligolwAPI/html/ligolw_dtd.txt">
    <LIGO_LW>
        <Table Name="demo:table">
            <Column Type="lstring" Name="name"/>
            <Column Type="real8" Name="value"/>
            <Stream Delimiter="," Type="Local" Name="demo:table">
    "mass",0.5,"velocity",34
            </Stream>
        </Table>
    </LIGO_LW>
    >>> digest
    '37044d979a79409b3d782da126636f53'
    """
    # innermost layer:  digest tracking.  closable = False so that
    # leaving the with blocks does not close the caller's file object
    with MD5File(fileobj, closable = False) as hashed:
        digest = hashed.md5obj
        # middle layer:  optional compression.  the compressor sits
        # on top of the digest layer, so the digest is of the
        # compressed bytes
        if gz:
            sink = gzip.GzipFile(mode = "wb", fileobj = hashed, compresslevel = compresslevel)
        else:
            sink = hashed
        with sink as stream:
            # outermost layer:  encode text to UTF-8
            with codecs.getwriter("utf_8")(stream) as writer:
                xmldoc.write(writer, **kwargs)
    return digest.hexdigest()
475
476
482
484 try:
485 self.tildefilename = self.filename + "~"
486 self.fobj = open(self.tildefilename, "wb")
487 except IOError:
488 self.tildefilename = None
489 self.fobj = open(self.filename, "wb")
490 return self.fobj
491
492 - def __exit__(self, exc_type, exc_val, exc_tb):
493 self.fobj.close()
494 del self.fobj
495
496
497
498
499
500
501 if exc_type is None and self.tildefilename is not None:
502 os.rename(self.tildefilename, self.filename)
503
504 return False
505
506
508 """
509 Writes the LIGO Light Weight document tree rooted at xmldoc to the
510 file name filename. If filename is None the file is written to
511 stdout, otherwise it is written to the named file. Friendly
512 verbosity messages are printed while writing the file if verbose is
513 True. The output data is gzip compressed on the fly if gz is True.
514 If with_mv is True and filename is not None the filename has a "~"
515 appended to it and the file is written to that name then moved to
516 the requested name once the write has completed successfully.
517
518 Internally, write_fileobj() is used to perform the write. All
519 additional keyword arguments are passed to write_fileobj().
520
521 This function traps the signals in the trap_signals iterable during
522 the write process (see SignalsTrap for the default signals), and it
523 does this by temporarily installing its own signal handlers in
524 place of the current handlers. This is done to prevent Condor
525 eviction during the write process. When the file write is
526 concluded the original signal handlers are restored. Then, if
527 signals were trapped during the write process, the signals are then
528 resent to the current process in the order in which they were
529 received. The signal.signal() system call cannot be invoked from
530 threads, and trap_signals must be set to None or an empty sequence
531 if this function is used from a thread.
532
533 Example:
534
535 >>> write_filename(xmldoc, "demo.xml") # doctest: +SKIP
536 >>> write_filename(xmldoc, "demo.xml.gz", gz = True) # doctest: +SKIP
537 """
538 if verbose:
539 sys.stderr.write("writing %s ...\n" % (("'%s'" % filename) if filename is not None else "stdout"))
540 with SignalsTrap(trap_signals):
541 if filename is None:
542 hexdigest = write_fileobj(xmldoc, sys.stdout, gz = gz, **kwargs)
543 else:
544 if not gz and filename.endswith(".gz"):
545 warnings.warn("filename '%s' ends in '.gz' but file is not being gzip-compressed" % filename, UserWarning)
546 binary_open = lambda filename: open(filename, 'wb')
547 with (binary_open if not with_mv else tildefile)(filename) as fileobj:
548 hexdigest = write_fileobj(xmldoc, fileobj, gz = gz, **kwargs)
549 if verbose:
550 sys.stderr.write("md5sum: %s %s\n" % (hexdigest, (filename if filename is not None else "")))
551
552
554 """
555 Writes the LIGO Light Weight document tree rooted at xmldoc to the
556 URL name url.
557
558 NOTE: only URLs that point to local files can be written to at
559 this time. Internally, write_filename() is used to perform the
560 write. All additional keyword arguments are passed to that
561 function. The implementation might change in the future,
562 especially if support for other types of URLs is ever added.
563
564 Example:
565
566 >>> write_url(xmldoc, "file:///data.xml") # doctest: +SKIP
567 >>> write_url(xmldoc, "file:///data.xml.gz", gz = True) # doctest: +SKIP
568 """
569 return write_filename(xmldoc, local_path_from_url(url), **kwargs)
570