1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23   
 24   
 25   
 26   
 27  """ 
 28  Library of utility code for LIGO Light Weight XML applications. 
 29  """ 
 30   
 31   
 32  import codecs 
 33  import gzip 
 34  from hashlib import md5 
 35  import warnings 
 36  import os 
 37  from six.moves import urllib 
 38  import signal 
 39  import stat 
 40  import sys 
 41   
 42   
 43  from glue import git_version 
 44  from .. import ligolw 
 45   
 46   
 47  __author__ = "Kipp Cannon <kipp.cannon@ligo.org>" 
 48  __version__ = "git id %s" % git_version.id 
 49  __date__ = git_version.date 
 50   
 51   
 52  __all__ = ["sort_files_by_size", "local_path_from_url", "load_fileobj", "load_filename", "load_url", "write_fileobj", "write_filename", "write_url"] 
 53   
 54   
 55   
 56   
 57   
 58   
 59   
 60   
 61   
 62   
 63   
 65          """ 
 66          Return a list of the filenames sorted in order from smallest file 
 67          to largest file (or largest to smallest if reverse is set to True). 
 68          If a filename in the list is None (used by many glue.ligolw based 
 69          codes to indicate stdin), its size is treated as 0.  The filenames 
 70          may be any sequence, including generator expressions. 
 71          """ 
 72          if verbose: 
 73                  if reverse: 
 74                          sys.stderr.write("sorting files from largest to smallest ...\n") 
 75                  else: 
 76                          sys.stderr.write("sorting files from smallest to largest ...\n") 
 77          return sorted(filenames, key = (lambda filename: os.stat(filename)[stat.ST_SIZE] if filename is not None else 0), reverse = reverse) 
  78   
 79   
 81          """ 
 82          For URLs that point to locations in the local filesystem, extract 
 83          and return the filesystem path of the object to which they point. 
 84          As a special case pass-through, if the URL is None, the return 
 85          value is None.  Raises ValueError if the URL is not None and does 
 86          not point to a local file. 
 87   
 88          Example: 
 89   
 90          >>> print(local_path_from_url(None)) 
 91          None 
 92          >>> local_path_from_url("file:///home/me/somefile.xml.gz") 
 93          '/home/me/somefile.xml.gz' 
 94          """ 
 95          if url is None: 
 96                  return None 
 97          scheme, host, path = urllib.parse.urlparse(url)[:3] 
 98          if scheme.lower() not in ("", "file") or host.lower() not in ("", "localhost"): 
 99                  raise ValueError("%s is not a local file" % repr(url)) 
100          return path 
 101   
102   
227   
228   
230 -        def __init__(self, fileobj, md5obj = None, closable = True): 
 231                  self.fileobj = fileobj 
232                  if md5obj is None: 
233                          self.md5obj = md5() 
234                  else: 
235                          self.md5obj = md5obj 
236                  self.closable = closable 
237                   
238                  self._update = self.md5obj.update 
239                  try: 
240                          self._next = self.fileobj.next 
241                  except AttributeError: 
242                           
243                           
244                           
245                          self.next = lambda *args, **kwargs: fileobj.next(*args, **kwargs) 
246                  try: 
247                          self._read = self.fileobj.read 
248                  except AttributeError: 
249                           
250                           
251                           
252                          self.read = lambda *args, **kwargs: fileobj.read(*args, **kwargs) 
253                  try: 
254                          self._write = self.fileobj.write 
255                  except AttributeError: 
256                           
257                           
258                           
259                          self.write = lambda *args, **kwargs: fileobj.write(*args, **kwargs) 
260                  try: 
261                          self.tell = self.fileobj.tell 
262                  except AttributeError: 
263                          self.tell = lambda *args, **kwargs: fileobj.tell(*args, **kwargs) 
264                  try: 
265                          self.flush = self.fileobj.flush 
266                  except AttributeError: 
267                          self.flush = lambda *args, **kwargs: fileobj.flush(*args, **kwargs) 
 268   
271   
273                  buf = self._next() 
274                  self._update(buf) 
275                  return buf 
 276   
277 -        def read(self, *args): 
 278                  buf = self._read(*args) 
279                  self._update(buf) 
280                  return buf 
 281   
283                  self._update(buf) 
284                  return self._write(buf) 
 285   
287                  if self.closable: 
288                          return self.fileobj.close() 
289                  else: 
290                           
291                          self.flush() 
 292   
295   
297                  self.close() 
298                  return False 
  299   
300   
302          default_signals = (signal.SIGTERM, signal.SIGTSTP) 
303   
305                  self.trap_signals = trap_signals 
 306   
308                  self.deferred_signals.append(signum) 
 309   
311                  self.oldhandlers = {} 
312                  self.deferred_signals = [] 
313                  if self.trap_signals is None: 
314                          return self 
315                  for sig in self.trap_signals: 
316                          self.oldhandlers[sig] = signal.getsignal(sig) 
317                          signal.signal(sig, self.handler) 
318                  return self 
 319   
321                   
322                  for sig, oldhandler in self.oldhandlers.items(): 
323                          signal.signal(sig, oldhandler) 
324                   
325                  while self.deferred_signals: 
326                          os.kill(os.getpid(), self.deferred_signals.pop(0)) 
327                  return False 
  328   
329   
def load_fileobj(fileobj, gz = None, xmldoc = None, contenthandler = None):
        """
        Parse the contents of the file object fileobj, and return the
        contents as a LIGO Light Weight document tree.  The file object
        does not need to be seekable.

        If the gz parameter is None (the default) then gzip compressed data
        will be automatically detected and decompressed, otherwise
        decompression can be forced on or off by setting gz to True or
        False respectively.

        If the optional xmldoc argument is provided and not None, the
        parsed XML tree will be appended to that document, otherwise a new
        document will be created.  The return value is a tuple, the first
        element of the tuple is the XML document and the second is a string
        containing the MD5 digest in hex digits of the bytestream that was
        parsed.

        Example:

        >>> from glue.ligolw import ligolw
        >>> import StringIO
        >>> f = StringIO.StringIO('<?xml version="1.0" encoding="utf-8" ?><!DOCTYPE LIGO_LW SYSTEM "http://ldas-sw.ligo.caltech.edu/doc/ligolwAPI/html/ligolw_dtd.txt"><LIGO_LW><Table Name="demo:table"><Column Name="name" Type="lstring"/><Column Name="value" Type="real8"/><Stream Name="demo:table" Type="Local" Delimiter=",">"mass",0.5,"velocity",34</Stream></Table></LIGO_LW>')
        >>> xmldoc, digest = load_fileobj(f, contenthandler = ligolw.LIGOLWContentHandler)
        >>> digest
        '6bdcc4726b892aad913531684024ed8e'

        The contenthandler argument specifies the SAX content handler to
        use when parsing the document.  The contenthandler is a required
        argument;  TypeError is raised, before any data is read from
        fileobj, if it is None.  See the glue.ligolw package documentation
        for typical parsing scenario involving a custom content handler.
        See glue.ligolw.ligolw.PartialLIGOLWContentHandler and
        glue.ligolw.ligolw.FilteringLIGOLWContentHandler for examples of
        custom content handlers used to load subsets of documents into
        memory.
        """
        if contenthandler is None:
                # the keyword has a default of None only so that it can
                # follow the optional arguments in the signature.  fail
                # now, with a meaningful message, rather than after the
                # input stream has been wrapped and partially consumed.
                raise TypeError("contenthandler is a required argument, it must be a SAX content handler class")
        # hash the raw bytestream, i.e. before any decompression
        fileobj = MD5File(fileobj)
        md5obj = fileobj.md5obj
        if gz or gz is None:
                # peek at the first two bytes to look for the gzip magic
                # number, then rewind so the consumer sees the whole stream
                fileobj = RewindableInputFile(fileobj)
                magic = fileobj.read(2)
                fileobj.seek(0, os.SEEK_SET)
                if gz or magic == b'\037\213':
                        fileobj = gzip.GzipFile(mode = "rb", fileobj = fileobj)
        if xmldoc is None:
                xmldoc = ligolw.Document()
        ligolw.make_parser(contenthandler(xmldoc)).parse(fileobj)
        return xmldoc, md5obj.hexdigest()
 378   
379   
381          """ 
382          Parse the contents of the file identified by filename, and return 
383          the contents as a LIGO Light Weight document tree.  stdin is parsed 
384          if filename is None.  Helpful verbosity messages are printed to 
385          stderr if verbose is True.  All other keyword arguments are passed 
386          to load_fileobj(), see that function for more information.  In 
387          particular note that a content handler must be specified. 
388   
389          Example: 
390   
391          >>> from glue.ligolw import ligolw 
392          >>> xmldoc = load_filename("demo.xml", contenthandler = ligolw.LIGOLWContentHandler, verbose = True) 
393          """ 
394          if verbose: 
395                  sys.stderr.write("reading %s ...\n" % (("'%s'" % filename) if filename is not None else "stdin")) 
396          if filename is not None: 
397                  fileobj = open(filename, "rb") 
398          else: 
399                  fileobj = sys.stdin 
400          xmldoc, hexdigest = load_fileobj(fileobj, **kwargs) 
401          if verbose: 
402                  sys.stderr.write("md5sum: %s  %s\n" % (hexdigest, (filename if filename is not None else ""))) 
403          return xmldoc 
 404   
405   
def load_url(url, verbose = False, **kwargs):
        """
        Parse the contents of file at the given URL and return the contents
        as a LIGO Light Weight document tree.  Any source from which
        Python's urllib library can read data is acceptable.  stdin is
        parsed if url is None.  Helpful verbosity messages are printed to
        stderr if verbose is True.  All other keyword arguments are passed
        to load_fileobj(), see that function for more information.  In
        particular note that a content handler must be specified.

        The file or network connection opened to read the URL is closed
        before this function returns, whether or not parsing succeeds.
        stdin is never closed.

        Example:

        >>> from os import getcwd
        >>> from glue.ligolw import ligolw
        >>> xmldoc = load_url("file://localhost/%s/demo.xml" % getcwd(), contenthandler = ligolw.LIGOLWContentHandler, verbose = True)
        """
        if verbose:
                sys.stderr.write("reading %s ...\n" % (("'%s'" % url) if url is not None else "stdin"))
        if url is None:
                # stdin is not ours to close
                xmldoc, hexdigest = load_fileobj(sys.stdin, **kwargs)
        else:
                scheme, host, path = urllib.parse.urlparse(url)[:3]
                if scheme.lower() in ("", "file") and host.lower() in ("", "localhost"):
                        # local file:  open it directly
                        fileobj = open(path, "rb")
                else:
                        fileobj = urllib.request.urlopen(url)
                # we opened it, so we are responsible for releasing it,
                # including when the parser raises
                try:
                        xmldoc, hexdigest = load_fileobj(fileobj, **kwargs)
                finally:
                        fileobj.close()
        if verbose:
                sys.stderr.write("md5sum: %s  %s\n" % (hexdigest, (url if url is not None else "")))
        return xmldoc
 436   
437   
def write_fileobj(xmldoc, fileobj, gz = False, compresslevel = 3, **kwargs):
        """
        Write the LIGO Light Weight document tree rooted at xmldoc to the
        file object fileobj by invoking the .write() method of xmldoc;
        any additional keyword arguments are forwarded to that method.
        The file object need not be seekable.  If gz is True the output
        is gzip compressed on the fly using the compression level given
        by compresslevel (default 3).  Returns a string containing the
        hex digits of the MD5 digest of the output bytestream.

        Example:

        >>> import sys
        >>> from glue.ligolw import ligolw
        >>> xmldoc = load_filename("demo.xml", contenthandler = ligolw.LIGOLWContentHandler)
        >>> digest = write_fileobj(xmldoc, sys.stdout)  # doctest: +NORMALIZE_WHITESPACE
        <?xml version='1.0' encoding='utf-8'?>
        <!DOCTYPE LIGO_LW SYSTEM "http://ldas-sw.ligo.caltech.edu/doc/ligolwAPI/html/ligolw_dtd.txt">
        <LIGO_LW>
                <Table Name="demo:table">
                        <Column Type="lstring" Name="name"/>
                        <Column Type="real8" Name="value"/>
                        <Stream Delimiter="," Type="Local" Name="demo:table">
        "mass",0.5,"velocity",34
                        </Stream>
                </Table>
        </LIGO_LW>
        >>> digest
        '37044d979a79409b3d782da126636f53'
        """
        # layering, innermost first:  caller's file object, MD5 hashing
        # wrapper, optional gzip compressor, UTF-8 encoder.  the hashing
        # wrapper is created with closable = False.
        with MD5File(fileobj, closable = False) as hashed:
                digest = hashed.md5obj
                if gz:
                        sink = gzip.GzipFile(mode = "wb", fileobj = hashed, compresslevel = compresslevel)
                else:
                        sink = hashed
                with sink as stream:
                        with codecs.getwriter("utf_8")(stream) as encoder:
                                xmldoc.write(encoder, **kwargs)
                # the compressor and encoder have been exited by this
                # point, so all output has passed through the hash
                return digest.hexdigest()
 475   
476   
482   
484                  try: 
485                          self.tildefilename = self.filename + "~" 
486                          self.fobj = open(self.tildefilename, "wb") 
487                  except IOError: 
488                          self.tildefilename = None 
489                          self.fobj = open(self.filename, "wb") 
490                  return self.fobj 
 491   
492 -        def __exit__(self, exc_type, exc_val, exc_tb): 
 493                  self.fobj.close() 
494                  del self.fobj 
495   
496                   
497                   
498                   
499                   
500   
501                  if exc_type is None and self.tildefilename is not None: 
502                          os.rename(self.tildefilename, self.filename) 
503   
504                  return False 
  505   
506   
508          """ 
509          Writes the LIGO Light Weight document tree rooted at xmldoc to the 
510          file name filename.  If filename is None the file is written to 
511          stdout, otherwise it is written to the named file.  Friendly 
512          verbosity messages are printed while writing the file if verbose is 
513          True.  The output data is gzip compressed on the fly if gz is True. 
514          If with_mv is True and filename is not None the filename has a "~" 
515          appended to it and the file is written to that name then moved to 
516          the requested name once the write has completed successfully. 
517   
518          Internally, write_fileobj() is used to perform the write.  All 
519          additional keyword arguments are passed to write_fileobj(). 
520   
521          This function traps the signals in the trap_signals iterable during 
522          the write process (see SignalsTrap for the default signals), and it 
523          does this by temporarily installing its own signal handlers in 
524          place of the current handlers.  This is done to prevent Condor 
525          eviction during the write process.  When the file write is 
526          concluded the original signal handlers are restored.  Then, if 
527          signals were trapped during the write process, the signals are then 
528          resent to the current process in the order in which they were 
529          received.  The signal.signal() system call cannot be invoked from 
530          threads, and trap_signals must be set to None or an empty sequence 
531          if this function is used from a thread. 
532   
533          Example: 
534   
535          >>> write_filename(xmldoc, "demo.xml")  # doctest: +SKIP 
536          >>> write_filename(xmldoc, "demo.xml.gz", gz = True)    # doctest: +SKIP 
537          """ 
538          if verbose: 
539                  sys.stderr.write("writing %s ...\n" % (("'%s'" % filename) if filename is not None else "stdout")) 
540          with SignalsTrap(trap_signals): 
541                  if filename is None: 
542                          hexdigest = write_fileobj(xmldoc, sys.stdout, gz = gz, **kwargs) 
543                  else: 
544                          if not gz and filename.endswith(".gz"): 
545                                  warnings.warn("filename '%s' ends in '.gz' but file is not being gzip-compressed" % filename, UserWarning) 
546                          binary_open = lambda filename: open(filename, 'wb') 
547                          with (binary_open if not with_mv else tildefile)(filename) as fileobj: 
548                                  hexdigest = write_fileobj(xmldoc, fileobj, gz = gz, **kwargs) 
549          if verbose: 
550                  sys.stderr.write("md5sum: %s  %s\n" % (hexdigest, (filename if filename is not None else ""))) 
 551   
552   
554          """ 
555          Writes the LIGO Light Weight document tree rooted at xmldoc to the 
556          URL name url. 
557   
558          NOTE:  only URLs that point to local files can be written to at 
559          this time.  Internally, write_filename() is used to perform the 
560          write.  All additional keyword arguments are passed to that 
561          function.  The implementation might change in the future, 
562          especially if support for other types of URLs is ever added. 
563   
564          Example: 
565   
566          >>> write_url(xmldoc, "file:///data.xml")       # doctest: +SKIP 
567          >>> write_url(xmldoc, "file:///data.xml.gz", gz = True) # doctest: +SKIP 
568          """ 
569          return write_filename(xmldoc, local_path_from_url(url), **kwargs) 
 570