Source code for emdbXMLTranslator.process_all_19_19

#!/usr/bin/env python
"""
process_all_19_19.py

Wrapper script for emdb_xml_translate.py that converts v1.9 files
in EMDB to v1.9. The only reason for doing this is that it puts the elements
in a canonical order, which makes comparison with the output of a
1.9 -> 2.0 -> 1.9 round-trip translation easier.

TODO:

Version history:
0.2, 2015-11-12, Ardan Patwardhan: Minor changes associated with moving file to new project structure
0.3, 2015-11-18, Ardan Patwardhan: Adding mechanism to exclude empty tags which should make comparison easier

                

Copyright [2014-2016] EMBL - European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the
"License"); you may not use this file except in
compliance with the License. You may obtain a copy of
the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
"""



__author__ = 'Ardan Patwardhan'
__email__ = 'ardan@ebi.ac.uk'
__date__ = '2014-11-30'

import glob
import os
import re
import logging
import subprocess
from optparse import OptionParser
from emdb_settings import emdb_settings

# Configure the root logger once at import time, taking the level and the
# message format from the project-wide emdb_settings object.
logging.basicConfig(level=emdb_settings.log_level, format=emdb_settings.log_format)

def process_all_19_19(filePathTemplate, outDir):
    """
    Read each v1.9 file and write it back using emdb_xml_translate.py to put
    it in a canonical form. Some post processing is also done to remove
    empty tags and to collapse white space inside selected text tags.

    For every input file the translator is run as a child process that
    writes to <outDir>/<basename>.tmp. On a zero exit code the temporary file
    is post-processed into <outDir>/<basename> and then deleted. Files for
    which the translator returns a non-zero exit code are counted and listed
    in the log at the end of the run.

    Note: the translator is invoked as './emdb_xml_translate.py', i.e.
    relative to the current working directory.

    Parameters:
    @param filePathTemplate: Glob pattern (shell-style wildcards, not a regular expression) that is passed to glob.glob to obtain the list of input files
    @param outDir: The canonical files will be written to this directory
    """
    commandListBase = ['python', './emdb_xml_translate.py', '-i', '1.9', '-o', '1.9', '-f']
    # Lines consisting solely of one of these empty (self-closing) tags are dropped.
    pat = re.compile(r'^[ ]*<(sliceSet|fscSet|maskSet|supplement|fitting|externalReferences|pdbEntryIdList|imageAcquisition|specimenPreparation)/>[ ]*$')
    # Text tags whose content has runs of white space reduced to one space.
    space_tags = 'title|articleTitle|software|resolutionMethod|algorithm|name|timeResolvedState|molWtMethod'
    # Groups used: 1 = text up to and including the start tag, 2 = start tag
    # name, 3 = content on this line, 6 = optional end tag plus trailing text,
    # 8 = end tag name (None when the element continues on the next line).
    pat_spaces_start = re.compile(r'^(.*<(%s)>)(((?!</(%s)>).)*)((</(%s)>)?(((?!<(/%s>)).)*))' % (space_tags, space_tags, space_tags, space_tags))
    # Groups used: 1 = content before the end tag, 4 = end tag plus trailing
    # text, 5 = end tag name.
    pat_spaces_end = re.compile(r'^(((?!</(%s>)).)*)(</(%s)>.*)' % (space_tags, space_tags))
    pat_spaces_sub = re.compile(r'\s+')
    pat_spaces_rep = ' '

    def write_cleaned(outfHd, prefix, tagContent, suffix):
        """
        Write one element on a single line with its content normalised to
        single spaces and stripped at both ends.
        """
        cleanedContent = pat_spaces_sub.sub(pat_spaces_rep, tagContent).strip()
        outfHd.write('%s%s%s\n' % (prefix, cleanedContent, suffix))

    def clean_spaces(infHd, outfHd):
        """
        Reduce spaces, new-lines and tabs to single spaces inside the tags
        listed in space_tags and drop lines holding only an empty tag.
        Note: will NOT handle nested tags!!

        Parameters:
        @param infHd: Input file handle
        @param outfHd: Output file handle
        """
        infHd.seek(0)
        outfHd.seek(0)
        startTagFound = False
        prefix = startTag = tagContent = None
        # Raw lines of the element currently being accumulated. They are
        # written back verbatim if the element is never closed.
        pendingLines = []
        for line in infHd:
            if not startTagFound:
                m = pat_spaces_start.match(line)
                if m is None:
                    # Ordinary line: copy it unless it is just an empty tag.
                    if pat.match(line) is None:
                        outfHd.write(line)
                    continue
                prefix, startTag, tagContent = m.group(1, 2, 3)
                if m.group(8) == startTag:
                    # Start and end tag are on the same line.
                    write_cleaned(outfHd, prefix, tagContent, m.group(6))
                else:
                    # Element continues on following lines.
                    startTagFound = True
                    pendingLines = [line]
                continue

            pendingLines.append(line)
            m = pat_spaces_end.match(line)
            if m is not None and m.group(5) == startTag:
                tagContent += ' ' + m.group(1)
                write_cleaned(outfHd, prefix, tagContent, m.group(4))
                startTagFound = False
                pendingLines = []
            else:
                # No end tag, or an end tag for a different element: keep the
                # whole line exactly once and continue accumulating.
                tagContent += ' ' + line

        if startTagFound:
            # The matching end tag never appeared. Write the accumulated lines
            # back unchanged instead of silently dropping them.
            logging.warning('Unterminated <%s> element; writing %d line(s) unchanged',
                            startTag, len(pendingLines))
            outfHd.writelines(pendingLines)

    emdbFiles = glob.glob(filePathTemplate)
    numSuccess = 0
    errorList = []
    for f in emdbFiles:
        inf = os.path.basename(f)
        outf = os.path.join(outDir, inf)
        tmpf = os.path.join(outDir, inf + '.tmp')
        logging.info("Input file: %s, output file: %s", f, outf)
        # The translator takes the output file after '-f', then the input file.
        commandList = commandListBase + [tmpf, f]
        logging.info('Executing: %s', ' '.join(commandList))
        exitCode = subprocess.call(commandList)
        if exitCode != 0:
            errorList.append(inf)
            continue
        # Context managers close both handles even if post-processing fails.
        with open(tmpf, 'r') as tmpfHd, open(outf, 'w') as outfHd:
            clean_spaces(tmpfHd, outfHd)
        os.remove(tmpf)
        numSuccess += 1

    numErrors = len(errorList)
    logging.warning('%d files successfully processed!', numSuccess)
    if numErrors > 0:
        logging.warning('%d errors!', numErrors)
        logging.warning('List of entries that were not translated')
        for entry in errorList:
            logging.warning(entry)
def main():
    """
    Convert all EMDB XML 1.9 header files to canonical XML 1.9 files.
    This makes comparison with output from round-trip conversion
    (1.9 -> 2.0 -> 1.9) easier.

    The glob template and the output directory are read from the -t and -o
    command line options. Their defaults come from
    emdb_settings.archiveHeaderTemplate and emdb_settings.emdb19To19Dir.
    """
    defaultFilePathTemplate = emdb_settings.archiveHeaderTemplate
    defaultOutDir = emdb_settings.emdb19To19Dir

    # Handle command line options
    usage = """
            python process_all_19_19.py [options]
            Convert all EMDB XML 1.9 header files to canonical XML 1.9 files.
            This makes comparison with output from round-trip conversion (1.9 -> 2.0 -> 1.9) easier.

            Examples:
            python process_all_19_19.py

            Typical run:
            python process_all_19_19.py -t '/data/emstaging/EMD-*/header/emd-*.xml' -o '/data/emdb19_to_19'
            /data/emstaging/EMD-*/header/emd-*.xml is the template used to glob all input 1.9 header files
            /data/emdb19_to_19 is the output directory with the canonical EMDB XML 1.9 files
            """
    version = "0.3"
    parser = OptionParser(usage=usage, version=version)
    parser.add_option("-t", "--template", action="store", type="string", metavar="TEMPLATE", dest="filePathTemplate", default=defaultFilePathTemplate, help="Template used to glob all input 1.9 header files [default: %default]")
    parser.add_option("-o", "--out-dir", action="store", type="string", metavar="DIR", dest="outDir", default=defaultOutDir, help="Directory for canonical EMDB 1.9 files [default: %default]")
    (options, args) = parser.parse_args()

    if args:
        # This script takes no positional arguments. The usual cause is an
        # unquoted wildcard template that the shell expanded, which would
        # otherwise process only the first matching file without any notice.
        shown = ' '.join(args[:5])
        if len(args) > 5:
            shown += ' ...'
        logging.warning('Ignoring %d unexpected positional argument(s): %s', len(args), shown)
        logging.warning('If the template contains wildcards, quote it so that the shell does not expand it')

    process_all_19_19(options.filePathTemplate, options.outDir)
# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()