# Source code for emdbXMLTranslator.emdb_20_to_json

#!/usr/bin/env python
"""
emdb_20_to_json

Reads in an EMDB header file following the 2.0 schema and outputs summary information as JSON.
This is a stub example to show how to use emdb_da.py to read the header file.

TODO:

Version history:

                 
                

Copyright [2014-2016] EMBL - European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the
"License"); you may not use this file except in
compliance with the License. You may obtain a copy of
the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
"""


__author__ = 'Ardan Patwardhan'
__email__ = 'ardan@ebi.ac.uk'
__date__ = '2016-01-05'

import sys
import logging
import traceback
import json
from datetime import date
from optparse import OptionParser
import emdb_da
from emdb_settings import emdb_settings



class EMDBXML20JSONTranslator:
    """
    Class for translating summary info from EMDB 2.0 to a JSON file
    """

    def __init__(self):
        """
        Initialise the warning level to its default (1) and configure logging
        from the level and format held in emdb_settings.
        """
        self.warningLevel = 1  # 0 = min, 3 = max
        logging.basicConfig(level=emdb_settings.log_level, format=emdb_settings.log_format)

    def setWarningLevel(self, level):
        """
        Set the level of logging warnings. 0 = no warnings, 3 = max warnings, 1 = default
        Values outside the range are clamped to 0 or 3.

        Parameters
        @param level: warning level 0 -> 3
        """
        if level <= 0:
            self.warningLevel = 0
        elif level >= 3:
            self.warningLevel = 3
        else:
            self.warningLevel = level

    def warn(self, level, msg):
        """
        Log a warning message but take into account the warningLevel

        Parameters:
        @param level: only messages with level <= warningLevel are logged
        @param msg: warning message
        """
        if level <= self.warningLevel:
            logging.warning(msg)

    def _writeJson(self, jsonOut, outputFile):
        """
        Serialise jsonOut as JSON to outputFile, or to standard output if no file is given

        Parameters:
        @param jsonOut: python dictionary to be serialised
        @param outputFile: path of the file to write; a false value (None, '') selects sys.stdout
        """
        if not outputFile:
            json.dump(jsonOut, sys.stdout)
            return
        # The with-block closes the file even if json.dump raises
        with open(outputFile, 'w') as fd:
            json.dump(jsonOut, fd)

    def translate(self, inputFile, outputFile):
        """
        Translate EMDB 2.0 to a JSON. Summary info only

        Parameters
        @param inputFile: input file in EMDB 2.0 XML
        @param outputFile: output JSON file; if this is a false value (None, '') the JSON goes to standard output
        """
        def checkSet(getX, key, jsonDict, transform=None):
            """
            Set jsonDict[key] only if getX does not return None

            Parameters:
            @param getX: getter function that must return value
            @param key: key in dictionary to be set jsonDict[key]
            @param jsonDict: JSON dictionary whose key will be set, jsonDict[key]
            @param transform: Apply transform(x) before storing the value. If the transform
                              raises, the failure is logged at warning level 3 and the key is not set
            """
            x = getX()
            if x is None:
                return
            if transform is not None:
                raw = x
                try:
                    x = transform(raw)
                except Exception:
                    # Best effort: skip this key rather than abort the whole translation
                    self.warn(3, "function checkSet: Transform function did not work: %s(%s)" % (transform, raw))
                    self.warn(3, traceback.format_exc())
                    return
            jsonDict[key] = x

        def getAuthors(authListIn, simple=False):
            """
            Get authors from 2.0 and return comma separated string

            Parameters
            @param authListIn: list object of 2.0 author objects
            @param simple: boolean - True means that the authors in 2.0 are simple strings, otherwise they are journal authors
            @return: author names joined with ', ' (empty string if there are no authors)
            """
            if simple:
                authList = list(authListIn)
            else:
                authList = [authIn.get_valueOf_() for authIn in authListIn]
            # join() on an empty list already yields ''
            return ', '.join(authList)

        def copyCitation(refIn):
            """
            Return JSON object containing citation - more complex example...

            Parameters:
            @param refIn: Input citation in 2.0 schema
            @return: python dictionary with reference info
            """
            jrnlIn = refIn.get_citation_type()
            refOut = {}
            if jrnlIn.original_tagname_ == 'journal_citation':
                refOut['citationType'] = 'Journal'
                refOut['authors'] = getAuthors(jrnlIn.get_author())
                checkSet(jrnlIn.get_title, 'title', refOut)
                checkSet(jrnlIn.get_journal, 'journal', refOut)
                checkSet(jrnlIn.get_published, 'published', refOut)
                # This is a fix because of bad data - emd-1648.xml has an empty volume tag!
                vol = jrnlIn.get_volume()
                if vol is not None and len(vol) > 0:
                    refOut['volume'] = vol
                checkSet(jrnlIn.get_first_page, 'firstPage', refOut)
                checkSet(jrnlIn.get_last_page, 'lastPage', refOut)
                checkSet(jrnlIn.get_year, 'year', refOut)
            else:
                refOut['citationType'] = 'Non-journal'
                nonJrnlIn = jrnlIn
                refOut['authors'] = getAuthors(nonJrnlIn.get_author())
                checkSet(nonJrnlIn.get_editor, 'editor', refOut)
                checkSet(nonJrnlIn.get_book_chapter_title, 'chapterTitle', refOut)
                checkSet(nonJrnlIn.get_book_title, 'title', refOut)
                checkSet(nonJrnlIn.get_thesis_title, 'thesisTitle', refOut)
                checkSet(nonJrnlIn.get_published, 'published', refOut)
                checkSet(nonJrnlIn.get_publisher, 'publisher', refOut)
                checkSet(nonJrnlIn.get_publication_location, 'location', refOut)
                checkSet(nonJrnlIn.get_volume, 'volume', refOut)
                checkSet(nonJrnlIn.get_first_page, 'firstPage', refOut)
                checkSet(nonJrnlIn.get_last_page, 'lastPage', refOut)
                checkSet(nonJrnlIn.get_year, 'year', refOut)
            return refOut

        xmlIn = emdb_da.parse(inputFile, silence=True)
        jsonOut = {}

        admIn = xmlIn.get_admin()
        datesIn = admIn.get_key_dates()
        statusIn = admIn.get_current_status()
        authListIn = admIn.get_authors_list()
        xRefIn = xmlIn.get_crossreferences()
        citeListIn = xRefIn.get_citation_list()
        citeIn = citeListIn.get_primary_citation()

        checkSet(xmlIn.get_emdb_id, 'emdbId', jsonOut)
        checkSet(admIn.get_title, 'title', jsonOut)
        checkSet(statusIn.get_code().get_valueOf_, 'status', jsonOut)
        jsonOut['authors'] = getAuthors(authListIn.get_author(), simple=True)

        # dates - converted with str() before being stored
        checkSet(datesIn.get_deposition, 'depositionDate', jsonOut, str)
        checkSet(datesIn.get_header_release, 'headerReleaseDate', jsonOut, str)
        checkSet(datesIn.get_map_release, 'mapReleaseDate', jsonOut, str)

        # primary citation
        jsonOut['citation'] = copyCitation(citeIn)

        self._writeJson(jsonOut, outputFile)
def main():
    """
    Command line entry point: parse the options, then translate the EMDB 2.0 XML file
    given as first positional argument into JSON.

    Exits through sys.exit with a message when no input file is supplied.
    """
    # Text shown by optparse in the help output
    usageText = """
            emdb_20_to_json.py [options] inputFile
            Convert EMDB XML

            Examples:
            python emdb_20_to_json.py inputFile

            Typical run:
            python emdb_20_to_json.py -f out.json in.xml
            in.xml is assumed to be a EMDB 2.0 XML file
            out.json is a JSON file created with the summary information
            """
    optParser = OptionParser(usage=usageText, version="0.1")
    optParser.add_option(
        "-f", "--out-file",
        action="store", type="string", metavar="FILE", dest="outputFile",
        help="Write output to FILE")
    optParser.add_option(
        "-w", "--warning-level",
        action="store", type="int", dest="warningLevel", default=1,
        help="Level of warning output. 0 is none, 3 is max, default = 1")
    opts, positional = optParser.parse_args()

    # Nothing can be done without an input file
    if not positional:
        sys.exit("No input file specified!")
    xmlPath = positional[0]

    # Run the conversion with the requested verbosity
    converter = EMDBXML20JSONTranslator()
    converter.setWarningLevel(opts.warningLevel)
    converter.translate(xmlPath, opts.outputFile)
# Run the command line converter only when this file is executed as a script, not when it is imported.
if __name__ == "__main__": main()