Exploiting XML data from the Human Metabolome Database

I am currently working on a project where I need to annotate exogenous chemicals measured in plasma using GC-MS/MS to the Human Metabolome Database (HMDB) and to the Toxin and Toxin Target Database (T3DB).

T3DB offers a series of downloadable resources that can be easily integrated into R as a CSV file after being parsed using bash. On the other hand, HMDB only offers XML sources that are too heavy to be parsed using the XML R package on my laptop.

In my case I was interested in obtaining the chemical class of each exogenous chemical as well as their sources and routes of exposure.

The following is the Python code I used to parse the XML file and create a CSV file with the information I required for my project.

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import csv
    from lxml import etree
    from functools import reduce
    
    # The input file, "hmdb.xml", is the original file, "hmdb_metabolites.xml",
    # but without the URL attribute in the hmdb tag (presumably the xmlns
    # declaration; with it, the un-prefixed tag names used below would likely
    # not match -- verify against the downloaded file).
    xml = 'hmdb.xml'
    fieldnames = [ 'accession', 'cas', 'name', 'kingdom', 'super_class', 'class', 'sub_class', 'source', 'route_exposure', 'synonyms' ]


    def _first_text(node, path, default='NA'):
        """Return the first result of XPath *path* under *node*, or *default* if none."""
        matches = node.xpath(path)
        return matches[0] if matches else default


    def _join_or_na(values):
        """Join *values* with '//' or return 'NA' when *values* is empty."""
        return '//'.join(values) if values else 'NA'


    def _child_with_term(descendants, label):
        """Return the first <descendant> child whose <term> text is *label*, else None."""
        for child in descendants.xpath('descendant'):
            if child.findtext('term') == label:
                return child
        return None


    def _disposition_fields(metabolite):
        """Return the (source, route_exposure) CSV values for one <metabolite>.

        Both values come from the ontology root whose <term> is 'Disposition'.
        Each is a '//'-joined string of term names, or 'NA' when the
        corresponding branch is absent or empty.
        """
        # Look up <term> and <descendants> on the same <root> element instead
        # of pairing two flat lists by index, which misaligns (or raises
        # IndexError) as soon as one root has no <descendants> child.
        disposition = None
        for root in metabolite.xpath('ontology/root'):
            if root.findtext('term') == 'Disposition':
                found = root.find('descendants')
                if found is not None:
                    # No break: if several roots match, the last one wins.
                    disposition = found
        if disposition is None:
            return 'NA', 'NA'

        source = []
        source_node = _child_with_term(disposition, 'Source')
        if source_node is not None:
            # Terms one level below 'Source'.
            source = source_node.xpath('descendants/descendant/term/text()')

        route_exposure = []
        route_node = _child_with_term(disposition, 'Route of exposure')
        if route_node is not None:
            # Terms two levels below 'Route of exposure' (one level deeper
            # than for 'Source').
            route_exposure = route_node.xpath(
                'descendants/descendant/descendants/descendant/term/text()' )

        return _join_or_na(source), _join_or_na(route_exposure)


    # Stream the file one <metabolite> element at a time instead of loading
    # the whole document.
    context = etree.iterparse( xml, tag = 'metabolite' )

    # newline='' is what the csv module requires for files it writes to; the
    # explicit encoding keeps non-ASCII names independent of the locale. The
    # with-block guarantees the file is flushed and closed even on error.
    with open( 'hmdb.csv', 'w', newline = '', encoding = 'utf-8' ) as csvfile:
        writer = csv.DictWriter( csvfile, fieldnames = fieldnames )
        writer.writeheader()

        for _event, elm in context:
            # accession and name have no fallback: a metabolite lacking
            # either one raises IndexError and stops the run.
            accession = elm.xpath( 'accession/text()' )[ 0 ]
            name = elm.xpath( 'name/text()' )[ 0 ]

            source, route_exposure = _disposition_fields( elm )

            writer.writerow( {
                'accession': accession,
                'cas': _first_text( elm, 'cas_registry_number/text()' ),
                'name': name,
                'kingdom': _first_text( elm, 'taxonomy/kingdom/text()' ),
                'super_class': _first_text( elm, 'taxonomy/super_class/text()' ),
                'class': _first_text( elm, 'taxonomy/class/text()' ),
                'sub_class': _first_text( elm, 'taxonomy/sub_class/text()' ),
                'source': source,
                'route_exposure': route_exposure,
                'synonyms': _join_or_na( elm.xpath( 'synonyms/synonym/text()' ) ),
            } )

            # Release memory: empty the element just written, then delete the
            # already-processed preceding siblings of it and of its ancestors.
            elm.clear()
            for ancestor in elm.xpath( 'ancestor-or-self::*' ):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[ 0 ]

    del context