# Source code for emdros2laf.settings

import os
import sys
import collections
import glob

import configparser
import argparse

# Name and version of the tool; both are shown in the start-up banner and
# substituted into the configuration templates of the Settings class.
NAME = "LAF-Fabric"
VERSION = "4.8.3"
# Location of the API reference, shown in the start-up banner.
APIREF = "http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html"
# Directory (below the user's home directory) that holds the global
# configuration file and, unless configured otherwise, the data.
DEFAULT_DATA_DIR = "laf-fabric-data"
# File name of the main configuration file.
MAIN_CFG = "laf-fabric.cfg"
# The conversion parts that can be selected with --parts.
ALL_PARTS = [
    "monad",
    "section",
    "lingo",
]

class Settings:
    ''' Stores configuration information from the configuration files and the command line.

        The class-level tables below hold built-in defaults and string templates.
        On construction the templates are filled in with values taken from
        the optional main configuration file, the meta configuration file of the
        selected source and the command line arguments.

        After construction an instance offers: ``args``, ``given_parts``, ``env``,
        ``meta``, ``laf_templates``, ``laf``, ``xml``, ``parts``, ``annotation_kind``,
        ``annotation_regions``, ``annotation_skip_object``, ``annotation_label``,
        ``type_mapping``, ``type_boolean`` and ``file_types``.
    '''
    # Base values for template substitution. __init__ works on a per-instance
    # copy, to which it adds data_dir, home_dir, script_dir and later all of env.
    _myconfig = {
        'my_name':              NAME,
        'version':              VERSION,
    }
    # Templates for names and paths; filled in with the selected source and
    # the values of _myconfig, the result is stored in self.env.
    _env_def = {
        'my_name':              '{my_name}',
        'version':              '{version}',
        'template_dir':         '{script_dir}/templates',
        'xml_dir':              '{script_dir}/xml',
        'source':               '{source}',
        'meta_info':            '{data_dir}/{source}/config/main.cfg',
        'feature_info':         '{data_dir}/{source}/config/ObjectsFeaturesValues.txt',
        'feature_plain_info':   '{data_dir}/{source}/config/ObjectsFeatures.csv',
        'object_info':          '{data_dir}/{source}/config/Objects.txt',
        'raw_emdros_dir':       '{data_dir}/{source}/raw',
        'source_data':          '{data_dir}/{source}/mql/{source}',
        'query_dst_dir':        '{data_dir}/{source}/mql',
        'result_dir':           '{data_dir}/{source}/laf',
        'annot_hdr':            '{data_dir}/{source}/laf/{source}',
        'primary_text':         '{data_dir}/{source}/laf/{source}.txt',
        'primary_hdr_txt':      '{data_dir}/{source}/laf/{source}.txt.hdr',
        'resource_hdr_txt':     '{data_dir}/{source}/laf/{source}.hdr',
        'monad_index':          '{data_dir}/{source}/laf/{source}.lst',
        'decl_dst_dir':         '{data_dir}/{source}/decl',
    }
    # Default values for the metadata; __init__ overlays the [meta] section
    # of the configuration files on a per-instance copy.
    _metaconfig = {
        'my_name':              NAME,
        'version':              VERSION,
        'ISOcatprefix':         'http://www.isocat.org/datcat/DC-',
        'DANSpidprefix':        'http://persistent-identifier/?identifier=',
    }
    # Templates for the metadata; filled in with the selected source and the
    # values of _metaconfig, the result is stored in self.meta.
    _meta_def = {
        'my_name':              '{my_name}',
        'version':              '{version}',
        'source':               '{source}',
        'ISOcatprefix':         '{ISOcatprefix}',
        'DANSpidprefix':        '{DANSpidprefix}',
        'danspid_act':          '{DANSpidprefix}{danspid_urn}',
        'publicationdate':      '{publicationdate}',
        'danspid_urn':          '{danspid_urn}',
        'annot_method':         'conversion script {my_name} {version}',
        'annot_resp':           '{annot_resp}',
        'primary':              '{primary}',
        'trailer':              '{trailer}',
        'verse_newline':        '{verse_newline}',
        'annot_space_def':      '{annot_space_def}',
        'prim_creator':         '{prim_creator}',
        'res_creator':          '{res_creator}',
        'prim_title':           '{prim_title}',
        'res_title':            '{res_title}',
        'prim_source_title':    '{prim_source_title}',
        'prim_source_author':   '{prim_source_author}',
        'prim_source_publisher':'{prim_source_publisher}',
        'prim_source_date':     '{prim_source_date}',
        'prim_source_year':     '{prim_source_year}',
        'prim_source_place':    '{prim_source_place}',
        'prim_languages':       '{prim_languages}',
        'res_funder':           '{res_funder}',
        'res_respons_link':     '{res_respons_link}',
        'res_respons_name':     '{res_respons_name}',
        'res_distributor':      '{res_distributor}',
        'res_institute':        '{res_institute}',
        'res_email':            '{res_email}',
        'res_project_desc':     '{res_project_desc}',
        'res_sampling_desc':    '{res_sampling_desc}',
        'res_transduction':     '{res_transduction}',
        'res_correction':       '{res_correction}',
        'res_segmentation':     '{res_segmentation}',
    }
    # Template name -> (template file name, boolean).
    # NOTE(review): the meaning of the boolean is not visible in this class;
    # confirm against the code that loads the templates.
    _laf_templates = {
        'feature_decl':         ('feature_decl.xml', False),
        'feature':              ('feature.xml', True),
        'feature_local':        ('feature_local.xml', True),
        'feature_val':          ('feature_val.xml', True),
        'feature_val1':         ('feature_sym.xml', True),
        'feature_basic':        ('feature_basic.xml', True),
        'annotation_decl':      ('annotation_decl.xml', True),
        'annotation_item':      ('annotation_item.xml', True),
        'annotation_hdr':       ('annotation_header.xml', False),
        'annotation_label':     ('annotation_label.xml', True),
        'annotation_ftr':       ('annotation_footer.xml', False),
        'annotation_elem':      ('annotation_element.xml', False),
        'feature_elem':         ('feature_element.xml', False),
        'node_elem':            ('node_element.xml', False),
        'edge_elem':            ('edge_element.xml', False),
        'edgenode_elem':        ('edgenode_element.xml', False),
        'region_hdr':           ('region_header.xml', False),
        'region_elem':          ('region_element.xml', False),
        'resource_hdr':         ('resource_header.xml', False),
        'primary_hdr':          ('primary_header.xml', False),
        'dependency':           ('dependency.xml', True),
    }
    # All tables from here on are filled in with the values of _myconfig
    # (which by then includes env) and stored under the same name without
    # the leading underscore.
    _laf = {
        'resource_header':      'requires',
        'annotation_header':    'dependsOn',
    }
    # The doubled braces survive the filling in as the literal placeholders
    # {schema} and {xmlfile}.
    _xml = {
        'xmllint_cmd':          'xmllint --noout --nonet --schema {{schema}} {{xmlfile}}',
        'xmllint_cat_env_var':  'XML_CATALOG_FILES',
        'xmllint_cat_env_val':  '{xml_dir}/xmllint_cat.xml',
        'xlink_src':            '{xml_dir}/xlink.xsd',
        'xlink_dst':            '{decl_dst_dir}/xlink.xsd',
        'xml_src':              '{xml_dir}/xml.xsd',
        'xml_dst':              '{decl_dst_dir}/xml.xsd',
        'xml_isofs_src':        '{xml_dir}/xml-isofs.xsd',
        'xml_isofs_dst':        '{decl_dst_dir}/xml-isofs.xsd',
        'graf_annot_src':       '{xml_dir}/graf-standoff.xsd',
        'graf_annot_dst':       '{decl_dst_dir}/graf-standoff.xsd',
        'graf_resource_src':    '{xml_dir}/graf-resource.xsd',
        'graf_resource_dst':    '{decl_dst_dir}/graf-resource.xsd',
        'graf_document_src':    '{xml_dir}/graf-document.xsd',
        'graf_document_dst':    '{decl_dst_dir}/graf-document.xsd',
        'tei_fs_src':           '{xml_dir}/isofs_dcr.xsd',
        'tei_fs_dst':           '{decl_dst_dir}/isofs_dcr.xsd',
        'dcr_src':              '{xml_dir}/dcr.xsd',
        'dcr_dst':              '{decl_dst_dir}/dcr.xsd',
    }
    # Settings per conversion part (the keys correspond to ALL_PARTS).
    _parts = {
        'monad': {
            'raw_text':         '{raw_emdros_dir}/monad.txt',
            'object_type':      'word',
            'make_index':       '',
            'do_primary':       '',
            'separate_node_file':'',
        },
        'section': {
            'raw_text':         '{raw_emdros_dir}/section.txt',
            'use_index':        '',
            'find_embedding':   '',
            'hierarchy':        'book chapter verse half_verse',
        },
        'lingo': {
            'raw_text':         '{raw_emdros_dir}/lingo.txt',
            'separate_node_file':'',
            'no_monad_nodes':   '',
        },
    }
    _annotation_kind = {
        'monad':                'minimal objects&text&',
        'section':              'section objects&text&',
        'lingo':                'linguistic objects&text&',
        'reference':            'linguistic relationships&fsDecl&decl/ft.xml',
        'ft':                   'linguistic features&fsDecl&decl/ft.xml',
        'sft':                  'sectional features&fsDecl&decl/sft.xml',
        'db':                   'database features&fsDecl&decl/db.xml',
    }
    _annotation_regions = {
        'name':                 'region',
        'word':                 'w',
        'punct':                'p',
        'section':              's',
    }
    _annotation_skip_object = {
        'lingo':                'word',
    }
    # Public as is; not subject to template substitution.
    annotation_skip = set(('self',))
    _annotation_label = {
        'section_label':        'sft',
        'lingo_label':          'ft',
        'monad_label':          'ft',
        'db_label':             'db',
    }
    _type_mapping = {
        'string':               'string',
        'ascii':                'string',
        'integer':              'numeric&value="0" max="100000000"',
        'enum':                 'symbol',
        'boolean':              'binary',
        'reference':            'string',
    }
    # Maps the one-letter boolean codes to their spelled-out values
    # ('t' is true, 'f' is false).
    _type_boolean = {
        't':                    'true',
        'f':                    'false',
    }
    # Public as is; not subject to template substitution.
    laf_switches = set(('comment_local_deps',))
    # Ordered, because the order of the entries is kept in self.file_types.
    _file_types = collections.OrderedDict((
        ('f.hdr',               '.hdr&xml'),
        ('f.primary.hdr',       '.text.hdr&xml'),
        ('f.primary',           '.txt&text'),
        ('f_monad.region',      '_regions.xml&xml'),
        ('f_monad',             '_monads.xml&xml&db&f_monad.region'),
        ('f_lingo',             '_lingo.xml&xml&db&f_monad'),
        ('f_section',           '_sections.xml&xml&db sft'),
        ('f_monad.*',           '_monads.{{subpart}}.xml&xml&ft&f_monad'),
        ('f_lingo.*',           '_lingo.{{subpart}}.xml&xml&ft&f_lingo'),
    ))

    def flag(self, name):
        ''' Return the value of the command line option ``name``.

            ``name`` is the attribute name in the parsed arguments (e.g. ``raw``,
            ``validate``, ``fdecls_only``, ``limit``).
            Raises ``AttributeError`` if there is no such option.
        '''
        return getattr(self.args, name)

    @staticmethod
    def _expand_dir(path, cw_dir, home_dir):
        ''' Expand a leading ``.`` or ``~`` path component in ``path``.

            A leading ``.`` is replaced by ``cw_dir``, a leading ``~`` by ``home_dir``.
            Only a complete first component is expanded, so ``../x``, ``.hidden``
            and ``~user`` are returned unchanged.
            ``None`` (directory not configured) is passed through as ``None``.
        '''
        if path is None:
            return None
        for (prefix, base) in (('.', cw_dir), ('~', home_dir)):
            if path == prefix or path.startswith((prefix + '/', prefix + os.sep)):
                return base + path[1:]
        return path

    @staticmethod
    def _select_parts(parts, fdecls_only):
        ''' Translate the values given to ``--parts`` into an ordered selection.

            The values are processed from left to right: ``all`` selects every part
            of ALL_PARTS, ``none`` deselects them all, any other value selects that part.
            If ``fdecls_only`` is set, every value acts as ``none``.
            ``parts`` may be ``None`` (option not given), which yields an empty selection.

            Returns an ``OrderedDict`` mapping each selected part to ``True``.
        '''
        selected = collections.OrderedDict()
        for arg in parts or ():
            if arg == 'none' or fdecls_only:
                for part in ALL_PARTS:
                    selected.pop(part, None)
            elif arg == 'all':
                for part in ALL_PARTS:
                    selected[part] = True
            else:
                selected[arg] = True
        return selected

    def __init__(self):
        ''' Read the configuration files and the command line and fill in all templates.

            Prints a banner, reads the main configuration file (if present), parses
            ``sys.argv`` and reads the meta configuration file of the selected source.
            Invalid or missing command line arguments make ``argparse`` print a usage
            message and exit. A missing meta configuration file, or a template
            placeholder without a value, raises the underlying exception
            (``OSError`` resp. ``KeyError``).
        '''
        print('This is {} {}\n{}'.format(NAME, VERSION, APIREF))
        strings = configparser.ConfigParser(inline_comment_prefixes=('#',))
        script_dir = os.path.dirname(os.path.abspath(__file__))
        home_dir = os.path.expanduser('~')
        cw_dir = os.getcwd()

        # Work on copies, so that the class-level defaults stay pristine and
        # a second instance does not trip over keys added by the first one.
        self._myconfig = dict(self._myconfig)
        self._metaconfig = dict(self._metaconfig)

        global_config_dir = "{}/{}".format(home_dir, DEFAULT_DATA_DIR)
        global_config_path = "{}/{}".format(global_config_dir, MAIN_CFG)
        local_config_path = MAIN_CFG
        default_data_dir = global_config_dir

        # There is no break: if both files exist, the global one is used.
        # NOTE(review): confirm that the global file should win over the local one.
        the_config_path = None
        for config_path in (local_config_path, global_config_path):
            if os.path.exists(config_path):
                the_config_path = config_path

        config_data_dir = None
        config_laf_dir = None
        config_output_dir = None
        if the_config_path is not None:
            with open(the_config_path, "r", encoding="utf-8") as f:
                strings.read_file(f)
            if 'locations' in strings:
                locations = strings['locations']
                config_data_dir = locations.get('data_dir')
                config_laf_dir = locations.get('laf_dir')
                config_output_dir = locations.get('output_dir')

        the_data_dir = self._expand_dir(config_data_dir or default_data_dir, cw_dir, home_dir)
        # NOTE(review): laf_dir and output_dir are read and expanded, but not
        # used any further in this method; output_dir is None when not configured.
        the_laf_dir = self._expand_dir(config_laf_dir or config_data_dir or default_data_dir, cw_dir, home_dir)
        the_output_dir = self._expand_dir(config_output_dir, cw_dir, home_dir)

        # Every subdirectory of the data directory counts as an available source.
        sources = [os.path.basename(x) for x in glob.glob("{}/*".format(the_data_dir)) if os.path.isdir(x)]

        self._myconfig['data_dir'] = the_data_dir
        self._myconfig['home_dir'] = home_dir
        self._myconfig['script_dir'] = script_dir

        argsparser = argparse.ArgumentParser(description = 'Conversion of Emdros to LAF')
        argsparser.add_argument(
            '--source',
            nargs = 1,
            type = str,
            choices = sources,
            metavar = 'Source',
            required = True,    # all paths below depend on the source
            help = 'Source selection for conversion',
        )
        argsparser.add_argument(
            '--parts',
            nargs = '*',
            type = str,
            choices = ALL_PARTS + ['all', 'none'],
            metavar = 'Kind',
            help = 'task in conversion process',
        )
        argsparser.add_argument(
            "--raw",
            action = "store_true",
            help = "retrieve raw data from Emdros",
        )
        argsparser.add_argument(
            "--validate",
            action = "store_true",
            help = "validate genrated xml files against their schemas",
        )
        argsparser.add_argument(
            "--fdecls-only",
            dest = 'fdecls_only',
            default = False,
            action = "store_true",
            help = "only generate feature declaration file, nothing else",
        )
        argsparser.add_argument(
            "--limit",
            dest = 'limit',
            type = int,
            metavar = 'Limit',
            help = "limit to the first N monads",
        )
        self.args = argsparser.parse_args()
        self.given_parts = self._select_parts(self.args.parts, self.args.fdecls_only)
        source = self.args.source[0]

        self.env = dict((e, v.format(source=source, **self._myconfig)) for (e, v) in self._env_def.items())

        # The meta configuration is read into the same parser, on top of
        # whatever the main configuration file contributed.
        with open(self.env['meta_info'], "r", encoding="utf-8") as f:
            strings.read_file(f)
        self._metaconfig.update(strings['meta'] if 'meta' in strings else {})
        # A literal backslash-n in a configured value stands for a newline.
        self.meta = dict(
            (e, v.format(source=source, **self._metaconfig).replace('\\n', '\n'))
            for (e, v) in self._meta_def.items()
        )

        # From here on the env values are available as template values as well.
        self._myconfig.update(self.env)

        def fill(table):
            # Fill in every template of table with the values of _myconfig.
            return dict((e, v.format(**self._myconfig)) for (e, v) in table.items())

        self.laf_templates = dict(
            (e, (v[0].format(**self._myconfig), v[1])) for (e, v) in self._laf_templates.items()
        )
        self.laf = fill(self._laf)
        self.xml = fill(self._xml)
        self.parts = dict((p, fill(part_def)) for (p, part_def) in self._parts.items())
        self.annotation_kind = fill(self._annotation_kind)
        self.annotation_regions = fill(self._annotation_regions)
        self.annotation_skip_object = fill(self._annotation_skip_object)
        self.annotation_label = fill(self._annotation_label)
        self.type_mapping = fill(self._type_mapping)
        self.type_boolean = fill(self._type_boolean)
        self.file_types = collections.OrderedDict(
            (e, v.format(**self._myconfig)) for (e, v) in self._file_types.items()
        )