Source code for emdros2laf.laf

import sys
import re
import collections

from .mylib import *

[docs]class Laf:
    ''' Knows the LAF data format.

    All LAF knowledge is stored in template files together with sections in the main configuration file.
    The LAF class finds those templates, sets up the result files, and fills them.

    Note:
        Templates

        *template[key] = text*
            where key is an entry in the *laf_templates* section of the main config file.

    Note:
        Files and Filetypes

        *annotation_files[part][subpart] = (ftype, medium, location, requires, annotations, is_region)*

        The order is important, so we generate a list too:

        *file_order*
            list of ftypes according *file_types section* in main config file, expanded, in the order encountered

        where

        *ftype*
            comes from the file_types section in the main config file.
            It has the shape of LAF file identifier, but with wild cards.

            *f.xxxxxx*
                not an annotation file, but primary data or a header file
            *f_part.subpart*
                annotation file for part, subpart

        *for each ftype*    
            there is an infostring consisting of fields

            *location*
                file name of corresponding file, modulo a common prefix
            *medium*
                file type (text or xml)
            *annotations*
                space separated annotation labels occurring in this part, subpart
            *requires*
                space separated list of ftypes of required files

        *is_region*
            reveals whether the file only contains regions or not.
            A pure region file needs a different template.

    Note:
        Header Generation

        All header files are generated here: 
        * the feature declaration file
        * the header for the resource as a whole
        * the header for the primary data file

        The headers of the annotation files are included in those files.
        Those headers contain statistics: counts of the number of annotations with a given label.
        We know those number only after generation because these statistics will be collected during
        further processing.

        When the annotation files are generated, we use placeholders for the statistics.
        In a post-generation stage we read/write the annotation files and replace the place holders by
        the true numbers. The files are written in situ. So we must take care that the placeholders
        contain enough space around them.

    Note:
        Processing

        This class provides methods to initialize and finalize the generation of primary data files
        and annotation files. There are methods to open/close all files that are relevant to the
        part that is being processed. (Part being: 'monad', 'section', 'lingo').

    Note:
        Statistics

        Counts are collected in a *stats* dictionary.

        * stats[statistic_name] = statistic_value*

    '''
    settings = None
    et = None
    annotation_files = collections.defaultdict(lambda: {})
    file_order = []
    file_handles = {}
    primary_handle = None
    template = {}
    stats = collections.defaultdict(lambda: 0)
    gstats = collections.defaultdict(lambda: 0)

    def __init__(self, settings, et, val):
        self.settings = settings
        self.et = et
        self.val = val
# make sure the directories in which files will be created, exist
        run('mkdir -p ' + settings.env['result_dir'])
        run('mkdir -p ' + settings.env['decl_dst_dir'])
# parse the config info about the LAF files to be created and create a to do list from this
# The todo list is a list of entries having the following information
#   part, subpart, filetype specification, annotation information
        todo = []
        for (ftype_spec, info_spec) in settings.file_types.items():
            info_x = info_spec.format(
                referencefeatures = " ".join(et.list_ref_noskip()),
                myobjects = '{myobjects}',
                subpart = '{subpart}',
            )
            if ftype_spec.startswith('f_'):
                partspec = ftype_spec.replace('f_', '', 1)
                components = partspec.split('.')
                part = components[0]
                if len(components) > 1:
                    subpart = components[1]
                    if subpart != '*':
                        todo += [(part, subpart, ftype_spec, info_x.format(
                            myobjects = " ".join(et.object_list_part(part)),
                            subpart = components[1],
                        ))]
                    else:
                        for subpart in et.subpart_list(part):
                            if subpart == '': continue
                            ftype_spec_x = ftype_spec.replace('*', subpart)
                            todo += [(part, subpart, ftype_spec_x, info_x.format(
                                myobjects = " ".join(et.object_list(part, subpart)),
                                subpart = subpart,
                            ))]
                else:
                    todo += [(part, '', ftype_spec, info_x.format(
                        myobjects = " ".join(et.object_list_part(part)),
                    ))]
            else:
                partspec = ftype_spec.replace('f.', '', 1)
                todo += [('', partspec, ftype_spec, info_x)]
        regions = settings.annotation_regions
# now the todo list is interpreted further, especially the info string.
# The result is stored in a dictionary keyed by part, then subpart.
        for (part, subpart, ftype, infostring) in todo:
            self.file_order += [ftype]
            (location, medium, annotstring, requirestring) = fillup(4, '', infostring.split('&'))
            annotations = annotstring.split(" ") if annotstring != '' else []
            requires = requirestring.split(" ") if requirestring != '' else []
            self.annotation_files[part][subpart] = (ftype, medium, location, requires, annotations, subpart == regions['name'])
# read and store the templates
        laf_templates = settings.laf_templates
        for tpl in laf_templates:
            (fname, nl_before) = laf_templates[tpl]
            file_handle = open('{}/{}'.format(settings.env['template_dir'], fname), encoding = 'utf-8')
            template = file_handle.read()
            file_handle.close()
            self.template[tpl] = ("\n" + template.rstrip("\n")) if  nl_before else template
        
[docs]    def makeheaders(self):
        if not self.et.simple or self.et.plain:
            self.makefeatureheader()
        if not self.settings.args.fdecls_only:
            self.makeresourceheader()
            self.makeprimaryheader()

[docs]    def makefeatureheader(self):
        f_text = collections.defaultdict(lambda: '')
        f_index = 0
        iso_prefix = self.settings.meta['ISOcatprefix']
        truth_values = sorted(self.settings.type_boolean.values())
        db_label = self.settings.annotation_label['db_label']
        meta = self.settings.meta
        plain_info = collections.defaultdict(lambda: {})
        plain_iso_info = {}
        for db_feature in (
            ('monads', 'monads', 'integer', 'the monads that belong to this object'),
            ('minmonad', 'minmonad', 'integer', 'the first monad of this object'),
            ('maxmonad', 'maxmonad', 'integer', 'the last monad of this object'),
            ('oid', 'objectId', 'integer', 'object identifier'),
            ('otype', 'objectType', 'string', 'object type'),
        ):
            (fname, longfname, ftype, descr) = db_feature
            f_index += 1
            (fs_type, fs_type_atts) = fillup(2, '', self.settings.type_mapping[ftype].split('&'))
            if fs_type_atts != '': fs_type_atts = ' ' + fs_type_atts
            fv_text = self.template['feature_basic'].format(valtype = fs_type, atts = fs_type_atts, **meta)
            f_text[db_label] += self.template['feature_local'].format(i = f_index, name = fname, isoname = longfname, isodescr = 'database value:' + descr,
                values = fv_text, **meta
            )
        if self.et.plain:
            fpfile = self.settings.env['feature_plain_info']
            with open(fpfile, 'r', encoding='utf-8') as fh:
                for line in fh:
                    if line.startswith('#'): continue
                    (otype, fname, local_name, ftype, iso_name, iso_id, iso_def, iso_exm, iso_exp) = fillup(9, '', line.rstrip().split("\t"))
                    if iso_def != 'idem.':
                        plain_iso_info[iso_id] = (iso_name, iso_def, iso_exm, iso_exp)
                    plain_info[otype][fname] = (local_name, iso_id)
            for part in sorted(self.et.part_feature):
                for object_type in sorted(self.et.part_feature[part]):
                    if object_type not in plain_info:
                        print("ERROR: object type {} not specified".format(object_type))
                        continue
                    object_kind = self.settings.annotation_label[part + '_label']
                    for feature_name in sorted(self.et.part_feature[part][object_type]):
                        if self.et.is_ref_skip(feature_name): continue
                        if feature_name not in plain_info[object_type]:
                            print("ERROR: feature {} of object type {} not specified".format(feature_name, object_type))
                            continue
                        (local_name, iso_id) = plain_info[object_type][feature_name]
                        (iso_name, iso_def, iso_exm, iso_exp) = plain_iso_info[iso_id]
                        f_index += 1
                        fv_text = self.template['feature_basic'].format(valtype = 'string', atts = fs_type_atts, **meta)
                        f_text[object_kind] += self.template['feature'].format(i = f_index, name = feature_name,
                            isoname = iso_name,
                            isolink = iso_prefix + iso_id,
                            isodescr = 'name in local documentation: {}. {}{}{}'.format(
                                local_name,
                                ('Definition: ' + iso_def.rstrip('.') + '. ') if iso_def else '',
                                ('Example: ' + iso_exm.rstrip('.') + '. ') if iso_exm else '',
                                ('Explanation: ' + iso_exp.rstrip('.') + '. ') if iso_exp else '',
                            ),
                            values = fv_text, **meta
                        )
        else:
            for part in self.et.part_list():
                for object_type in self.et.object_list_part(part):
                    object_kind = self.settings.annotation_label[part + '_label']
                    f_index += 1
                    isocat_key, isocat_name = self.et.object_atts(object_type)
                    fv_text = self.template['feature_basic'].format(valtype = 'string', atts = '', **meta)
                    f_text[object_kind] += self.template['feature'].format(i = f_index, name = object_type,
                        isoname = camel(isocat_name) if isocat_name != '' else object_type + '_object',
                        isolink = iso_prefix + isocat_key, isodescr = isocat_name if isocat_name != '' else 'MISSING ISOcat name',
                        values = fv_text, **meta
                    )
                    for feature_name in self.et.feature_list(object_type):
                        defined_on, etcbc_type, isocat_key, isocat_name = self.et.feature_atts(object_type, feature_name)
                        if defined_on != '': continue
                        if self.et.is_ref_skip(feature_name): continue
                        (fs_type, fs_type_atts) = fillup(2, '', self.settings.type_mapping[etcbc_type].split('&'))
                        if fs_type_atts != '': fs_type_atts = ' ' + fs_type_atts
                        f_index += 1
                        value_list = self.et.value_list(object_type, feature_name)
                        fv_text = ''
                        if len(value_list) > 0:
                            fv1_text = ''
                            for feature_value in value_list:
                                v_etcbc_type, v_isocat_key, v_isocat_name = self.et.value_atts(object_type, feature_name, feature_value)
                                fv1_text += self.template['feature_val1'].format(valtype = 'symbol', name = feature_value,
                                    value = camel(v_isocat_name) if v_isocat_name != '' else feature_value,
                                    isolink = iso_prefix + v_isocat_key, **meta
                                )
                            fv_text = self.template['feature_val'].format(values = fv1_text, **meta)
                        if fs_type != 'symbol':
                            if fs_type == 'binary':
                                fv1_text = ''
                                for feature_value in truth_values:
                                    fv1_text += "\n\t\t\t\t" + '<{} value="{}"/>'.format(fs_type, feature_value)
                                fv_text = self.template['feature_val'].format(values = fv1_text, **meta)
                            else:
                                fv_text = self.template['feature_basic'].format(valtype = fs_type, atts = fs_type_atts, **meta)
                        f_text[object_kind] += self.template['feature'].format(i = f_index, name = feature_name,
                            isoname = camel(isocat_name) if isocat_name != '' else feature_name,
                            isolink = iso_prefix + isocat_key, isodescr = isocat_name if isocat_name != '' else 'MISSING ISOcat name',
                            values = fv_text, **meta
                        )
        for feature_type in f_text:
            absolute_path = "{}/{}.xml".format(self.settings.env['decl_dst_dir'], feature_type)
            file_handle = open(absolute_path, "w", encoding = 'utf-8')
            text = self.template['feature_decl'].format(kind = feature_type, features = f_text[feature_type], **meta)
            file_handle.write(text) 
            file_handle.close()
            self.val.add(absolute_path, self.settings.xml['tei_fs_dst'])

[docs]    def makeresourceheader(self):
        absolute_path = self.settings.env['resource_hdr_txt']
        file_handle = open(absolute_path, "w", encoding = 'utf-8')
# generate the variable parts: the list of filetypes, the specification of file types
        filetypes_decl = ''
        annotation_decls = ''
        meta = self.settings.meta
        for part in sorted(self.annotation_files):
            for subpart in sorted(self.annotation_files[part]):
                ftype, medium, location, requires, annotations, is_region = self.annotation_files[part][subpart]
                dependencies = ''
                for dep in requires:
                    dependencies += self.template['dependency'].format(indent = "\t\t", elementname = self.settings.laf['resource_header'], fileid = dep, **meta)
                requirestxt = "/>"
                if requires:
                    requirestxt = ">" + dependencies + "\n\t\t\t\t</fileType>"
                filetypes_decl += ("\n\t\t\t\t" + '<fileType xml:id="{type}" f.suffix="{suffix}" {aids}medium="{medium}"{requires}').format(
                    type = ftype,
                    suffix = location,
                    medium = medium,
                    aids = 'a.ids="{}" '.format(" ".join(annotations)) if len(annotations) > 0 else '',
                    requires = requirestxt,
                )
        filetypes_list = ''
        for ftype in self.file_order: filetypes_list += "\n\t\t\t\t\t{}".format(ftype)
        danspid = self.settings.meta['danspid_act']
        decl_items = [('reference', self.et.list_ref_noskip())]
        skips = self.settings.annotation_skip_object
        for part in self.et.part_list():
            skip_objects = set()
            if part in skips: skip_objects.add(skips[part])
            decl_items.append((part, sorted(x for x in self.et.object_list_part(part) if x not in skip_objects)))
        plabels = self.settings.annotation_label
        labels = set()
        for plabel in plabels: labels.add(plabels[plabel])
        for label in sorted(labels): decl_items.append((label, [label]))
        for decl_item in decl_items:
            itemtype, items = decl_item
            info = self.settings.annotation_kind[itemtype]
            descr, schematype, schemaloc = info.split('&')
            for item in items:
                schema = danspid
                if schematype == 'fsDecl': schema = '{}/{}'.format(danspid, schemaloc)
                annotation_decls += self.template['annotation_decl'].format(name = item, kind = descr, schema = schema, schematype = schematype, **meta)
        text = self.template['resource_hdr'].format(createdate = today(), filetypeslist = filetypes_list, filetypesdecl = filetypes_decl,
            annotationdecls = annotation_decls, nmonads = self.gstats['f Number of monads'], **meta).format(nmonads = self.gstats['f Number of monads'], **meta)
        file_handle.write(text)
        file_handle.close()
        self.val.add(absolute_path, self.settings.xml['graf_resource_dst'])

[docs]    def makeprimaryheader(self):
        absolute_path = self.settings.env['primary_hdr_txt']
        file_handle = open(absolute_path, "w", encoding = 'utf-8')
        danspid = self.settings.meta['danspid_act']
        annotation_item_dict = {} 
        meta = self.settings.meta
        for part in sorted(self.annotation_files):
            if part == '': continue
            for subpart in sorted(self.annotation_files[part]):
                ftype, medium, location, requires, annotations, is_region = self.annotation_files[part][subpart]
                annotation_item_dict[ftype] = self.template['annotation_item'].format(id = ftype, loc = location, **meta)
        annotation_items = ''
        for ftype in self.file_order:
            if ftype in annotation_item_dict: annotation_items += annotation_item_dict[ftype]
        text = self.template['primary_hdr'].format(createdate = today(), charsize = self.stats['charsize'], annotationitems = annotation_items, **meta)
        file_handle.write(text)
        file_handle.close()
        self.val.add(absolute_path, self.settings.xml['graf_document_dst'])

[docs]    def start_annot(self, part):
        print("INFO: Generating annotation files for {}".format(part))
        self.file_handles = {}
        meta = self.settings.meta
        for subpart in self.annotation_files[part]:
            (ftype, medium, location, requires, annotations, is_region) = self.annotation_files[part][subpart]
            absolute_path = "{}{}".format(self.settings.env['annot_hdr'], location)
            annotation_labels = ''
            for annot in annotations:
                annotation_labels += self.template['annotation_label'].format(annot = annot, **meta)
            dependencies = ''
            for dep in requires:
                dependencies += self.template['dependency'].format(indent = "", elementname = self.settings.laf['annotation_header'], fileid = dep, **meta)
            if dependencies and 'comment_local_deps' in self.settings.laf_switches:
                dependencies = "<!--" + dependencies + "-->"
            annotation_header = ''
            if is_region:
                annotation_header = self.template['region_hdr'].format(dependencies = dependencies, **meta)
            else:
                annotation_header = self.template['annotation_hdr'].format(labels = annotation_labels, dependencies = dependencies, **meta)
            file_handle = open(absolute_path, "w", encoding = 'utf-8')
            file_handle.write(annotation_header)
            file_handle.close()
            file_handle = open(absolute_path, "a", encoding = 'utf-8')
            self.file_handles[subpart] = (file_handle, absolute_path, len(annotation_header), annotations)

[docs]    def start_primary(self):
        ftype, medium, location, requires, annotations, is_region = self.annotation_files['']['primary']
        absolute_path = self.settings.env['primary_text']
        self.primary_handle = open(absolute_path, "w", encoding = 'utf-8')

[docs]    def finish_annot(self, part):
        meta = self.settings.meta
        for subpart in self.file_handles:
            file_handle, absolute_path, length, annotations = self.file_handles[subpart]
            file_handle.write(self.template['annotation_ftr'].format(**meta))
            file_handle.close()
            file_handle = open(absolute_path, "r+")
            header = file_handle.read(length)
            for annot in annotations:
                number_annot = '"{}"'.format(self.stats["{}.{}".format(subpart, annot)]).ljust(14, ' ')
                header = re.sub('"{}" occurs="nnnnnnnnnnnn"'.format(annot), '"{}" occurs={}'.format(annot, number_annot), header)
            file_handle.seek(0)
            file_handle.write(header)
            file_handle.close()
            self.val.add(absolute_path, self.settings.xml['graf_annot_dst'])

[docs]    def finish_primary(self):
        ftype, medium, location, requires, annotations, is_region = self.annotation_files['']['primary']
        absolute_path = self.settings.env['primary_text']
        self.primary_handle.close()
        self.val.add(absolute_path, None)

[docs]    def report(self):
        for stat in sorted(self.gstats): print("INFO: {:<30}: {:>10}".format(stat, self.gstats[stat]))