# Source code for emdros2laf.etcbc

import os
import sys
import subprocess
import collections

from .mylib import *

[docs]class Etcbc:
    ''' Knows the ETCBC data format.

    All ETCBC knowledge is stored in a file that describes objects, features and values.
    These are many items, and we divide them in parts and subparts.
    We have parts for monads, sections and linguistic objects.
    When we generate LAF files, they may become unwieldy in size.
    That is why we also divide parts in subparts.
    Parts correspond to sets of objects and their features.
    Subparts correspond to subsets of objects and/or subsets of features.
    N.B. It is "either or": 
    either 

    * a part consists of only one object type, and the subparts
      divide the features of that object type

    or

    * a part consists of multiple object types, and the subparts
      divide the object types of that part. If an object type belongs to
      a subpart, all its features belong to that subpart too.

    In our case, the part 'monad' has the single object type word, and its features
    are divided over subparts.
    The part 'lingo' has object types sentence, sentence_atom, clause, clause_atom,
    phrase, phrase_atom, subphrase, word. Its subparts are a partition of these object
    types in several subsets.
    The part 'section' does not have subparts.
    Note that an object type may occur in multiple parts: consider 'word'.
    However, 'word' in part 'monad' has all non-relational word features, but 'word' in part 'lingo'
    has only relational features, i.e. features that relate words to other objects.

    The Etcbc object stores the complete information found in the Etcbc config file
    in a bunch of data structures, and defines accessor functions for it.

    The feature information is stored in the following dictionaries:

    (Ia) part_info[part][subpart][object_type] = set of feature_names
        NB: object_types may occur in multiple parts.

    (Ib) part_object[part] = set of object_types
    
    (Ic) part_feature[part][object_type] = set of feature_names
    
    (Id) object_subpart[part][object_type] = subpart
    
        Stores the subpart in which each object type occurs, per part

    (II) object_info[object_type] = [attributes]
    
        Stores the information on objects, except their features and values.

    (III) feature_info[object_type][feature_name] = [attributes]
    
        Stores the information on features, except their values.

    (IV) value_info[object_type][feature_name][feature_value] = [attributes]
    
        Stores the feature value information

    (V) reference_feature[feature_name] = True | False
    
        Stores the names of features that reference other objects.
        The feature 'self' is an example, but we skip this feature:
        'self' will get the value False; other features, such as mother and parents, get True.

    (VI) annotation_files[part][subpart] = (ftype, medium, location, requires, annotations, is_region)
    
        Stores information of the files that are generated as the resulting LAF resource
    
    The files are organized by part and subpart.
    Header files and primary data files are in part ''.
    Other files may or may not contain annotations. If not, they only contain regions. Then is_region is True.

      ftype
        the file identifier to be used in header files
      medium
        text or xml
      location
        the last part of the file name.
        All file names can be obtained by appending location after the absolute path followed by a common prefix.
      requires
        the identifier of a file that is required by the current file
      annotations
        the annotation labels to be declared for this file
    
    The feature information file contains lines with tab-delimited fields (only the starred ones are used):
      0*           1*            2*          3*          4*             5*          6          7*           8            9           10    11*   12*
      object_type, feature_name, defined_on, etcbc_type, feature_value, isocat_key, isocat_id, isocat_name, isocat_type, isocat_def, note, part, subpart
      0            1             2           3           4              5                      6                                           7     8
    
    '''
    # Class-level defaults. The containers below are mutable objects bound on the
    # class itself, so they are shared by every instance that does not rebind them.
    settings = None  # replaced in __init__ by the settings object passed in

    object_info = {}  # object_type -> attributes
    feature_info = {}  # object_type -> feature_name -> attributes
    value_info = {}  # object_type -> feature_name -> feature_value -> attributes
    part_info = {}  # part -> subpart -> object_type -> set of feature names
    # part -> object_type -> subpart; missing parts are created on first access
    object_subpart = collections.defaultdict(lambda: {})
    # part -> set of object types; missing parts are created on first access
    part_object = collections.defaultdict(lambda: set())
    # part -> object_type -> set of feature names; both levels are created on first access
    part_feature = collections.defaultdict(lambda: collections.defaultdict(lambda: set()))
    reference_feature = {}  # feature_name -> bool (False when the feature is in annotation_skip)

    def __init__(self, settings):
        ''' Initialization is: reading the excel sheet with feature information.

        The sheet should be in the form of a tab-delimited text file.

        There are columns with:
            ETCBC information:
                object_type, feature_name, also_defined_on, type, value.
            ISOcat information
                key, id, name, type, definition, note
            LAF sectioning
                part, subpart

        See the list of columns above.
                
        So the file gives essential information to map objects/features/values to ISOcat data categories.
        It indicates how the LAF output can be chunked in parts and subparts.
        '''
        self.settings = settings
        self.simple = False
        self.plain = False
        file_handle = None
        fpfile = settings.env['feature_plain_info']
        ffile = settings.env['feature_info']
        ofile = settings.env['object_info']
        if os.path.exists(ffile):
            file_handle = open(ffile, encoding = 'utf-8')
        else:
            file_handle = open(ofile, encoding = 'utf-8')
            self.simple = True
            if os.path.exists(fpfile):
                self.plain = True

        if self.simple:
            for line in file_handle:
                all_fields = fillup(3, '', line.rstrip().split())
                (object_type, part, subpart) = all_fields
                self.object_info[object_type] = ('', '')
                self.object_subpart[part][object_type] = subpart
                self.part_object[part].add(object_type)
                this_info = self.part_info
                if part not in this_info: this_info[part] = {}
                this_info = this_info[part]
                if subpart not in this_info: this_info[subpart] = {}
                this_info = this_info[subpart]
                if object_type not in this_info: this_info[object_type] = set()
                this_info = self.value_info
                if object_type not in this_info: this_info[object_type] = {}
                this_info = self.feature_info
                if object_type not in this_info: this_info[object_type] = {}
                for lbytes in self.mql('select features from object type [{}]\ngo'.format(object_type)):
                    l = str(lbytes, encoding='utf8')
                    if l.startswith(('-', '+')) or ':' in l: continue
                    (fname, ftype, fdef, fcomp) = [f.strip() for f in l[1:-2].split('|')]
                    ftype = 'reference' if ftype == 'id_d' else 'string' if ftype.endswith('_e') else ftype 
                    self.part_info[part][subpart][object_type].add(fname)
                    self.part_feature[part][object_type].add(fname)
                    self.value_info[object_type][fname] = {}
                    self.feature_info[object_type][fname] = ('', object_type, '', '')
                    if ftype == 'reference':
                        self.reference_feature[fname] = fname not in settings.annotation_skip 
        else:
# the following fields are hierarchical : part, subpart, object_type, feature_name, etcbc_type
# they may inherit from one line to the next, and when one field changes, others have to be reset
# For each input line, we collect them in the list this_fields, and we maintain current values in cur_fields

            line_number = 0
            (cur_part, cur_subpart, cur_object_type, cur_feature_name, cur_etcbc_type) = ('', '', '', '', '')

            for line in file_handle:
                line_number += 1
# The first two lines in the feature info file are header lines. We skip them
                if line_number <= 2: continue

                all_fields = fillup(13, '', line.rstrip().split("\t"))
                used_fields = all_fields[0:6] + all_fields[7:8] + all_fields[11:13]
                (object_type, feature_name, defined_on, etcbc_type, feature_value, isocat_key, isocat_name, part, subpart) = used_fields
                o_atts = (isocat_key, isocat_name)
                f_atts = (defined_on, etcbc_type, isocat_key, isocat_name)
                v_atts = (etcbc_type, isocat_key, isocat_name)
                this_fields = (part, subpart, object_type, feature_name, etcbc_type)
# Reset parts of cur_fields when a hierarchically higher part changes
                if object_type != '':
                    cur_feature_name = ''; 
                    cur_etcbc_type = ''; 
                if feature_name != '': cur_etcbc_type = ''; 
                if part != '': cur_subpart = ''; 
                cur_fields = (cur_part, cur_subpart, cur_object_type, cur_feature_name, cur_etcbc_type)
# For fields that are empty on the current line, use the value saved in cur_fields
                (cur_part, cur_subpart, cur_object_type, cur_feature_name, cur_etcbc_type) = map(lambda c,t: t if t != '' else c, cur_fields, this_fields) 
# Identify the reference features
                if cur_etcbc_type == 'reference':
                    self.reference_feature[cur_feature_name] = cur_feature_name not in settings.annotation_skip 
# Add features to the (sub)part structure
                self.part_object[cur_part].add(cur_object_type)
                if cur_feature_name != '':
                    if cur_object_type not in self.part_feature[cur_part]:
                        self.part_feature[cur_part][cur_object_type] = set()
                    self.part_feature[cur_part][cur_object_type].add(cur_feature_name)

                this_info = self.part_info
                if cur_part not in this_info: this_info[cur_part] = {}
                this_info = this_info[cur_part]
                if cur_subpart not in this_info: this_info[cur_subpart] = {}
                this_info = this_info[cur_subpart]
                if cur_object_type not in this_info: this_info[cur_object_type] = set()
                if cur_feature_name != '':
                    this_info = this_info[cur_object_type]
                    if cur_feature_name not in this_info: this_info.add(cur_feature_name)

                self.object_subpart[cur_part][cur_object_type] = cur_subpart
# Add object info
                this_info = self.object_info
                if cur_object_type not in this_info: this_info[cur_object_type] = o_atts
# Add feature info
                this_info = self.feature_info
                if cur_object_type not in this_info: this_info[cur_object_type] = {}
                if cur_feature_name != '':
                    this_info = this_info[cur_object_type]
                    if cur_feature_name not in this_info: this_info[cur_feature_name] = f_atts
# Add value info
                this_info = self.value_info
                if cur_object_type not in this_info: this_info[cur_object_type] = {}
                if cur_feature_name != '':
                    this_info = this_info[cur_object_type]
                    if cur_feature_name not in this_info: this_info[cur_feature_name] = {}

                    if feature_value != '':
                        this_info = this_info[cur_feature_name]
                        if feature_value not in this_info: this_info[feature_value] = v_atts
        file_handle.close()

# create directories and queries if we have to query the EMDROS database for data
        if settings.flag('raw'):
            run('mkdir -p ' + settings.env['raw_emdros_dir'])
            run('mkdir -p ' + settings.env['query_dst_dir'])

[docs]    def check_raw_files(self, part):
        if not self.settings.flag('raw'): return
        print("INFO: BEGIN Generate raw MQL output from EMDROS")
        self.run_mql(self.make_query_file(part), self.raw_file(part))
        print("INFO: END Generate raw MQL output from EMDROS")

[docs]    def make_query_file(self, part):
        template = 'GET OBJECTS HAVING MONADS IN ALL\n[{object} {features}]\nGO\n'
        query_text = ''
        for object_type in self.object_list_part(part):
            features = ",\n\t\t".join(self._feature_list_part(part, object_type))
            copy = template.format(
                object = object_type,
                features = ("GET " if features else '') + features,
            )
            query_text += copy
        return self.make_mql("{}.mql".format(part), query_text)

[docs]    def run_mql(self, query_file, result_file):
        run('mql -b s3 -d {source} --console {query} > {result}'.format(
                source = self.settings.env['source_data'],
                query = query_file,
                result = result_file,
            ), dyld=True)

[docs]    def make_mql(self, name, query):
        query_file = '{}/{}'.format(self.settings.env['query_dst_dir'], name)
        file_handle = open(query_file, "w", encoding = 'utf-8')
        file_handle.write(query)
        file_handle.close()
        return query_file

[docs]    def mql(self, query):
        mql_opts = ['--console', '-b', 's3', '-d']
        proc = subprocess.Popen(
            ['mql'] + mql_opts + [self.settings.env['source_data']],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
        )
        proc.stdin.write(bytes(query, encoding='utf8'))
        proc.stdin.close()
        result = proc.stdout.readlines()
        proc.stdout.close()
        return result

[docs]    def part_list(self): return sorted(self.part_info.keys())
[docs]    def subpart_list(self, part): return sorted(self.part_info[part].keys())
[docs]    def object_list_part(self, part): return sorted(self.part_object[part])
[docs]    def the_subpart(self, part, object_type): return self.object_subpart[part][object_type]
[docs]    def object_list(self, part, subpart): return sorted(self.part_info[part][subpart].keys())
[docs]    def feature_list(self, object_type): return sorted(self.feature_info[object_type].keys())
    def _feature_list_part(self, part, object_type):
        return sorted(x for x in self.part_feature[part][object_type] if x not in self.settings.annotation_skip)
[docs]    def feature_list_subpart(self, part, subpart, object_type):
        return sorted(x for x in self.part_info[part][subpart][object_type] if x not in self.settings.annotation_skip)
[docs]    def value_list(self, object_type, feature_name): return sorted(self.value_info[object_type][feature_name].keys())
[docs]    def object_atts(self, object_type): return self.object_info[object_type]
[docs]    def feature_atts(self, object_type, feature_name): return self.feature_info[object_type][feature_name]
[docs]    def value_atts(self, object_type, feature_name, feature_value): return self.value_info[object_type][feature_name][feature_value]
[docs]    def list_ref_noskip(self): return sorted(x for x in self.reference_feature if self.reference_feature[x])
[docs]    def is_ref_skip(self, feature_name): return feature_name in self.reference_feature and not self.reference_feature[feature_name]
[docs]    def raw_file(self, part): return self.settings.parts[part]['raw_text']