# Source code for etcbc.featuredoc

import sys
import collections
from copy import deepcopy

class FeatureDoc(object):
    '''Extracts feature information for selected features.

    The information returned consists of value lists, number of occurrences, and
    a summary spreadsheet.
    '''

    def __init__(self, processor, study):
        '''Upon creation, re-initializes the laf processor with requested features plus some needed features.

        Args:
            processor:
                The laf processor. This class calls its ``load_again(spec, verbose=...)``
                method and afterwards reads its ``api`` attribute.
            study:
                A dictionary directing the feature study. Keys read by this class:

                    * ``vlabel``: name of a node feature that is loaded in addition to ``otype``.
                    * ``features``: a dictionary with keys ``node`` and ``edge``,
                      each a whitespace separated string of feature names to be studied.
                    * ``absence_values``: a collection of *absence values*, i.e. values like ``none``
                      or ``unknown`` that somehow count as the absence of a value.
                    * ``VALUE_THRESHOLD``: a parameter that indicates how many distinct values
                      to list in the summary.
        '''
        self.BASELOAD = {
            "xmlids": {
                "node": False,
                "edge": False,
            },
            "features": ('''otype {}'''.format(study['vlabel']), ''),
            "primary": False,
        }

        self.processor = processor
        self.study = study
        # Work on a copy so that BASELOAD keeps describing only the base features.
        this_load = deepcopy(self.BASELOAD)
        base_node, base_edge = this_load['features']
        this_load['features'] = (
            base_node + ' ' + study['features']['node'],
            base_edge + ' ' + study['features']['edge'],
        )
        processor.load_again(this_load, verbose='DETAIL')
        self.API = processor.api

    @staticmethod
    def _clean(name):
        '''Return the feature name with ``:`` and ``.`` replaced by ``_``.'''
        return name.replace(':', '_').replace('.', '_')

    @staticmethod
    def _by_frequency(counts):
        '''Return the (value, count) pairs of a mapping, highest count first, ties ordered by value.'''
        return sorted(counts.items(), key=lambda item: (-item[1], item[0]))

    @staticmethod
    def _row(*cells):
        '''Return the cells as one tab separated, newline terminated line.'''
        return '\t'.join(str(cell) for cell in cells) + '\n'

    @staticmethod
    def _write(outfile, name, chunks):
        '''Open the result file ``name`` through ``outfile``, write all chunks, and always close it.'''
        handle = outfile(name)
        try:
            for chunk in chunks:
                handle.write(chunk)
        finally:
            handle.close()

    def feature_doc(self):
        '''Create the feature information.

        Based on the study information given at the creation of the FeatureDoc object, a set of files is created
        through the ``outfile`` function of the API:

        * ``0_summary_node.csv`` and ``0_summary_edge.csv``: tab separated overviews
          of statistical feature/value information.
        * ``<feature> values.txt`` and ``<feature>.rst`` for each node feature, and
          ``edge <feature> values.txt`` for each edge feature: the values and their number of occurrences.
        * ``1_types_node.txt`` and ``1_types_edge.txt``: per feature, the node types
          (or pairs of node types for edges) that carry it.
        * ``index.rst``: an index linking to the node feature pages.

        Returns nothing; all results are written to files.
        '''
        api = self.API
        msg = api['msg']
        outfile = api['outfile']
        F = api['F']
        FE = api['FE']
        NN = api['NN']
        EE = api['EE']

        msg("Looking up feature values ... ")
        requested = self.study['features']
        node_feats = [self._clean(ft) for ft in requested['node'].split()]
        edge_feats = [self._clean(ft) for ft in requested['edge'].split()]
        absence_values = self.study['absence_values']
        threshold = self.study['VALUE_THRESHOLD']

        # Value frequencies per feature: defined and absent values of node features, all values of edge features.
        vals_def = collections.defaultdict(collections.Counter)
        vals_undef = collections.defaultdict(collections.Counter)
        edge_vals = collections.defaultdict(collections.Counter)
        # Per node type and feature (and the inverse index): [occurrences of absence values, occurrences of defined values].
        n_otypes = collections.defaultdict(lambda: collections.defaultdict(lambda: [0, 0]))
        n_otypesi = collections.defaultdict(lambda: collections.defaultdict(lambda: [0, 0]))
        # Per (from type, to type) pair and feature (and the inverse index): number of occurrences.
        e_otypes = collections.defaultdict(collections.Counter)
        e_otypesi = collections.defaultdict(collections.Counter)

        chunk_size = 100000

        done = 0
        for done, node in enumerate(NN(), 1):
            if done % chunk_size == 0:
                msg("{:>7} nodes done".format(done))
            for ft in node_feats:
                val = F.item[ft].v(node)
                if val is None:
                    continue
                otype = F.db_otype.v(node)
                absent = val in absence_values
                slot = 0 if absent else 1
                n_otypes[otype][ft][slot] += 1
                n_otypesi[ft][otype][slot] += 1
                (vals_undef if absent else vals_def)[ft][val] += 1
        msg("{:>7} nodes done".format(done))

        done = 0
        for done, edge in enumerate(EE(), 1):
            if done % chunk_size == 0:
                msg("{:>7} edges done".format(done))
            for ft in edge_feats:
                # The feature value is looked up by edge[0]; edge[1] and edge[2] are the nodes
                # whose types are recorded as the from and to type.
                val = FE.item[ft].v(edge[0])
                if val is None:
                    continue
                type_pair = (F.db_otype.v(edge[1]), F.db_otype.v(edge[2]))
                e_otypes[type_pair][ft] += 1
                e_otypesi[ft][type_pair] += 1
                edge_vals[ft][val] += 1
        msg("{:>7} edges done".format(done))

        node_otypes = sorted(n_otypes.keys())
        edge_otypes = sorted(e_otypes.keys())

        msg("Computing results ...")

        for ft in node_feats:
            lines = [
                "{} DIFFERENT DEFINED VALUES IN TOTAL\n".format(len(vals_def[ft])),
                "UNDEFINED VALUES\n",
            ]
            lines.extend("{} x {}\n".format(val, n) for (val, n) in self._by_frequency(vals_undef[ft]))
            lines.append("\nDEFINED VALUES\n")
            lines.extend("{} x {}\n".format(val, n) for (val, n) in self._by_frequency(vals_def[ft]))
            self._write(outfile, "{} values.txt".format(ft), lines)
            self._write(outfile, "{}.rst".format(ft), [
                "\n{ft}\n{ln}\n.. literalinclude:: ../values/{ft} values.txt\n".format(
                    ft=ft, ln=('=' * len(ft)),
                ),
            ])

        for ft in edge_feats:
            lines = ["\nVALUES\n"]
            lines.extend("{} x {}\n".format(val, n) for (val, n) in self._by_frequency(edge_vals[ft]))
            self._write(outfile, "edge {} values.txt".format(ft), lines)

        self._write(outfile, "1_types_node.txt", [
            "{}\t{}\t{}\t{}\n".format(ft, otype, *n_otypesi[ft][otype])
            for ft in sorted(n_otypesi)
            for otype in sorted(n_otypesi[ft])
        ])

        self._write(outfile, "1_types_edge.txt", [
            "{}\t{}->{}\t{}\n".format(ft, type_pair[0], type_pair[1], e_otypesi[ft][type_pair])
            for ft in sorted(e_otypesi)
            for type_pair in sorted(e_otypesi[ft])
        ])

        index_chunks = ["\nFeature Index\n#############\n"]
        index_chunks.extend("\n:doc:`{ft} <{ft}>`\n".format(ft=ft) for ft in sorted(node_feats))
        self._write(outfile, "index.rst", index_chunks)

        # Node summary: one header row, then per feature a totals row followed by its value rows.
        # The trailing cell of a value row is a run of tabs that leaves the per-type columns empty.
        node_pad = '\t' * (2 * len(node_otypes) - 1)
        rows = [self._row(
            'Feature',
            'val (-)',
            'val (+)',
            '#vals (-)',
            '#vals (+)',
            'occs (-)',
            'occs (+)',
            '\t'.join(["{} (-)\t{} (+)".format(otype, otype) for otype in node_otypes]),
        )]
        for ft in node_feats:
            rows.append(self._row(
                ft,
                '',
                '',
                len(vals_undef[ft]),
                len(vals_def[ft]),
                sum(vals_undef[ft].values()),
                sum(vals_def[ft].values()),
                '\t'.join(["{}\t{}".format(*n_otypes[otype][ft]) for otype in node_otypes]),
            ))
            for (val, n) in self._by_frequency(vals_undef[ft]):
                rows.append(self._row('', val, '', '', '', n, '', node_pad))
            for rank, (val, n) in enumerate(self._by_frequency(vals_def[ft]), 1):
                if rank > threshold:
                    # Only the most frequent values are listed; report how many were left out.
                    more = "{} MORE".format(len(vals_def[ft]) - threshold)
                    rows.append(self._row('', '', more, '', '', '', '', node_pad))
                    break
                rows.append(self._row('', '', val, '', '', '', n, node_pad))
        self._write(outfile, "0_summary_node.csv", rows)

        # Edge summary: same layout idea with the columns Feature, val, #vals, occs, then one column per type pair.
        edge_pad = '\t' * (len(edge_otypes) - 1)
        rows = [self._row(
            'Feature',
            'val',
            '#vals',
            'occs',
            '\t'.join(["{}->{}".format(*type_pair) for type_pair in edge_otypes]),
        )]
        for ft in edge_feats:
            rows.append(self._row(
                ft,
                '',
                len(edge_vals[ft]),
                sum(edge_vals[ft].values()),
                '\t'.join(["{}".format(e_otypes[type_pair][ft]) for type_pair in edge_otypes]),
            ))
            for rank, (val, n) in enumerate(self._by_frequency(edge_vals[ft]), 1):
                if rank > threshold:
                    more = "{} MORE".format(len(edge_vals[ft]) - threshold)
                    # The '#vals' and 'occs' cells stay empty so the row has as many columns as the header.
                    rows.append(self._row('', more, '', '', edge_pad))
                    break
                # The occurrence count belongs in the 'occs' column; '#vals' stays empty on value rows.
                rows.append(self._row('', val, '', n, edge_pad))
        self._write(outfile, "0_summary_edge.csv", rows)

        msg("Done")