Source code for etcbc.featuredoc

import sys
import collections
from copy import deepcopy

class FeatureDoc(object):
    '''Extracts feature information for selected features.

    The information returned consists of value lists, number of occurrences,
    and a summary spreadsheet.
    '''

    # Number of nodes (or edges) processed between two progress messages.
    CHUNK_SIZE = 100000

    def __init__(self, processor, study):
        '''Upon creation, re-initializes the laf processor with requested
        features plus some needed features.

        Args:
            processor: the laf processor. It must offer
                ``load_again(load_spec, verbose=...)`` and, after loading,
                an ``api`` dictionary.
            study: A dictionary directing the feature study. Keys read here
                and in :meth:`feature_doc`:

                * ``vlabel``: feature name that is loaded together with
                  ``otype`` for every study.
                * ``features``: a dictionary with keys ``node`` and ``edge``,
                  each a whitespace separated string of feature names to be
                  studied.
                * ``absence_values``: a collection of *absence values*, i.e.
                  values like ``none`` or ``unknown`` that somehow count as
                  the absence of a value.
                * ``VALUE_THRESHOLD``: a parameter that indicates how many
                  distinct values to list in the summary.
        '''
        self.BASELOAD = {
            "xmlids": {
                "node": False,
                "edge": False,
            },
            "features": ('''otype {}'''.format(study['vlabel']), ''),
            "primary": False,
        }
        self.processor = processor
        self.study = study

        # Work on a copy so that BASELOAD keeps the bare minimum load spec.
        this_load = deepcopy(self.BASELOAD)
        base_node, base_edge = this_load['features']
        this_load['features'] = (
            base_node + ' ' + study['features']['node'],
            base_edge + ' ' + study['features']['edge'],
        )
        processor.load_again(this_load, verbose='DETAIL')
        self.API = processor.api

    def feature_doc(self):
        '''Create the feature information.

        Based on the study information given at the creation of the
        FeatureDoc object, a set of files is created.

        * A tab separated overview of statistical feature/value information
          (``0_summary_node.csv`` and ``0_summary_edge.csv``).
        * For each feature, a file with its values and number of occurrences
          (plus, for node features, an ``.rst`` page including that file and
          an ``index.rst`` linking to those pages).
        * Files of node types and edge types and the features they carry
          (``1_types_node.txt`` and ``1_types_edge.txt``).

        All files are obtained through the ``outfile`` function of the API
        and are closed again, also when writing fails halfway.
        '''
        msg = self.API['msg']
        outfile = self.API['outfile']

        msg("Looking up feature values ... ")
        node_feats = self._feature_names('node')
        edge_feats = self._feature_names('edge')
        threshold = self.study['VALUE_THRESHOLD']

        vals_def, vals_undef, n_otypes, n_otypesi = \
            self._count_node_values(node_feats)
        vals, e_otypes, e_otypesi = self._count_edge_values(edge_feats)
        node_otypes = sorted(n_otypes.keys())
        edge_otypes = sorted(e_otypes.keys())

        msg("Computing results ...")
        for ft in node_feats:
            self._emit(
                outfile, "{} values.txt".format(ft),
                self._node_value_lines(vals_def[ft], vals_undef[ft]),
            )
            self._emit(outfile, "{}.rst".format(ft), [
                "\n{ft}\n{ln}\n\n"
                ".. literalinclude:: ../values/{ft} values.txt\n".format(
                    ft=ft, ln=('=' * len(ft)),
                ),
            ])
        for ft in edge_feats:
            self._emit(
                outfile, "edge {} values.txt".format(ft),
                self._edge_value_lines(vals[ft]),
            )

        self._emit(outfile, "1_types_node.txt", (
            "{}\t{}\t{}\t{}\n".format(ft, otype, *n_otypesi[ft][otype])
            for ft in sorted(n_otypesi)
            for otype in sorted(n_otypesi[ft])
        ))
        self._emit(outfile, "1_types_edge.txt", (
            "{}\t{}->{}\t{}\n".format(
                ft, otype[0], otype[1], e_otypesi[ft][otype],
            )
            for ft in sorted(e_otypesi)
            for otype in sorted(e_otypesi[ft])
        ))
        self._emit(outfile, "index.rst", self._index_lines(node_feats))
        self._emit(outfile, "0_summary_node.csv", self._node_summary_lines(
            node_feats, node_otypes, vals_def, vals_undef, n_otypes,
            threshold,
        ))
        self._emit(outfile, "0_summary_edge.csv", self._edge_summary_lines(
            edge_feats, edge_otypes, vals, e_otypes, threshold,
        ))
        msg("Done")

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _nested_counter(leaf):
        '''Return a two-level defaultdict whose leaves are made by *leaf*.'''
        return collections.defaultdict(
            lambda: collections.defaultdict(leaf)
        )

    @staticmethod
    def _ranked(counts):
        '''Return the (value, count) pairs, most frequent first, ties by value.'''
        return sorted(counts.items(), key=lambda item: (-item[1], item[0]))

    @staticmethod
    def _emit(outfile, name, chunks):
        '''Write all *chunks* to the result file *name* and always close it.'''
        handle = outfile(name)
        try:
            for chunk in chunks:
                handle.write(chunk)
        finally:
            handle.close()

    def _feature_names(self, kind):
        '''Return the studied feature names of *kind* (``node`` or ``edge``),
        with ``:`` and ``.`` replaced by ``_``, which is how they are looked
        up in the API.'''
        return [
            ft.replace(':', '_').replace('.', '_')
            for ft in self.study['features'][kind].split()
        ]

    def _count_node_values(self, node_feats):
        '''Walk all nodes and count the values of the node features.

        Returns:
            A tuple ``(vals_def, vals_undef, n_otypes, n_otypesi)``:
            value counts per feature for proper values and for absence
            values, and ``[absent, present]`` counts keyed by
            ``[otype][feature]`` and by ``[feature][otype]`` respectively.
        '''
        msg = self.API['msg']
        F = self.API['F']
        NN = self.API['NN']
        absence_values = self.study['absence_values']

        vals_def = self._nested_counter(int)
        vals_undef = self._nested_counter(int)
        n_otypes = self._nested_counter(lambda: [0, 0])
        n_otypesi = self._nested_counter(lambda: [0, 0])

        seen = 0
        for seen, node in enumerate(NN(), 1):
            if seen % self.CHUNK_SIZE == 0:
                msg("{:>7} nodes done".format(seen))
            for ft in node_feats:
                val = F.item[ft].v(node)
                if val is None:
                    continue
                otype = F.db_otype.v(node)
                if val in absence_values:
                    slot, counts = 0, vals_undef
                else:
                    slot, counts = 1, vals_def
                n_otypes[otype][ft][slot] += 1
                n_otypesi[ft][otype][slot] += 1
                counts[ft][val] += 1
        msg("{:>7} nodes done".format(seen))
        return vals_def, vals_undef, n_otypes, n_otypesi

    def _count_edge_values(self, edge_feats):
        '''Walk all edges and count the values of the edge features.

        Each edge is indexed as ``edge[0]`` (passed to the edge feature
        lookup), ``edge[1]`` and ``edge[2]`` (the nodes whose object types
        form the edge type).

        Returns:
            A tuple ``(vals, e_otypes, e_otypesi)``: value counts per
            feature, and occurrence counts keyed by
            ``[(otype_from, otype_to)][feature]`` and by
            ``[feature][(otype_from, otype_to)]`` respectively.
        '''
        msg = self.API['msg']
        F = self.API['F']
        FE = self.API['FE']
        EE = self.API['EE']

        vals = self._nested_counter(int)
        e_otypes = self._nested_counter(int)
        e_otypesi = self._nested_counter(int)

        seen = 0
        for seen, edge in enumerate(EE(), 1):
            if seen % self.CHUNK_SIZE == 0:
                msg("{:>7} edges done".format(seen))
            for ft in edge_feats:
                val = FE.item[ft].v(edge[0])
                if val is None:
                    continue
                etype = (F.db_otype.v(edge[1]), F.db_otype.v(edge[2]))
                e_otypes[etype][ft] += 1
                e_otypesi[ft][etype] += 1
                vals[ft][val] += 1
        msg("{:>7} edges done".format(seen))
        return vals, e_otypes, e_otypesi

    def _node_value_lines(self, defined, undefined):
        '''Yield the lines of the value list file of one node feature.'''
        yield "{} DIFFERENT DEFINED VALUES IN TOTAL\n".format(len(defined))
        yield "UNDEFINED VALUES\n"
        for pair in self._ranked(undefined):
            yield "{} x {}\n".format(*pair)
        yield "\nDEFINED VALUES\n"
        for pair in self._ranked(defined):
            yield "{} x {}\n".format(*pair)

    def _edge_value_lines(self, counts):
        '''Yield the lines of the value list file of one edge feature.'''
        yield "\nVALUES\n"
        for pair in self._ranked(counts):
            yield "{} x {}\n".format(*pair)

    @staticmethod
    def _index_lines(node_feats):
        '''Yield the reStructuredText index linking to every node feature page.'''
        yield "\nFeature Index\n#############\n\n"
        for ft in sorted(node_feats):
            yield "\n:doc:`{ft} <{ft}>`\n".format(ft=ft)

    def _node_summary_lines(self, node_feats, node_otypes, vals_def,
                            vals_undef, n_otypes, threshold):
        '''Yield the rows of the tab separated node feature summary.

        Per feature: one totals row, one row per absence value, and at most
        *threshold* rows of proper values followed by an ``N MORE`` row.
        '''
        row = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
        # Keeps value rows as wide as the per-object-type columns.
        pad = '\t' * (2 * len(node_otypes) - 1)

        yield row.format(
            'Feature', 'val (-)', 'val (+)', '#vals (-)', '#vals (+)',
            'occs (-)', 'occs (+)',
            '\t'.join(
                "{} (-)\t{} (+)".format(otype, otype)
                for otype in node_otypes
            ),
        )
        for ft in node_feats:
            defined = vals_def[ft]
            undefined = vals_undef[ft]
            yield row.format(
                ft, '', '', len(undefined), len(defined),
                sum(undefined.values()), sum(defined.values()),
                '\t'.join(
                    "{}\t{}".format(*n_otypes[otype][ft])
                    for otype in node_otypes
                ),
            )
            for (val, n) in self._ranked(undefined):
                yield row.format('', val, '', '', '', n, '', pad)
            for rank, (val, n) in enumerate(self._ranked(defined), 1):
                if rank > threshold:
                    yield row.format(
                        '', '',
                        "{} MORE".format(len(defined) - threshold),
                        '', '', '', '', pad,
                    )
                    break
                yield row.format('', '', val, '', '', '', n, pad)

    def _edge_summary_lines(self, edge_feats, edge_otypes, vals, e_otypes,
                            threshold):
        '''Yield the rows of the tab separated edge feature summary.

        Every row has the same five fields as the header, so that the
        occurrence count of a value ends up in the ``occs`` column.
        '''
        row = "{}\t{}\t{}\t{}\t{}\n"
        # Keeps value rows as wide as the per-edge-type columns.
        pad = '\t' * (len(edge_otypes) - 1)

        yield row.format(
            'Feature', 'val', '#vals', 'occs',
            '\t'.join("{}->{}".format(*otype) for otype in edge_otypes),
        )
        for ft in edge_feats:
            counts = vals[ft]
            yield row.format(
                ft, '', len(counts), sum(counts.values()),
                '\t'.join(
                    "{}".format(e_otypes[otype][ft]) for otype in edge_otypes
                ),
            )
            for rank, (val, n) in enumerate(self._ranked(counts), 1):
                if rank > threshold:
                    yield row.format(
                        '', "{} MORE".format(len(counts) - threshold),
                        '', '', pad,
                    )
                    break
                yield row.format('', val, '', n, pad)