# Source code for emdros2laf.transform
import sys
import re
import collections
# This module has a big loop (in process_lines) which iterates over more than a million lines.
# Quite a lot has to happen for each line, so it is important to keep that
# loop as efficient as possible. That's why we declare all regular expressions
# globally and compile them.

# Header of an object line: "[ <object type> <object id> { <monads> }".
# The three groups are read as object type, object id and monad specification;
# the monad specification may contain digits, spaces, commas and dashes.
infoscan = re.compile(r'\[ (\w+) (\d+) \{ ([\d ,-]+) \}')
# A single feature assignment of the form name="value" (value has no double quote).
featurescan = re.compile(r'(\w+)="([^"]*)"')
# A run of one or more backslash-x escapes, each followed by two characters;
# process_lines hands every such run to makeuni for conversion.
uniscan = re.compile(r'(?:\\x..)+')
# Trailing "&", "-" or "00", optionally followed by _N/_S/_P markers, at the end of a string.
# NOTE(review): not referenced in the visible part of this module - confirm it is still needed.
wordscan = re.compile(r'([&-]|00)((?:_[NSP])*)$')
class Transform:
    ''' Transforms ETCBC data into a LAF resource.

    ETCBC knowledge comes from the Etcbc class (``et``),
    LAF knowledge comes from the Laf class (``lf``).
    The raw MQL export is read and the annotation files are built from it.

    For part monad there are extra things:

    * the primary data file will be built
    * one of the annotation files only contains regions, and no annotations
    '''

    et = None
    lf = None
    settings = None

    def __init__(self, settings, et, lf):
        ''' Store the collaborating objects.

        settings: configuration object (its args, parts, env, meta, annotation_regions
            and annotation_label attributes are read by process_lines)
        et: the Etcbc object, source of raw file names, object lists and feature lists
        lf: the Laf object, owner of the output file handles, templates and statistics
        '''
        self.settings = settings
        self.et = et
        self.lf = lf

    def transform(self, part):
        ''' Run the complete transformation for one part.

        Opens the annotation files for the part (and the primary data file for
        part 'monad'), checks the raw files, processes all raw lines and
        finishes the files again.
        '''
        self.lf.start_annot(part)
        if part == 'monad': self.lf.start_primary()
        self.et.check_raw_files(part)
        self.process_lines(part)
        self.lf.finish_annot(part)
        if part == 'monad': self.lf.finish_primary()

    def process_lines(self, part):
        ''' Data transformation for part.

        Input: the lines of a raw emdros output file, which is processed line by line.
        Every line contains an object type, object identifier, monad indicator and list of features.
        This has to be translated to primary data and annotations.

        Efficiency is very important. It will not do to call functions or follow long chains of dereferencing.
        Yet a lot has to happen.
        That is why this is a lengthy loop, and we maintain quite a lot of information from elsewhere in the program
        in loop-global variables. Not doing so might increase the running time 10-fold.

        Results are written to the file handles owned by the Laf object, and the
        counters are added to its stats and gstats dictionaries. Nothing is returned.
        '''
        monad_limit = self.settings.args.limit
        part_settings = self.settings.parts[part]

        region_handle = None
        plain_handle = None
        primary_handle = None
        do_primary = 'do_primary' in part_settings

        # The objects in some parts will get a separate file in LAF for the nodes.
        # This is triggered by the config setting separate_node_file.
        # The other LAF files will be generated with both nodes and annotations in a single file.
        separate_node_file = 'separate_node_file' in part_settings

        # We need an index that maps monad numbers to char positions in the primary data.
        # This index is generated when doing part 'monad', and used when doing part 'section'.
        # In part section we create new regions, corresponding to the books, chapters, verses and half-verses.
        # Referring to the regions that were created for words and white space and punctuation would be clumsy in LAF.
        # Creation and use of the index is triggered by settings in the configuration file: make_index and use_index.
        # The index data itself, once read in, is stored in the dictionary monad_chars.
        index_path = self.settings.env['monad_index']
        make_index = 'make_index' in part_settings
        use_index = 'use_index' in part_settings
        no_monad_nodes = 'no_monad_nodes' in part_settings
        monad_chars = {}

        # The LAF object knows which annotation files we have to write. We get the file handles and store them in
        # a local dictionary.
        sub_handles = self.lf.file_handles

        # For part 'section' we have to link half_verses to their containing verses
        # to their containing chapters to their containing books. This is done on the basis of monad embedding.
        # An object is embedded in another object if and only if the monads of the first object
        # are a subset of the monads of the second object.
        # We make an ordered list of section objects according to the embedding/before relationships.
        # Based on this list we can efficiently construct the relations between the section objects.
        # The list hierarchy holds the (outer, inner) pairs of object types taken from the configuration.
        # The process of creating the embedding is triggered by the config setting find_embedding.
        sort_objects = []
        hierarchy = []
        find_embedding = 'find_embedding' in part_settings

        # compute the embedding relationships that have to be generated, if needed
        if find_embedding:
            hierarchy_levels = part_settings['hierarchy'].split(" ")
            prev_level = ''
            for level in hierarchy_levels:
                if prev_level == '': prev_level = level
                else:
                    hierarchy.append((prev_level, level))
                    prev_level = level

        # initialize generation of primary data if needed
        if do_primary:
            primary_handle = self.lf.primary_handle
            region_handle = self.lf.file_handles[self.settings.annotation_regions['name']][0]
            primary_feature = self.settings.meta['primary']
            trailer_feature = self.settings.meta['trailer']
            add_verse_newline = self.settings.meta['verse_newline'] == '1'
        if separate_node_file: plain_handle = self.lf.file_handles[''][0]

        # read the monad char index if needed
        if use_index:
            gminmonad = None
            gmaxmonad = None
            with open(index_path, 'r') as index_in:
                for line in index_in:
                    (m, start, end_word, end_trailer) = line.rstrip("\n").split("\t")
                    # keys and values stay strings, exactly as read from the index file
                    monad_chars[m] = (start, end_word, end_trailer)
                    if gminmonad is None or int(m) < int(gminmonad): gminmonad = m
                    if gmaxmonad is None or int(m) > int(gmaxmonad): gmaxmonad = m
            print("INFO: Monad-char index has {} items, from monad {} to monad {}".format(len(monad_chars), gminmonad, gmaxmonad))

        # initialize some counters
        n_line = 0       # number of current line in input file
        n_unmatched = 0  # number of illegal lines in input file that has been encountered
        n_nonword = 0    # number of non-word regions that has been generated (used to construct identifiers for such regions)
        n_af = 0         # number of feature annotations that has been created (used to construct identifiers for annotations)
        n_en = 0         # number of edges that has been created (used to construct identifiers for edges)
        n_f = 0          # total number of features <f> elements created
        n_n = 0          # total number of nodes
        n_a = 0          # total number of annotations
        n_e = 0          # total number of edges
        n_r = 0          # total number of regions
        n_m = 0          # total number of monads
        o = 0            # current output char position (used to create regions)

        # fetch template info
        region_elem = self.lf.template['region_elem']
        region_types = self.settings.annotation_regions
        region_word = region_types['word']
        region_punct = region_types['punct']
        region_section = region_types['section']
        annotation_elem = self.lf.template['annotation_elem']
        node_elem = self.lf.template['node_elem']
        edge_elem = self.lf.template['edge_elem']
        edgenode_elem = self.lf.template['edgenode_elem']
        feature_elem = self.lf.template['feature_elem'].rstrip("\n")

        # fetch object list, subparts
        object_types = self.et.object_list_part(part)
        subparts = sorted(self.lf.file_handles.keys())

        # fetch additional information
        db_label = self.settings.annotation_label['db_label']
        annotation_label = self.settings.annotation_label[part + '_label']
        monad_type = self.settings.parts['monad']['object_type']

        # set up a dictionary to collect statistics of annotation label usage
        stats = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
        statskind = stats[annotation_label]
        statsdb = stats[db_label]

        # fetch additional feature info: what are the relational features?
        ref_features = set(self.et.list_ref_noskip())

        # We must compute as little as possible in each iteration below.
        # That is why we maintain the list with features outside the loop.
        # Whenever a line switches to another object type, we will get the list of features for that object type.
        # But most of the time, the object type is the same as the one of the previous line.
        prev_object_type = ''
        generate_nodes = True
        prev_features = {}

        # Both handles are opened inside the try block so that they are closed
        # again even when processing a line raises an exception.
        file_handle = None
        index_handle = None
        try:
            # open the input file with raw data, and the index file if we have to create it
            file_handle = open(self.et.raw_file(part), encoding = 'utf-8')
            if make_index: index_handle = open(index_path, 'w')

            #####################################################
            # HERE STARTS THE MAIN LOOP
            #####################################################
            for line in file_handle:
                # there are some non-informational lines in the input, they have a very small length, we skip them
                if len(line) <= 5: continue

                # we maintain the line number, and ever so often we print the current line number to the terminal
                # by way of progress indication
                n_line += 1
                if n_line % 1000 == 0: sys.stderr.write("\rINFO: " + str(n_line))

                # Get all the information from the line.
                # Do some sanity first: convert escaped raw byte codes to real utf and insert xml escapes.
                # The ampersand must be escaped first, otherwise the other entities would be escaped twice.
                line_norm = uniscan.sub(makeuni, line).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

                # Get OBJECT IDENTIFIER
                match = infoscan.search(line_norm)
                if not match:
                    # count the line, so that the summary after the loop can report it
                    n_unmatched += 1
                    print("ERROR: Unmatched line: {}: {}".format(n_line, line.rstrip("\n")))
                    continue
                object_type, object_id, monads = match.group(1, 2, 3)
                if do_primary: stats[object_type][''] += 1

                # Set up list of FEATURE NAMES per subpart
                # but only when the object type has changed!
                do_get_features = False
                if prev_object_type != object_type:
                    generate_nodes = (not no_monad_nodes) or object_type != monad_type
                    prev_features = {}
                    prev_object_type = object_type
                    this_subpart = self.et.the_subpart(part, object_type)
                    do_get_features = True

                # Get FEATURES
                features = featurescan.findall(line_norm)
                feature_dict = dict(features)

                # Get MONADS
                regions = []
                monad_num = -1
                minmonad = -1
                maxmonad = -1
                monads = monads.replace(' ','')

                # When we know that there is a single monad do it quickly.
                # This occurs when the object type is the monad type.
                if do_primary or object_type == monad_type:
                    if not monads.isdigit():
                        print("ERROR: Multiple monad numbers {} on line {}".format(monads, n_line))
                        continue
                    monad_num = int(monads)
                    regions = [monad_num]
                    # strings, like in the other branch: the keys of monad_chars are strings too
                    minmonad = str(monad_num)
                    maxmonad = str(monad_num)
                else:
                    # For other object types, we have to build a list of all monads
                    # specified by a comma separated list of ranges
                    ranges = monads.split(",")
                    for rng in ranges:
                        endpoints = rng.split("-")
                        br = endpoints[0]
                        er = br
                        if len(endpoints) == 2: er = endpoints[1]
                        regions += range(int(br), int(er) + 1)
                    minmonad = str(min(regions))
                    maxmonad = str(max(regions))

                # Maybe we have to stop:
                if monad_limit and int(maxmonad) > monad_limit: break

                # Begin PRIMARY DATA generation
                # Get PRIMARY DATA
                # First we extract the primary data from the features.
                # What needs to come out is a word and a trailer (spacing, punctuation and
                # other characters that must be appended).
                if do_primary:
                    if primary_feature:
                        text_xml = feature_dict[primary_feature]
                        if trailer_feature:
                            trailer_xml = feature_dict[trailer_feature]
                        else:
                            trailer_xml = ' '
                    else:
                        text_xml, trailer_xml = primary_data(feature_dict['text'], feature_dict['suffix'])
                        feature_dict['text'] = text_xml
                        feature_dict['suffix'] = trailer_xml
                    # The primary data is plain text: undo the xml escapes.
                    # The ampersand goes last, so that an escaped "&lt;" is not unescaped twice.
                    text = text_xml.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
                    trailer = trailer_xml.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')

                    # Construct PRIMARY DATA
                    # Now we build up the primary data file and the regions file.
                    # We have to keep count of the char position of the data in the primary data file,
                    # so compute the length of the unicode strings that we need to write to the primary data.
                    ltext = len(text)
                    ltrailer = len(trailer)
                    n_m += 1
                    if make_index: index_handle.write("{}\t{}\t{}\t{}\n".format(monad_num, o, o + ltext, o + ltext + ltrailer))

                    # Every unit to be written contains exactly one word, plus a trailer which contains
                    # spacing, punctuations and final codes. This is accumulated in p_data.
                    # The current char position is maintained in o.
                    #
                    # For every word and trailer we create a <region> element that indicates its char position.
                    # This is accumulated in r_data.
                    # There is always a word inside.
                    r_data = region_elem.format(xmlid = "{}_{}".format(region_word, monads), start = o, end = o + ltext)
                    n_r += 1
                    o += ltext
                    p_data = text

                    # Possibly there is a trailer, some trailers end a verse, there we insert a newline
                    if trailer != '':
                        p_data += trailer
                        n_nonword += 1
                        r_data += region_elem.format(xmlid = "{}_{}".format(region_punct, n_nonword), start = o, end = o + ltrailer)
                        n_r += 1
                        o += ltrailer
                        if add_verse_newline and '׃' in trailer:
                            o += 1
                            p_data += "\n"
                    r_data += "\n"

                    # When all is constructed, we write it away to the appropriate files
                    # (having been opened by the initialization of the Laf class).
                    primary_handle.write(p_data)
                    region_handle.write(r_data)
                # END PRIMARY DATA generation

                # COMPUTE THE NODES FOR THE SEPARATE NODE FILE IF NEEDED
                if separate_node_file and generate_nodes:
                    n_data = ''
                    if object_type == monad_type:
                        n_data = node_elem.format(
                            part = part[0], akind = db_label, aid = object_id, xmlid = object_id,
                            monads = monads, minmonad = minmonad, maxmonad = maxmonad, objectid = object_id, objecttype=object_type,
                            region = "{}_{}".format(region_word, monads)
                        )
                    else:
                        n_data = node_elem.format(
                            part = part[0], akind = db_label, aid = object_id, xmlid = object_id,
                            monads = monads, minmonad = minmonad, maxmonad = maxmonad, objectid = object_id, objecttype=object_type,
                            region = " ".join("{}_{}".format(region_word, str(x)) for x in regions)
                        )
                    statsdb[''] += 1
                    n_data += "\n"
                    n_n += 1
                    n_f += 4 # the node_elem template adds 4 separate features: monads, minmonad, maxmonad, objectid
                    plain_handle.write(n_data)

                the_subparts = subparts if do_primary else [this_subpart]
                for subpart in the_subparts:
                    # Construct REGIONS, NODES, ANNOTATIONS, EDGES if needed, per subpart
                    if subpart == '' and do_primary: continue
                    # The regions-only subpart has its own file, so there is nothing to add for it here.
                    if subpart == region_types['name']: continue

                    if do_get_features:
                        features = self.et.feature_list_subpart(part, subpart, object_type)
                        prev_features[subpart] = features
                    else: features = prev_features[subpart]

                    r_data = '' # region data
                    n_data = '' # node data
                    a_data = '' # annotation data
                    f_data = '' # feature data
                    e_data = '' # edge data

                    # Construct REGIONS
                    # only construct regions if use_index is in the configuration file for this part
                    r_id = ''
                    if use_index:
                        minchar = 0
                        maxchar = 0
                        minmonad = str(min(regions))
                        if minmonad not in monad_chars:
                            print("WARNING: line {}: min monad {} not in monad_char index, taking monad {} instead.".format(n_line, minmonad, gminmonad))
                            minmonad = gminmonad
                        minchar = monad_chars[minmonad][0]
                        if maxmonad not in monad_chars:
                            print("WARNING: line {}: max monad {} not in monad_char index, taking monad {} instead.".format(n_line, maxmonad, gmaxmonad))
                            maxmonad = gmaxmonad
                        maxchar = monad_chars[maxmonad][2]
                        r_id = "{}_{}".format(region_section, object_id)
                        r_data = region_elem.format(xmlid = r_id, start = minchar, end = maxchar)
                        n_r += 1

                    # Construct FEATURE data
                    # Relational features translate to EDGES
                    has_real_features = False
                    for feature in features:
                        if feature in ref_features:
                            value = feature_dict[feature]
                            if value != "0" and value != "" and value != " ":
                                value = value.strip()
                                values = value.split(" ")
                                for val in values:
                                    n_en += 1
                                    e_data += edgenode_elem.format(part = part[0], eid = n_en, fr = object_id, to = val, aid = n_en, akind = annotation_label, fname = feature, value = '')
                                    n_e += 1
                                    stats[feature][subpart] += 1
                        else:
                            has_real_features = True
                            # a newline inside an attribute value must be written as a character reference
                            f_data += "\n\t" + feature_elem.format(name = feature, value = feature_dict[feature].replace('\n', '&#xa;'))
                            n_f += 1

                    # Prepare EDGES for hierarchical sections
                    # We collect a list of all objects with the first and last monad as extra information
                    # This list will be sorted later
                    if find_embedding: sort_objects.append((min(regions), max(regions), object_type, object_id))

                    n_af += 1

                    # Construct NODES if needed
                    if not separate_node_file:
                        if has_real_features:
                            if use_index:
                                n_data = node_elem.format(
                                    part = part[0], akind = db_label, aid = object_id, xmlid = object_id,
                                    monads = monads, minmonad = minmonad, maxmonad = maxmonad, objectid = object_id, objecttype=object_type,
                                    region = r_id
                                )
                            else:
                                n_data = node_elem.format(
                                    part = part[0], akind = db_label, aid = object_id, xmlid = object_id,
                                    monads = monads, minmonad = minmonad, maxmonad = maxmonad, objectid = object_id, objecttype=object_type,
                                    region = " ".join("{}_{}".format(region_word, str(x)) for x in regions)
                                )
                            statsdb[subpart] += 1
                            n_n += 1
                            n_f += 4 # the node_elem template adds 4 separate features: monads, minmonad, maxmonad, objectid

                    # Construct ANNOTATIONS
                    if has_real_features:
                        a_data = annotation_elem.format(part = part[0], xmlid = n_af, akind = annotation_label, objectid = object_id, features = f_data)
                        n_a += 1

                    outline = r_data + n_data + a_data + e_data
                    if outline:
                        sub_handles[subpart][0].write(outline + "\n")
                        stats[object_type][subpart] += 1
                        statskind[subpart] += 1
            #####################################################
            # HERE ENDS THE MAIN LOOP
            #####################################################
        finally:
            if index_handle is not None: index_handle.close()
            if file_handle is not None: file_handle.close()

        print('')

        # Generate additional EDGES based on whether objects are monad-wise included in each other
        if find_embedding:
            print("INFO: Creating embedding relations")
            sorted_objects = sorted(sort_objects, key=interval)
            for (outer, inner) in hierarchy:
                print("INFO: Generating embedding of {} in {}".format(inner, outer))
                cur_outer_id = -1
                for (start, end, object_type, object_id) in sorted_objects:
                    if object_type == outer:
                        cur_outer_id = object_id
                        continue
                    if object_type == inner:
                        if int(cur_outer_id) < 0: print("ERROR: {} with id {} not inside any {}".format(inner, object_id, outer))
                        else:
                            n_en += 1
                            e_data = edge_elem.format(part = part[0], eid = n_en, fr = cur_outer_id, to = object_id)
                            n_e += 1
                            sub_handles[''][0].write(e_data)

        # Transfer the statistics information to the LAF object, so that the annotation files can be finalised properly
        for ob in stats:
            for sp in stats[ob]: self.lf.stats["{}.{}".format(sp, ob)] = stats[ob][sp]
        for subpart in subparts:
            self.lf.stats["{}.{}".format(subpart, db_label)] = statsdb[subpart]
            if subpart == '' and do_primary: continue
            if subpart == region_types['name']: continue
            self.lf.stats["{}.{}".format(subpart, annotation_label)] = statskind[subpart]

        # Fill in general overall statistics
        self.lf.gstats['a Number of regions'] += n_r
        self.lf.gstats['b Number of nodes'] += n_n
        self.lf.gstats['c Number of edges'] += n_e
        self.lf.gstats['d Number of annotations'] += n_a
        self.lf.gstats['e Number of features'] += n_f
        self.lf.gstats['f Number of monads'] += n_m
        self.lf.gstats['g Number of chars in primary data'] += o

        if n_unmatched: print("ERROR: {} Unmatched lines".format(n_unmatched))
        else: print("INFO: {} object lines".format(n_line))
def primary_data(text, trailer):
    ''' Distil primary data from two features on the word objects.

    When the text ends in the character ׀, every trailing occurrence of it is
    taken off the text, and a space followed by a single ׀ is put in front of
    the trailer. Otherwise both values are passed on untouched.

    Returns the tuple (text, trailer).
    '''
    if not text.endswith('׀'):
        return (text, trailer)
    stripped = text.rstrip('׀')
    moved = ' ׀' + trailer
    return (stripped, moved)
def makeuni(match):
    ''' Make proper unicode of a text that contains byte escape codes such as backslash xb6.

    match: a regular expression match whose complete text (group 0) is a run of
        escapes, each consisting of a backslash, an x and two hexadecimal digits.

    The escaped bytes are decoded as UTF-8 and the resulting string is returned.

    Raises ValueError when the matched text is not a clean run of hexadecimal
    escapes, and UnicodeDecodeError when the bytes are not valid UTF-8.
    '''
    escaped = match.group(0)
    # The text comes straight from the input file, so it is parsed explicitly
    # instead of being handed to eval.
    if not re.fullmatch(r'(?:\\x[0-9a-fA-F]{2})+', escaped):
        raise ValueError("Malformed byte escape sequence: {!r}".format(escaped))
    byts = bytes.fromhex(escaped.replace('\\x', ''))
    return byts.decode('utf-8')