# -*- coding: utf-8 -*-
# Minas Abovyan
# This script is called by others, not run by itself. Callers mainly use the
# conflate function (the original note said "collate"), which uses the rest.
import re
import xml.dom.minidom as minidom

# Splits a string into alternating runs of digits and non-digits.
numberSplitter = re.compile(r'\d+|\D+')


def removeElementTags(element, parent):
    """Replace every <element> found below parent with its first child.

    Turns "text <tag>text1</tag> text2" into "text text1 text2". Only the
    first child node of each matched element is kept; an element without
    children is simply removed.
    """
    for child in parent.getElementsByTagName(element):
        # getElementsByTagName returns all descendants, so the replacement
        # has to happen on the node's real parent, not on `parent` itself.
        holder = child.parentNode
        if holder is None:
            continue
        if child.firstChild is None:
            holder.removeChild(child)
        else:
            holder.replaceChild(child.firstChild, child)


def deleteElements(element, parent):
    """Remove every <element> found below parent together with its content.

    Turns "text <tag>text1</tag> text2" into "text  text2".
    """
    for child in parent.getElementsByTagName(element):
        # Detach from the actual parent so nested matches do not raise.
        holder = child.parentNode
        if holder is not None:
            holder.removeChild(child)


def choose(choice):
    """Resolve a choice element in place, keeping the second alternative.

    The mapping below pairs the alternative that is dropped (key) with the
    one that is kept (value). For two <seg> children the first is dropped
    and the second is unwrapped.

    Raises Exception when none of the known alternatives is a child of
    choice.
    """
    # given the choice tag, take the second one of the options (value in
    # this dictionary)
    choices = {'sic': 'corr', 'orig': 'reg', 'abbr': 'expan', 'seg': 'seg'}
    present = [child.localName for child in choice.childNodes]
    for ch in choices:
        if ch not in present:
            continue
        if ch == 'seg':
            # Drop the first <seg> element itself; choice.firstChild may be
            # a whitespace text node rather than the first option.
            firstSeg = [node for node in choice.childNodes
                        if node.localName == 'seg'][0]
            choice.removeChild(firstSeg)
            removeElementTags('seg', choice)
        else:
            rejected = choice.getElementsByTagName(ch)[0]
            replacement = choice.getElementsByTagName(choices[ch])[0].firstChild
            if replacement is None:
                # The kept alternative is empty: nothing to move over.
                rejected.parentNode.removeChild(rejected)
            else:
                rejected.parentNode.replaceChild(replacement, rejected)
            deleteElements(choices[ch], choice)
        return
    raise Exception("I don't know what to do with this choice element: " + choice.toxml())


def splitTagsFromText(string):
    """Create a generator that splits given string into xml tags and text.

    'a<b>c</b>d' yields 'a', '<b>', 'c', '</b>', 'd'. It is a generator, so
    call it as list(splitTagsFromText(string)) when a list is needed.
    """
    result = []
    stack = []
    for char in string:
        if char == '<':
            if result and not stack:
                # Text collected before this tag is complete: emit it.
                yield ''.join(result)
                result = []
            result.append(char)
            stack.append('>')
            continue
        result.append(char)
        # A '>' with no open '<' is treated as ordinary text.
        if char == '>' and stack:
            stack.pop()
            if not stack:
                yield ''.join(result)
                result = []
    if result:
        yield ''.join(result)


def stripPunct(string):
    """Remove punctuation from a given string and lowercase the text.

    Characters between '<' and '>' are copied unaltered.
    """
    punct = u'“̈҃ⸯ·҇!#$%&=\'()*+,-.:;?@[\\]^_`{|}~”'
    assemble = []
    inTag = False
    for char in string:
        if char == '<':
            inTag = True
        elif char == '>':
            inTag = False
        if inTag:
            # add char to the output unaltered, if it's inside an xml tag
            assemble.append(char)
        elif char not in punct:
            # not part of a tag and not punctuation: keep it, lowercased
            assemble.append(char.lower())
    return ''.join(assemble)


def _ruleText(rule, tagName):
    """Return the text of the first <tagName> below rule ('' when empty)."""
    node = rule.getElementsByTagName(tagName)[0].firstChild
    if node is None:
        return ''
    return node.nodeValue


def applyRule(word, ruleSet):
    """Helper function to conflate. Applies rules from the conflation file.

    ruleSet is a list of rule elements, each holding an <in> and an <out>
    element. When the rules' parent element is 'oneToOne', every character
    of <in> is replaced by <out>; otherwise the whole <in> string is
    replaced by <out>. All rules are applied, in order, and the rewritten
    word is returned. An empty ruleSet returns word unchanged.
    """
    if not ruleSet:
        return word
    perCharacter = ruleSet[0].parentNode.localName == 'oneToOne'
    for rule in ruleSet:
        source = _ruleText(rule, 'in')
        target = _ruleText(rule, 'out')
        if not source:
            # Replacing the empty string would insert target everywhere.
            continue
        if perCharacter:
            for char in source:
                word = word.replace(char, target)
        else:
            word = word.replace(source, target)
    return word


def degeminate(word):
    """Helper function to conflate. Degeminates words.

    Runs of the same character are reduced to a single character; the empty
    string is returned unchanged.
    """
    if not word:
        return word
    output = [word[0]]
    for previous, char in zip(word, word[1:]):
        if char != previous:
            output.append(char)
    return ''.join(output)


def padWithXs(word):
    """Helper function to conflate. Makes every word exactly 4 chars long.

    Spaces are removed first; shorter words are padded on the right with
    'X', longer ones are cut off after the fourth character.
    """
    return word.replace(' ', '').ljust(4, 'X')[:4]


# Letters used for the units, tens and hundreds places of a number.
digits = {'1': u'а', '2': u'в', '3': u'г', '4': u'д', '5': u'е', '6': u'ѕ', '7': u'ӡ', '8': u'і', '9': u'ѳ'}
tens = {'1': u'і', '2': u'к', '3': u'л', '4': u'м', '5': u'н', '6': u'ѯ', '7': u'о', '8': u'п', '9': u'ч'}
hundreds = {'1': u'р', '2': u'с', '3': u'т', '4': u'у', '5': u'ф', '6': u'х', '7': u'ѱ', '8': u'ѡ', '9': u'ц'}


def cyrrilizeNumber(num):
    """Convert a string of digits 1-9/0 into letters from the tables above.

    Zeroes produce no output, and places beyond the fifth are ignored. The
    result is assembled back to front and reversed at the end.
    """
    cyr = ''
    for place, char in enumerate(num[::-1], 1):
        if char == '0':
            continue
        if place == 1:
            cyr += digits[char]
        elif place == 2:
            cyr += tens[char]
        elif place == 3:
            cyr += hundreds[char]
        elif place == 4:
            # Appended after the letter so that it precedes it once reversed.
            cyr += digits[char]
            cyr += u'҂'
        elif place == 5:
            # Appended before the letter so that it follows it once reversed.
            cyr += u'⃝'
            cyr += digits[char]
    return cyr[::-1]


def getNumber(subpart):
    """Strip leading zeroes from one part of a filename.

    An empty or all-zero part becomes '0'. Parts joined by '-' are handled
    one by one and re-joined with '/'.
    """
    if '-' in subpart:
        return '/'.join([getNumber(i) for i in subpart.split('-')])
    return subpart.lstrip('0') or '0'


def parseName(f):
    """Generate u values from a filename.

    The last four characters (the extension) are dropped, the rest is split
    on '_', the first piece is discarded and the remaining pieces are passed
    through getNumber and joined with ','.
    """
    pieces = f[:-4].split('_')
    return ','.join([getNumber(i) for i in pieces[1:]])


# Parsed rule data shared by all conflate() calls; filled on first use.
_conflationData = None


def _loadConflationData():
    """Parse soundex-rules.xml and vowels.xml once and return their content."""
    global _conflationData
    if _conflationData is None:
        rules = minidom.parse(r'soundex-rules.xml')
        vowels = minidom.parse(r'vowels.xml')
        manyToOne = rules.getElementsByTagName('manyToOne')[0].getElementsByTagName('set')
        oneToMany = rules.getElementsByTagName('oneToMany')[0].getElementsByTagName('set')
        oneToOne = rules.getElementsByTagName('oneToOne')[0].getElementsByTagName('set')
        vowelList = frozenset(
            [v.firstChild.nodeValue for v in vowels.getElementsByTagName('vowel')])
        specialVowels = vowels.getElementsByTagName('special')[0].getElementsByTagName('vowel')
        specialList = frozenset([v.firstChild.nodeValue for v in specialVowels])
        _conflationData = (manyToOne, oneToMany, oneToOne, specialList, vowelList)
    return _conflationData


def conflate(w):
    """Execute conflation rules in a given order.

    w is a DOM element holding one word. Its markup is simplified in place,
    its text is extracted, stripped of punctuation and lowercased, numbers
    are turned into letters, the rules from soundex-rules.xml are applied,
    vowels listed in vowels.xml are dropped (the first character is always
    kept), repeated characters are collapsed and the result is padded or cut
    to four characters. Returns 'PUNC' when no text is left after removing
    punctuation.
    """
    # NOTE(review): only the first element matching one of the branches
    # below is handled, because each branch ends in `break`. The original
    # comment marks this function as under reconstruction, so the behaviour
    # is kept as found - confirm whether every element should be processed.
    for kid in list(w.getElementsByTagName('*')):
        elName = kid.localName
        parent = kid.parentNode
        if elName in ['add', 'hi', 'unclear']:
            removeElementTags(elName, parent)
            break
        elif elName in ['del', 'gap', 'lacuna', 'lb', 'pb']:
            deleteElements(elName, parent)
            break
        elif elName == 'choice':
            choose(kid)
            break
    removeElementTags('choice', w)

    manyToOne, oneToMany, oneToOne, specialList, vowelList = _loadConflationData()

    # Keep only the text between tags.
    wlist = [piece for piece in splitTagsFromText(w.toxml())
             if not piece.startswith('<')]
    word = stripPunct(''.join(wlist)).strip()
    if len(word) == 0:
        return 'PUNC'

    splitNumbers = numberSplitter.findall(word)
    if len(splitNumbers) > 1 or splitNumbers[0].isdigit():
        converted = []
        for group in splitNumbers:
            if group.isdigit():
                group = cyrrilizeNumber(group)
            converted.append(group)
        word = ''.join(converted)

    # apply rules as specified in soundex-rules.xml
    word = applyRule(word, manyToOne)
    word = applyRule(word, oneToMany)
    word = applyRule(word, oneToOne)

    if not word:
        # Nothing is left (for example the number "0"): only padding remains.
        return padWithXs(word)

    # entirely eliminating vowels in the special category from all words,
    # except in first position
    word = word[0] + ''.join([char for char in word[1:]
                              if char not in specialList])

    # degeminate words, get rid of noninitial vowels
    degeminated = degeminate(word)
    # Keep the first character even if it's a vowel; after that append only
    # consonants.
    newWord = word[0] + ''.join([char for char in degeminated[1:]
                                 if char not in vowelList])
    return padWithXs(newWord)