# -*- coding: utf-8 -*-
# Parsing a single XML file, like the Pavlova project, split into blocks. Output is written into the same folder and is presumably deleted afterwards down the pipeline.

import datetime, json, os, Preprocessing, sys, xml.dom.minidom as minidom
# Wall-clock start; the script's closing timing message is computed from it.
startTimeX2J = datetime.datetime.now()
# Work from the script's own directory. This happens before the input path is
# checked, so a relative -i path is resolved against this directory.
os.chdir(os.path.abspath(os.path.dirname(__file__)))
args = sys.argv
# sys.argv is [script, '-i', <dir>]: the script name plus two real arguments.
# Raised explicitly (instead of via `assert`) so the validation still runs
# under `python -O`; AssertionError is kept so failures surface as before.
if len(args) != 3:
    raise AssertionError("Expected exactly 2 arguments: -i followed by input directory path")
# The directory must follow '-i'. Guard the lookup so that a trailing '-i'
# is reported as an invalid directory rather than as an IndexError.
flagPos = args.index('-i') if '-i' in args else -1
if not (0 <= flagPos < len(args) - 1 and os.path.exists(args[flagPos + 1])):
    raise AssertionError("Invalid input directory")

path = args[flagPos + 1]
# Single-argument parenthesised form: prints exactly what `print path` did.
print(path)

# Convert every *.xml file found directly in `path` into a sibling *.json
# file of the form {'witnesses': [{'id': ..., 'tokens': [...]}, ...]}, with
# one witness entry per <lem>/<rdg> element of the XML document.
xmls = [name for name in os.listdir(path) if name.endswith('.xml')]
total = len(xmls)
for count, afile in enumerate(xmls, 1):
    print('XMLToJSON.py: Processing %s file %s out of %s' % (afile, count, total))
    unit = Preprocessing.parseName(afile)
    root = {}
    alldocs = []
    # Match on localName so that namespace-prefixed lem/rdg elements are found too.
    document = minidom.parse(os.path.join(path, afile))
    rdgs = [el for el in document.getElementsByTagName('*') if el.localName in ('lem', 'rdg')]
    for rdg in rdgs:
        docLevel = {}
        docLevel['id'] = rdg.getAttribute('wit')
        tokenList = []
        for currentWord in rdg.getElementsByTagName('w'):
            # Skip <w> elements that have no text node as a direct child.
            if not any(child.nodeType == minidom.Node.TEXT_NODE for child in currentWord.childNodes):
                continue
            token = {}
            # 't' is the serialised content of the <w> element (text plus any
            # inline markup). It is built from the children instead of slicing
            # fixed offsets off the element's own XML, so it stays correct
            # when the start tag is not exactly `<w n="...">`.
            token['t'] = ''.join(child.toxml() for child in currentWord.childNodes)
            # NOTE: a disabled experiment used to append '1' to 'n' when it
            # equalled the previous word's conflated form (suspected repeated
            # word in the manuscript); it is not applied.
            token['n'] = Preprocessing.conflate(currentWord)
            token['u'] = unit
            tokenList.append(token)
        docLevel['tokens'] = tokenList
        alldocs.append(docLevel)
    root['witnesses'] = alldocs
    # Same base name as the input, with the 'xml' suffix swapped for 'json'.
    with open(os.path.join(path, afile[:-3] + 'json'), 'w') as Json:
        Json.write(json.dumps(root, ensure_ascii=False).encode('utf-8'))
print('Took %s to execute XMLToJSON.py' % (datetime.datetime.now() - startTimeX2J,))