import glob
import json
import re
import sys

import numpy as np
from nltk.stem.snowball import SnowballStemmer


# Shared English Snowball stemmer, used to reduce words to their stems.
snowball = SnowballStemmer("english")
# Matches runs of characters that are NOT lowercase a-z, digits, space or hyphen.
re_alphanumeric = re.compile('[^a-z0-9 -]+')
# Matches runs of one or more space characters.
re_multispace = re.compile(' +')



def processLine(line):
    """Return a normalized copy of ``line``.

    The text is lowercased, every character other than ``a-z``, ``0-9``,
    space and hyphen is deleted (not replaced by a space), and each run of
    spaces is then collapsed to a single space. Leading or trailing spaces
    are not stripped.
    """
    return re_multispace.sub(' ', re_alphanumeric.sub('', line.lower()))

def processWord(word):
    """Normalize ``word`` and return its Snowball stem.

    Normalization is: lowercase, delete every character other than ``a-z``,
    ``0-9``, space and hyphen, then collapse runs of spaces to one space.
    The cleaned text is passed to the module-level English stemmer.
    """
    cleaned = re_alphanumeric.sub('', word.lower())
    squeezed = re_multispace.sub(' ', cleaned)
    return snowball.stem(squeezed)

def listSplitPlot(fd):
    """Read ``fd`` to EOF and return its lines as lists of normalized tokens.

    ``fd`` must provide ``readline()`` returning ``''`` at end of input.
    Trailing ``\\r``/``\\n`` characters are stripped from each line; lines
    that are then at most one character long are skipped. Every remaining
    line is normalized with ``processLine`` and split on whitespace, so a
    line consisting only of removed characters yields an empty list.
    """
    tokenized = []
    # iter(callable, sentinel) keeps calling fd.readline() until it returns ''.
    for raw in iter(fd.readline, ''):
        text = raw.strip('\r\n')
        if len(text) <= 1:
            continue
        tokenized.append(processLine(text).split())
    return tokenized


def _load_json(path):
    """Parse the JSON file at ``path`` and close the file afterwards."""
    with open(path) as in_file:
        return json.load(in_file)


def _dump_json(obj, path):
    """Serialize ``obj`` as JSON to ``path``, closing (and flushing) the file."""
    with open(path, 'w') as out_file:
        json.dump(obj, out_file)


def _embed_words(words, glove_dict, register, try_stem):
    """Map each word to a vector; return ``(vectors, glove_hits)``.

    Lookup order per word: the word itself in ``glove_dict``; if
    ``try_stem`` is true, its Snowball stem in ``glove_dict``; the word in
    ``register``; otherwise a list of 300 zeros. ``glove_hits`` counts only
    the words resolved through ``glove_dict`` (directly or via the stem).
    """
    vectors = []
    glove_hits = 0
    for word in words:
        if word in glove_dict:
            glove_hits += 1
            vectors.append(glove_dict[word])
            continue
        if try_stem:
            # Stem lazily: only needed when the surface form is unknown.
            word_stem = snowball.stem(word)
            if word_stem in glove_dict:
                glove_hits += 1
                vectors.append(glove_dict[word_stem])
                continue
        if word in register:
            vectors.append(register[word])
        else:
            # Unknown word: fresh zero vector (presumably matching the
            # 300-d GloVe vectors -- confirm against the embedding file).
            vectors.append([0] * 300)
    return vectors, glove_hits


def compute():
    """Convert the QA set to word vectors and write one JSON file per split.

    Reads ``../word_vec/register.json``, ``../raw_data/question/qa.json`` and
    ``../word_vec/glove.42B.300d.json``. For every QA entry the ``qid``,
    ``correct_index``, ``imdb_key`` and ``video_clips`` fields are copied
    unchanged, while ``question`` and each string in ``answers`` are
    normalized with ``processLine``, split into words and replaced by lists
    of word vectors. Question words additionally fall back to their stem in
    the GloVe table; answer words do not (kept as in the original pipeline).

    An entry goes to the first of ``train``/``val``/``test`` that occurs as a
    substring of its ``qid``. The results are written to
    ``../output_data/question/qa.{train,val,test}.json`` and word/split
    counts are printed.

    Raises:
        SystemExit: with message ``[ERROR]`` if a ``qid`` contains none of
            the split names; no output file is written in that case.
    """
    register = _load_json('../word_vec/register.json')
    QA = _load_json('../raw_data/question/qa.json')
    # Load the embedding table fully, then release the file handle before
    # the (long) conversion loop starts.
    glove_dict = _load_json('../word_vec/glove.42B.300d.json')

    total_words = 0
    glove_words = 0

    # Checked in this order, so e.g. a qid containing both 'train' and
    # 'val' is filed under 'train'.
    split_names = ('train', 'val', 'test')
    split_vecs = {name: [] for name in split_names}

    for qa in QA:
        qa_vec = {
            'qid': qa['qid'],
            'correct_index': qa['correct_index'],
            'imdb_key': qa['imdb_key'],
            'video_clips': qa['video_clips'],
        }

        question = processLine(qa['question']).split()
        question_vec, hits = _embed_words(question, glove_dict, register, True)
        total_words += len(question)
        glove_words += hits
        qa_vec['question'] = question_vec

        answers_vec = []
        for answer in qa['answers']:
            option = processLine(answer).split()
            option_vec, hits = _embed_words(option, glove_dict, register, False)
            total_words += len(option)
            glove_words += hits
            answers_vec.append(option_vec)
        qa_vec['answers'] = answers_vec

        for name in split_names:
            if name in qa['qid']:
                split_vecs[name].append(qa_vec)
                break
        else:
            # `sys` is imported at the top of the file; previously this line
            # raised NameError instead of exiting with the message.
            sys.exit("[ERROR]")

    _dump_json(split_vecs['train'], '../output_data/question/qa.train.json')
    _dump_json(split_vecs['val'], '../output_data/question/qa.val.json')
    _dump_json(split_vecs['test'], '../output_data/question/qa.test.json')
    print('all: ',total_words,'  glove: ',glove_words)
    print('train: ',len(split_vecs['train']),'  val: ',len(split_vecs['val']),'  test: ',len(split_vecs['test']))

# Run the conversion as soon as this file is executed. There is no
# ``__main__`` guard, so importing the module triggers it as well.
compute()

# NOTE: the lines below are stray web-page search text, not part of the script.
# results matching ""
#
#     No results matching ""