RSAN ValueError in python code for extraction of relation and entity extraction

49 Views Asked by At

I am trying to run the python project RSAN, I installed all required libraries but when I start to run code from data_prepare.py it continuously gives me this I tried to be solved but still not solve it.

*Traceback (most recent call last):
  File "C:\Users\Usuario\Desktop\faiz papers\new\RSAN-master\data_prepare.py", line 256, in <module>
    Prepare.prepare()
  File "C:\Users\Usuario\Desktop\faiz papers\new\RSAN-master\data_prepare.py", line 52, in prepare
    dev_f, dev_l = self.process_dev_test(self.dev_data)
  File "C:\Users\Usuario\Desktop\faiz papers\new\RSAN-master\data_prepare.py", line 119, in process_dev_test
    return np.array(features), np.array(sen_len)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (5000, 4) + inhomogeneous part.
*

please let me know if somebody know its solution.

Solution for Value Error by managing the array value, but I don't understand where to set these values and how.

data_prepare.py code and error line mentioned with (# line 256 error****************************)

#encoding=utf-8
import config
import json
import nltk
import os
import numpy as np
import six
from six.moves import cPickle

def pickle_load(f):
    if six.PY3:
        return cPickle.load(f, encoding='latin-1')
    else:
        return cPickle.load(f)

def pickle_dump(obj, f):
    if six.PY3:
        return cPickle.dump(obj, f, protocol=2)
    else:
        return cPickle.dump(obj, f)


class DataPrepare(object):

    def __init__(self, opt):
        self.opt = opt
        vocab = np.load(opt.input_vocab)
        self.word2id = {j: i for i, j in enumerate(vocab)}
        self.id2word = {i: j for i, j in enumerate(vocab)}
        self.rel2id = json.load(open(opt.input_rel2id, 'r'))
        self.label2id = json.load(open(opt.input_label2id, 'r'))
        self.pos2id = json.load(open(opt.input_pos2id, 'r'))
        self.char2id = json.load(open(opt.input_char2id, 'r'))
        self.train_data = self.read_json(opt.input_train)
        self.test_data = self.read_json(opt.input_test)
        self.dev_data = self.read_json(opt.input_dev)

    def prepare(self):
        print('loading data ...')

        train_pos_f, train_pos_l, train_neg_f, train_neg_l = self.process_train(self.train_data)
        with open(os.path.join(''+self.opt.root, 'train_pos_features.pkl'), 'wb') as f:
            pickle_dump(train_pos_f, f)
        with open(os.path.join(''+self.opt.root, 'train_pos_len.pkl'), 'wb') as f:
            pickle_dump(train_pos_l, f)
        with open(os.path.join('' + self.opt.root, 'train_neg_features.pkl'), 'wb') as f:
            pickle_dump(train_neg_f, f)
        with open(os.path.join('' + self.opt.root, 'train_neg_len.pkl'), 'wb') as f:
            pickle_dump(train_neg_l, f)
        print('finish')

        dev_f, dev_l = self.process_dev_test(self.dev_data) # line 52 error******************************
        np.save(os.path.join(''+self.opt.root, 'dev_features.npy'), dev_f, allow_pickle=True)
        np.save(os.path.join(''+self.opt.root, 'dev_len.npy'), dev_l, allow_pickle=True)

        test_f, test_l = self.process_dev_test(self.test_data)
        np.save(os.path.join('' + self.opt.root, 'test_features.npy'), test_f, allow_pickle=True)
        np.save(os.path.join('' + self.opt.root, 'test_len.npy'), test_l, allow_pickle=True)

    def read_json(self, filename):
        data = []
        with open('' + filename, 'r') as f:
            for line in f.readlines():
                data.append(json.loads(line))
        return data

    def find_pos(self, sent_list, word_list):
        '''
        return position list
        '''
        l = len(word_list)
        for i in range(len(sent_list)):
            flag = True
            j = 0
            while j < l:
                if word_list[j] != sent_list[i + j]:
                    flag = False
                    break
                j += 1
            if flag:
                return range(i, i+l)
        return []

    def process_dev_test(self, dataset):
        features = []
        sen_len = []
        for i, data in enumerate(dataset):
            sent_text = data['sentText']
            sent_words, sent_ids, pos_ids, sent_chars, cur_len = self.process_sentence(sent_text)
            entities = data['entityMentions']
            raw_triples_ = data['relationMentions']
            # 去重
            triples_list = []
            for t in raw_triples_:
                triples_list.append((t['em1Text'], t['em2Text'], t['label']))
            triples_ = list(set(triples_list))
            triples_.sort(key=triples_list.index)

            triples = []
            for triple in triples_:
                head, tail, relation = triple
                try:
                    if triple[2] != 'None':
                        head_words = nltk.word_tokenize(head + ',')[:-1]
                        head_pos = self.find_pos(sent_words, head_words)
                        tail_words = nltk.word_tokenize(tail + ',')[:-1]
                        tail_pos = self.find_pos(sent_words, tail_words)
                        h_chunk = ('H', head_pos[0], head_pos[-1] + 1)
                        t_chunk = ('T', tail_pos[0], tail_pos[-1] + 1)
                        triples.append((h_chunk, t_chunk, self.rel2id[relation]))
                except:
                    continue


            features.append([sent_ids, pos_ids, sent_chars, triples])
            sen_len.append(cur_len)
            if (i + 1) * 1.0 % 10000 == 0:
                print('finish %f, %d/%d' % ((i + 1.0) / len(dataset), (i + 1), len(dataset)))
        return np.array(features), np.array(sen_len) # line 119 error****************************

    def process_train(self, dataset):
        positive_features = []
        positive_lens = []
        negative_features = []
        negative_lens = []
        c = 0
        for i, data in enumerate(dataset):
            positive_feature = []
            positive_len = []
            negative_feature = []
            negative_len = []
            sent_text = data['sentText']
            # sent_chars : (max_len, max_word_len)
            sent_words, sent_ids, pos_ids, sent_chars, cur_len = self.process_sentence(sent_text)
            entities_ = data['entityMentions']
            entities = []
            for e_ in entities_:
                entities.append(e_['text'])

            raw_triples_ = data['relationMentions']
            # 去重
            triples_list = []
            for t in raw_triples_:
                triples_list.append((t['em1Text'], t['em2Text'], t['label']))
            triples_ = list(set(triples_list))
            triples_.sort(key=triples_list.index)

            triples = []
            cur_relations_list = []
            cur_relations_list.append(0)
            for triple in triples_:
                cur_relations_list.append(self.rel2id[triple[2]])
                head, tail, relation = triple
                try:
                    if triple[2] != 'None':
                        head_words = nltk.word_tokenize(head + ',')[:-1]
                        head_pos = self.find_pos(sent_words, head_words)
                        tail_words = nltk.word_tokenize(tail + ',')[:-1]
                        tail_pos = self.find_pos(sent_words, tail_words)
                        h_chunk = ('H', head_pos[0], head_pos[-1] + 1)
                        t_chunk = ('T', tail_pos[0], tail_pos[-1] + 1)
                        triples.append((h_chunk, t_chunk, self.rel2id[relation]))
                except:
                    continue

            cur_relations = list(set(cur_relations_list))
            cur_relations.sort(key=cur_relations_list.index)

            if len(cur_relations) == 1 and cur_relations[0] == 0:
                continue
            c += 1
            none_label = ['O'] * cur_len + ['X'] * (self.opt.max_len - cur_len)
            all_labels = {} #['O'] * self.max_len

            for triple in triples_:
                head, tail, relation = triple
                rel_id = self.rel2id[relation]
                #cur_label = none_label.copy()
                cur_label = all_labels.get(rel_id, none_label.copy())
                if triple[2] != 'None':
                    #label head
                    head_words = nltk.word_tokenize(head + ',')[:-1]
                    head_pos = self.find_pos(sent_words, head_words)
                    try:
                        if len(head_pos) == 1:
                            cur_label[head_pos[0]] = 'S-H'
                        elif len(head_pos) >= 2:
                            cur_label[head_pos[0]] = 'B-H'
                            cur_label[head_pos[-1]] = 'E-H'
                            for ii in range(1, len(head_pos)-1):
                                cur_label[head_pos[ii]] = 'I-H'
                    except:
                        continue

                    #label tail
                    tail_words = nltk.word_tokenize(tail + ',')[:-1]
                    tail_pos = self.find_pos(sent_words, tail_words)
                    try:
                        # not overlap enntity
                        if len(tail_pos) == 1:
                            cur_label[tail_pos[0]] = 'S-T'
                        elif len(tail_pos) >= 2:
                            cur_label[tail_pos[0]] = 'B-T'
                            cur_label[tail_pos[-1]] = 'E-T'
                            for ii in range(1, len(tail_pos)-1):
                                cur_label[tail_pos[ii]] = 'I-T'

                    except:
                        continue
                    all_labels[rel_id] = cur_label
            for ii in all_labels.keys():
                cur_label_ids = [self.label2id[e] for e in all_labels[ii]]
                positive_feature.append([sent_ids, ii, cur_label_ids, pos_ids, sent_chars])
                #positive_triple.append()
                positive_len.append(cur_len)

            none_label_ids = [self.label2id[e] for e in none_label]
            for r_id in range(self.opt.rel_num):
                if r_id not in cur_relations:
                    negative_feature.append([sent_ids, r_id, none_label_ids, pos_ids, sent_chars])
                    negative_len.append(cur_len)
            if (i + 1) * 1.0 % 10000 == 0:
                print('finish %f, %d/%d' % ((i + 1.0) / len(dataset), (i + 1), len(dataset)))
            positive_features.append(positive_feature)
            positive_lens.append(positive_len)
            negative_features.append(negative_feature)
            negative_lens.append(negative_len)
        print(c)

        return positive_features, positive_lens, negative_features, negative_lens


    def process_sentence(self, sent_text):
        sent_words = nltk.word_tokenize(sent_text)
        sen_len = min(len(sent_words), self.opt.max_len)
        sent_pos = nltk.pos_tag(sent_words)
        sent_pos_ids = [self.pos2id.get(pos[1], 1) for pos in sent_pos][:sen_len]
        sent_ids = [self.word2id.get(w, 1) for w in sent_words][:sen_len]

        sent_chars = []
        for w in sent_words[:sen_len]:
            tokens = [self.char2id.get(token, 1) for token in list(w)]
            word_len = min(len(tokens), self.opt.max_word_len)
            for _ in range(self.opt.max_word_len - word_len):
                tokens.append(0)
            sent_chars.append(tokens[: self.opt.max_word_len])

        for _ in range(sen_len, self.opt.max_len):
            sent_ids.append(0)
            sent_pos_ids.append(0)
            sent_chars.append([0] * self.opt.max_word_len)
        return sent_words[:sen_len], sent_ids, sent_pos_ids, sent_chars, sen_len

opt = config.parse_opt()
Prepare = DataPrepare(opt)
Prepare.prepare() # line 256 error****************************

0

There are 0 best solutions below