RSAN ValueError in python code for extraction of relation and entity extraction

Question

RSAN ValueError in python code for extraction of relation and entity extraction

49 Views Asked by ahmad At 19 January 2023 at 12:12

I am trying to run the python project RSAN, I installed all required libraries but when I start to run code from data_prepare.py it continuously gives me this I tried to be solved but still not solve it.

*Traceback (most recent call last):
  File "C:\Users\Usuario\Desktop\faiz papers\new\RSAN-master\data_prepare.py", line 256, in <module>
    Prepare.prepare()
  File "C:\Users\Usuario\Desktop\faiz papers\new\RSAN-master\data_prepare.py", line 52, in prepare
    dev_f, dev_l = self.process_dev_test(self.dev_data)
  File "C:\Users\Usuario\Desktop\faiz papers\new\RSAN-master\data_prepare.py", line 119, in process_dev_test
    return np.array(features), np.array(sen_len)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (5000, 4) + inhomogeneous part.
*

please let me know if somebody know its solution.

Solution for Value Error by managing the array value, but I don't understand where to set these values and how.

data_prepare.py code and error line mentioned with (# line 256 error****************************)

#encoding=utf-8
import config
import json
import nltk
import os
import numpy as np
import six
from six.moves import cPickle

def pickle_load(f):
    if six.PY3:
        return cPickle.load(f, encoding='latin-1')
    else:
        return cPickle.load(f)

def pickle_dump(obj, f):
    if six.PY3:
        return cPickle.dump(obj, f, protocol=2)
    else:
        return cPickle.dump(obj, f)


class DataPrepare(object):

    def __init__(self, opt):
        self.opt = opt
        vocab = np.load(opt.input_vocab)
        self.word2id = {j: i for i, j in enumerate(vocab)}
        self.id2word = {i: j for i, j in enumerate(vocab)}
        self.rel2id = json.load(open(opt.input_rel2id, 'r'))
        self.label2id = json.load(open(opt.input_label2id, 'r'))
        self.pos2id = json.load(open(opt.input_pos2id, 'r'))
        self.char2id = json.load(open(opt.input_char2id, 'r'))
        self.train_data = self.read_json(opt.input_train)
        self.test_data = self.read_json(opt.input_test)
        self.dev_data = self.read_json(opt.input_dev)

    def prepare(self):
        print('loading data ...')

        train_pos_f, train_pos_l, train_neg_f, train_neg_l = self.process_train(self.train_data)
        with open(os.path.join(''+self.opt.root, 'train_pos_features.pkl'), 'wb') as f:
            pickle_dump(train_pos_f, f)
        with open(os.path.join(''+self.opt.root, 'train_pos_len.pkl'), 'wb') as f:
            pickle_dump(train_pos_l, f)
        with open(os.path.join('' + self.opt.root, 'train_neg_features.pkl'), 'wb') as f:
            pickle_dump(train_neg_f, f)
        with open(os.path.join('' + self.opt.root, 'train_neg_len.pkl'), 'wb') as f:
            pickle_dump(train_neg_l, f)
        print('finish')

        dev_f, dev_l = self.process_dev_test(self.dev_data) # line 52 error******************************
        np.save(os.path.join(''+self.opt.root, 'dev_features.npy'), dev_f, allow_pickle=True)
        np.save(os.path.join(''+self.opt.root, 'dev_len.npy'), dev_l, allow_pickle=True)

        test_f, test_l = self.process_dev_test(self.test_data)
        np.save(os.path.join('' + self.opt.root, 'test_features.npy'), test_f, allow_pickle=True)
        np.save(os.path.join('' + self.opt.root, 'test_len.npy'), test_l, allow_pickle=True)

    def read_json(self, filename):
        data = []
        with open('' + filename, 'r') as f:
            for line in f.readlines():
                data.append(json.loads(line))
        return data

    def find_pos(self, sent_list, word_list):
        '''
        return position list
        '''
        l = len(word_list)
        for i in range(len(sent_list)):
            flag = True
            j = 0
            while j < l:
                if word_list[j] != sent_list[i + j]:
                    flag = False
                    break
                j += 1
            if flag:
                return range(i, i+l)
        return []

    def process_dev_test(self, dataset):
        features = []
        sen_len = []
        for i, data in enumerate(dataset):
            sent_text = data['sentText']
            sent_words, sent_ids, pos_ids, sent_chars, cur_len = self.process_sentence(sent_text)
            entities = data['entityMentions']
            raw_triples_ = data['relationMentions']
            # 去重
            triples_list = []
            for t in raw_triples_:
                triples_list.append((t['em1Text'], t['em2Text'], t['label']))
            triples_ = list(set(triples_list))
            triples_.sort(key=triples_list.index)

            triples = []
            for triple in triples_:
                head, tail, relation = triple
                try:
                    if triple[2] != 'None':
                        head_words = nltk.word_tokenize(head + ',')[:-1]
                        head_pos = self.find_pos(sent_words, head_words)
                        tail_words = nltk.word_tokenize(tail + ',')[:-1]
                        tail_pos = self.find_pos(sent_words, tail_words)
                        h_chunk = ('H', head_pos[0], head_pos[-1] + 1)
                        t_chunk = ('T', tail_pos[0], tail_pos[-1] + 1)
                        triples.append((h_chunk, t_chunk, self.rel2id[relation]))
                except:
                    continue


            features.append([sent_ids, pos_ids, sent_chars, triples])
            sen_len.append(cur_len)
            if (i + 1) * 1.0 % 10000 == 0:
                print('finish %f, %d/%d' % ((i + 1.0) / len(dataset), (i + 1), len(dataset)))
        return np.array(features), np.array(sen_len) # line 119 error****************************

    def process_train(self, dataset):
        positive_features = []
        positive_lens = []
        negative_features = []
        negative_lens = []
        c = 0
        for i, data in enumerate(dataset):
            positive_feature = []
            positive_len = []
            negative_feature = []
            negative_len = []
            sent_text = data['sentText']
            # sent_chars : (max_len, max_word_len)
            sent_words, sent_ids, pos_ids, sent_chars, cur_len = self.process_sentence(sent_text)
            entities_ = data['entityMentions']
            entities = []
            for e_ in entities_:
                entities.append(e_['text'])

            raw_triples_ = data['relationMentions']
            # 去重
            triples_list = []
            for t in raw_triples_:
                triples_list.append((t['em1Text'], t['em2Text'], t['label']))
            triples_ = list(set(triples_list))
            triples_.sort(key=triples_list.index)

            triples = []
            cur_relations_list = []
            cur_relations_list.append(0)
            for triple in triples_:
                cur_relations_list.append(self.rel2id[triple[2]])
                head, tail, relation = triple
                try:
                    if triple[2] != 'None':
                        head_words = nltk.word_tokenize(head + ',')[:-1]
                        head_pos = self.find_pos(sent_words, head_words)
                        tail_words = nltk.word_tokenize(tail + ',')[:-1]
                        tail_pos = self.find_pos(sent_words, tail_words)
                        h_chunk = ('H', head_pos[0], head_pos[-1] + 1)
                        t_chunk = ('T', tail_pos[0], tail_pos[-1] + 1)
                        triples.append((h_chunk, t_chunk, self.rel2id[relation]))
                except:
                    continue

            cur_relations = list(set(cur_relations_list))
            cur_relations.sort(key=cur_relations_list.index)

            if len(cur_relations) == 1 and cur_relations[0] == 0:
                continue
            c += 1
            none_label = ['O'] * cur_len + ['X'] * (self.opt.max_len - cur_len)
            all_labels = {} #['O'] * self.max_len

            for triple in triples_:
                head, tail, relation = triple
                rel_id = self.rel2id[relation]
                #cur_label = none_label.copy()
                cur_label = all_labels.get(rel_id, none_label.copy())
                if triple[2] != 'None':
                    #label head
                    head_words = nltk.word_tokenize(head + ',')[:-1]
                    head_pos = self.find_pos(sent_words, head_words)
                    try:
                        if len(head_pos) == 1:
                            cur_label[head_pos[0]] = 'S-H'
                        elif len(head_pos) >= 2:
                            cur_label[head_pos[0]] = 'B-H'
                            cur_label[head_pos[-1]] = 'E-H'
                            for ii in range(1, len(head_pos)-1):
                                cur_label[head_pos[ii]] = 'I-H'
                    except:
                        continue

                    #label tail
                    tail_words = nltk.word_tokenize(tail + ',')[:-1]
                    tail_pos = self.find_pos(sent_words, tail_words)
                    try:
                        # not overlap enntity
                        if len(tail_pos) == 1:
                            cur_label[tail_pos[0]] = 'S-T'
                        elif len(tail_pos) >= 2:
                            cur_label[tail_pos[0]] = 'B-T'
                            cur_label[tail_pos[-1]] = 'E-T'
                            for ii in range(1, len(tail_pos)-1):
                                cur_label[tail_pos[ii]] = 'I-T'

                    except:
                        continue
                    all_labels[rel_id] = cur_label
            for ii in all_labels.keys():
                cur_label_ids = [self.label2id[e] for e in all_labels[ii]]
                positive_feature.append([sent_ids, ii, cur_label_ids, pos_ids, sent_chars])
                #positive_triple.append()
                positive_len.append(cur_len)

            none_label_ids = [self.label2id[e] for e in none_label]
            for r_id in range(self.opt.rel_num):
                if r_id not in cur_relations:
                    negative_feature.append([sent_ids, r_id, none_label_ids, pos_ids, sent_chars])
                    negative_len.append(cur_len)
            if (i + 1) * 1.0 % 10000 == 0:
                print('finish %f, %d/%d' % ((i + 1.0) / len(dataset), (i + 1), len(dataset)))
            positive_features.append(positive_feature)
            positive_lens.append(positive_len)
            negative_features.append(negative_feature)
            negative_lens.append(negative_len)
        print(c)

        return positive_features, positive_lens, negative_features, negative_lens


    def process_sentence(self, sent_text):
        sent_words = nltk.word_tokenize(sent_text)
        sen_len = min(len(sent_words), self.opt.max_len)
        sent_pos = nltk.pos_tag(sent_words)
        sent_pos_ids = [self.pos2id.get(pos[1], 1) for pos in sent_pos][:sen_len]
        sent_ids = [self.word2id.get(w, 1) for w in sent_words][:sen_len]

        sent_chars = []
        for w in sent_words[:sen_len]:
            tokens = [self.char2id.get(token, 1) for token in list(w)]
            word_len = min(len(tokens), self.opt.max_word_len)
            for _ in range(self.opt.max_word_len - word_len):
                tokens.append(0)
            sent_chars.append(tokens[: self.opt.max_word_len])

        for _ in range(sen_len, self.opt.max_len):
            sent_ids.append(0)
            sent_pos_ids.append(0)
            sent_chars.append([0] * self.opt.max_word_len)
        return sent_words[:sen_len], sent_ids, sent_pos_ids, sent_chars, sen_len

opt = config.parse_opt()
Prepare = DataPrepare(opt)
Prepare.prepare() # line 256 error****************************

Original Q&A

RSAN ValueError in python code for extraction of relation and entity extraction

There are 0 best solutions below

Related Questions in PYTHON

Related Questions in DEEP-LEARNING

Related Questions in VALUEERROR

Related Questions in INFORMATION-EXTRACTION

Trending Questions

Popular # Hahtags

Popular Questions