[Relation Extraction - R-BERT] Loading the Dataset

A look at the dataset. Each line of the data file holds a relation label and a sentence separated by a tab; the two entities are marked inline with <e1> … </e1> and <e2> … </e2>:

Component-Whole(e2,e1)	The system as described above has its greatest application in an arrayed <e1> configuration </e1> of antenna <e2> elements </e2>.
Other	The <e1> child </e1> was carefully wrapped and bound into the <e2> cradle </e2> by means of a cord.
Instrument-Agency(e2,e1)	The <e1> author </e1> of a keygen uses a <e2> disassembler </e2> to look at the raw assembly code.
Other	A misty <e1> ridge </e1> uprises from the <e2> surge </e2>.
Member-Collection(e1,e2)	The <e1> student </e1> <e2> association </e2> is the voice of the undergraduate student population of the State University of New York at Buffalo.
Other	This is the sprawling <e1> complex </e1> that is Peru's largest <e2> producer </e2> of silver.
Cause-Effect(e2,e1)	The current view is that the chronic <e1> inflammation </e1> in the distal part of the stomach caused by Helicobacter pylori <e2> infection </e2> results in an increased acid production from the non-infected upper corpus region of the stomach.
Entity-Destination(e1,e2)	<e1> People </e1> have been moving back into <e2> downtown </e2>.
Content-Container(e1,e2)	The <e1> lawsonite </e1> was contained in a <e2> platinum crucible </e2> and the counter-weight was a plastic crucible with metal pieces.
Entity-Destination(e1,e2)	The solute was placed inside a beaker and 5 mL of the <e1> solvent </e1> was pipetted into a 25 mL glass <e2> flask </e2> for each trial.
Member-Collection(e1,e2)	The fifty <e1> essays </e1> collected in this <e2> volume </e2> testify to most of the prominent themes from Professor Quispel's scholarly career.
Other	Their <e1> composer </e1> has sunk into <e2> oblivion </e2>.

This is the SemEval-2010 Task 8 dataset. For a detailed introduction, see: https://blog.csdn.net/qq_29883591/article/details/88567561
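Before going into the loading code, here is a minimal sketch (my own illustration, not part of the repository) of how the inline <e1>/<e2> markup of one such sentence can be pulled apart with regular expressions:

import re

def split_entities(sentence):
    """Return (entity1, entity2, plain_text) for a SemEval-style marked sentence."""
    e1 = re.search(r"<e1>(.*?)</e1>", sentence).group(1).strip()
    e2 = re.search(r"<e2>(.*?)</e2>", sentence).group(1).strip()
    # Strip the four marker tokens and normalize the leftover whitespace
    plain = re.sub(r"</?e[12]>", "", sentence)
    return e1, e2, " ".join(plain.split())

sent = "The <e1> child </e1> was carefully wrapped and bound into the <e2> cradle </e2> by means of a cord."
print(split_entities(sent))
# ('child', 'cradle', 'The child was carefully wrapped and bound into the cradle by means of a cord.')

The R-BERT code below does not strip the markers this way; it keeps them in the token sequence so it can locate the entity spans, as shown next.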

The data-loading and preprocessing code, data_loader.py:

import copy
import csv
import json
import logging
import os

import torch
from torch.utils.data import TensorDataset

from utils import get_label

logger = logging.getLogger(__name__)


class InputExample(object):
    """
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    """

    def __init__(self, guid, text_a, label):
        self.guid = guid
        self.text_a = text_a
        self.label = label

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class InputFeatures(object):
    """
    A single set of features of data.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``: usually ``1`` for tokens that are
            NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, label_id, e1_mask, e2_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.label_id = label_id
        self.e1_mask = e1_mask
        self.e2_mask = e2_mask

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class SemEvalProcessor(object):
    """Processor for the SemEval data set"""

    def __init__(self, args):
        self.args = args
        self.relation_labels = get_label(args)

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[1]  # the marked-up sentence
            label = self.relation_labels.index(line[0])  # label string -> label index
            if i % 1000 == 0:
                logger.info(line)
            examples.append(InputExample(guid=guid, text_a=text_a, label=label))
        return examples

    def get_examples(self, mode):
        """
        Args:
            mode: train, dev, test
        """
        file_to_read = None
        if mode == "train":
            file_to_read = self.args.train_file
        elif mode == "dev":
            file_to_read = self.args.dev_file
        elif mode == "test":
            file_to_read = self.args.test_file

        logger.info("LOOKING AT {}".format(os.path.join(self.args.data_dir, file_to_read)))
        return self._create_examples(self._read_tsv(os.path.join(self.args.data_dir, file_to_read)), mode)


processors = {"semeval": SemEvalProcessor}


def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    cls_token="[CLS]",
    cls_token_segment_id=0,
    sep_token="[SEP]",
    pad_token=0,
    pad_token_segment_id=0,
    sequence_a_segment_id=0,
    add_sep_token=False,
    mask_padding_with_zero=True,
):
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_a = tokenizer.tokenize(example.text_a)

        e11_p = tokens_a.index("<e1>")  # the start position of entity1
        e12_p = tokens_a.index("</e1>")  # the end position of entity1
        e21_p = tokens_a.index("<e2>")  # the start position of entity2
        e22_p = tokens_a.index("</e2>")  # the end position of entity2

        # Replace the marker tokens: entity1 is wrapped with "$", entity2 with "#"
        tokens_a[e11_p] = "$"
        tokens_a[e12_p] = "$"
        tokens_a[e21_p] = "#"
        tokens_a[e22_p] = "#"

        # Add 1 because of the [CLS] token
        e11_p += 1
        e12_p += 1
        e21_p += 1
        e22_p += 1

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        if add_sep_token:
            special_tokens_count = 2
        else:
            special_tokens_count = 1
        if len(tokens_a) > max_seq_len - special_tokens_count:
            tokens_a = tokens_a[: (max_seq_len - special_tokens_count)]

        tokens = tokens_a
        if add_sep_token:
            tokens += [sep_token]

        token_type_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        token_type_ids = [cls_token_segment_id] + token_type_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        # e1 mask, e2 mask: 1 over the positions covered by each entity (markers included)
        e1_mask = [0] * len(attention_mask)
        e2_mask = [0] * len(attention_mask)

        for i in range(e11_p, e12_p + 1):
            e1_mask[i] = 1
        for i in range(e21_p, e22_p + 1):
            e2_mask[i] = 1

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(
            len(token_type_ids), max_seq_len
        )

        label_id = int(example.label)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))
            logger.info("e1_mask: %s" % " ".join([str(x) for x in e1_mask]))
            logger.info("e2_mask: %s" % " ".join([str(x) for x in e2_mask]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label_id=label_id,
                e1_mask=e1_mask,
                e2_mask=e2_mask,
            )
        )

    return features


def load_and_cache_examples(args, tokenizer, mode):
    processor = processors[args.task](args)

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            mode,
            args.task,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            args.max_seq_len,
        ),
    )

    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if mode == "train":
            examples = processor.get_examples("train")
        elif mode == "dev":
            examples = processor.get_examples("dev")
        elif mode == "test":
            examples = processor.get_examples("test")
        else:
            raise Exception("For mode, Only train, dev, test is available")

        features = convert_examples_to_features(
            examples, args.max_seq_len, tokenizer, add_sep_token=args.add_sep_token
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long)  # add e1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long)  # add e2 mask

    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(
        all_input_ids,
        all_attention_mask,
        all_token_type_ids,
        all_label_ids,
        all_e1_mask,
        all_e2_mask,
    )
    return dataset

The code above relies on the get_label function from utils.py:

import os


def get_label(args):
    # Read one relation label per line from the label file under data_dir
    return [label.strip() for label in open(os.path.join(args.data_dir, args.label_file), "r", encoding="utf-8")]
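In other words, the label file simply lists one relation label per line, and SemEvalProcessor._create_examples maps each label string to its index in that list. A small sketch (the file contents shown in the comments are an assumed excerpt, not the full label set):

# Assumed excerpt of data/label.txt, one label per line:
#   Other
#   Cause-Effect(e1,e2)
#   Cause-Effect(e2,e1)
#   ...
from argparse import Namespace

from utils import get_label

args = Namespace(data_dir="./data", label_file="label.txt")
relation_labels = get_label(args)
print(relation_labels.index("Cause-Effect(e2,e1)"))  # e.g. 2 for the excerpt above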
