你的位置:首页 > 信息动态 > 新闻中心
信息动态
联系我们

NLP——常见任务的批量加载2.0

2021/12/24 0:24:01

NLP——常见任务的批量加载2.0

目标:针对NLP子任务,如文本分类、命名实体识别、文本匹配、关系抽取等,如何使用keras批量加载训练集、验证集或测试集,来提升训练或预测效率?

参考博客:NLP——如何批量加载数据

1、NER任务的数据生成器

import numpy as np
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.tokenizers import Tokenizer
from loguru import logger


def load_ner_data(filename):
    """
    功能: 加载数据, 单条格式:[text, (start, end, label), (start, end, label), ...],
         意味着text[start:end + 1]是类型为label的实体。
    :param filename: 数据路径
    :return:
    """
    ner_data, label_map = [], set()
    with open(filename, "r", encoding="utf-8") as file:
        f = file.read()
        for l in f.split("\n\n"):
            if not l:
                continue
            d = [""]
            for i, c in enumerate(l.split("\n")):
                char, flag = c.split(" ")
                d[0] += char
                if flag[0] == "B":
                    d.append([i, i, flag[2:]])
                    label_map.add(flag[2:])
                elif flag[0] == "I":
                    d[-1][1] = i
            ner_data.append(d)
    return ner_data, label_map


class NerDataGenerator(DataGenerator):
    def __init__(self, max_length, dict_path, data, categories, batch_size, buffer_size):
        # 子类继承父类, 并进行初始化
        super(NerDataGenerator, self).__init__(data=data, batch_size=batch_size, buffer_size=buffer_size)
        self.max_length = max_length
        self.tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self.categories = categories

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, d in self.sample(random):
            tokens = self.tokenizer.tokenize(d[0], maxlen=self.max_length)
            mapping = self.tokenizer.rematch(d[0], tokens)
            start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
            end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
            token_ids = self.tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            labels = np.zeros(len(token_ids), dtype=int)
            for start, end, label in d[1:]:
                if start in start_mapping and end in end_mapping:
                    start = start_mapping[start]
                    end = end_mapping[end]
                    labels[start] = self.categories.index(label) * 2 + 1
                    labels[start + 1:end + 1] = self.categories.index(label) * 2 + 2
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


if __name__ == "__main__":
    # 命名实体识别数据生成器
    train_data, categories = load_ner_data('./china-people-daily-ner-corpus/example.train')
    ner_data_generator = NerDataGenerator(max_length=128,
                                          dict_path="./vocab.txt",
                                          data=train_data,
                                          categories=list(sorted(categories)),
                                          batch_size=32,
                                          buffer_size=None)
    flag = -1
    for x_true, y_true in ner_data_generator:
        # 输出每个batch中的数据
        for idx in range(len(x_true[0])):
            flag += 1
            logger.info("第{0}条文本: \"{1}\"".format(flag, train_data[flag][0]))
            logger.info("word2id: {0}".format(str(list(x_true[0][idx]))))
            logger.info("mask: {0}".format(str(list(x_true[1][idx]))))
            logger.info("label: {0}\n\n".format(str(list(y_true[idx]))))
        break

运行结果(前10行):

2021-12-24 00:06:44.949 | INFO     | __main__:<module>:164 - 第0条文本: "海钓比赛地点在厦门与金门之间的海域。"
2021-12-24 00:06:44.950 | INFO     | __main__:<module>:165 - word2id: [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.950 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.950 | INFO     | __main__:<module>:167 - label: [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.950 | INFO     | __main__:<module>:164 - 第1条文本: "这座依山傍水的博物馆由国内一流的设计师主持设计,整个建筑群精美而恢宏。"
2021-12-24 00:06:44.950 | INFO     | __main__:<module>:165 - word2id: [101, 6821, 2429, 898, 2255, 988, 3717, 4638, 1300, 4289, 7667, 4507, 1744, 1079, 671, 3837, 4638, 6392, 6369, 2360, 712, 2898, 6392, 6369, 8024, 3146, 702, 2456, 5029, 5408, 5125, 5401, 5445, 2612, 2131, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.950 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.951 | INFO     | __main__:<module>:167 - label: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.951 | INFO     | __main__:<module>:164 - 第2条文本: "但作为一个共产党员、人民公仆,应当胸怀宽阔,真正做到“先天下之忧而忧,后天下之乐而乐”,淡化个人的名利得失和宠辱悲喜,把改革大业摆在首位,这样才能超越自我,摆脱世俗,有所作为。"
2021-12-24 00:06:44.951 | INFO     | __main__:<module>:165 - word2id: [101, 852, 868, 711, 671, 702, 1066, 772, 1054, 1447, 510, 782, 3696, 1062, 789, 8024, 2418, 2496, 5541, 2577, 2160, 7333, 8024, 4696, 3633, 976, 1168, 100, 1044, 1921, 678, 722, 2569, 5445, 2569, 8024, 1400, 1921, 678, 722, 727, 5445, 727, 100, 8024, 3909, 1265, 702, 782, 4638, 1399, 1164, 2533, 1927, 1469, 2143, 6802, 2650, 1599, 8024, 2828, 3121, 7484, 1920, 689, 3030, 1762, 7674, 855, 8024, 6821, 3416, 2798, 5543, 6631, 6632, 5632, 2769, 8024, 3030, 5564, 686, 921, 8024, 3300, 2792, 868, 711, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.951 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.951 | INFO     | __main__:<module>:167 - label: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.951 | INFO     | __main__:<module>:164 - 第3条文本: "在发达国家,急救保险十分普及,已成为社会保障体系的重要组成部分。"
2021-12-24 00:06:44.951 | INFO     | __main__:<module>:165 - word2id: [101, 1762, 1355, 6809, 1744, 2157, 8024, 2593, 3131, 924, 7372, 1282, 1146, 3249, 1350, 8024, 2347, 2768, 711, 4852, 833, 924, 7397, 860, 5143, 4638, 7028, 6206, 5299, 2768, 6956, 1146, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.951 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.951 | INFO     | __main__:<module>:167 - label: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.952 | INFO     | __main__:<module>:164 - 第4条文本: "日俄两国国内政局都充满变数,尽管日俄关系目前是历史最佳时期,但其脆弱性不言自明。"
2021-12-24 00:06:44.952 | INFO     | __main__:<module>:165 - word2id: [101, 3189, 915, 697, 1744, 1744, 1079, 3124, 2229, 6963, 1041, 4007, 1359, 3144, 8024, 2226, 5052, 3189, 915, 1068, 5143, 4680, 1184, 3221, 1325, 1380, 3297, 881, 3198, 3309, 8024, 852, 1071, 5546, 2483, 2595, 679, 6241, 5632, 3209, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.952 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.952 | INFO     | __main__:<module>:167 - label: [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.952 | INFO     | __main__:<module>:164 - 第5条文本: "克马尔的女儿让娜今年读五年级,她所在的班上有30多名同学,该班的“家委会”由10名家长组成。"
2021-12-24 00:06:44.952 | INFO     | __main__:<module>:165 - word2id: [101, 1046, 7716, 2209, 4638, 1957, 1036, 6375, 2025, 791, 2399, 6438, 758, 2399, 5277, 8024, 1961, 2792, 1762, 4638, 4408, 677, 3300, 8114, 1914, 1399, 1398, 2110, 8024, 6421, 4408, 4638, 100, 2157, 1999, 833, 100, 4507, 8108, 1399, 2157, 7270, 5299, 2768, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.952 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.952 | INFO     | __main__:<module>:167 - label: [0, 5, 6, 6, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.952 | INFO     | __main__:<module>:164 - 第6条文本: "参加步行的有男有女,有年轻人,也有中年人。"
2021-12-24 00:06:44.952 | INFO     | __main__:<module>:165 - word2id: [101, 1346, 1217, 3635, 6121, 4638, 3300, 4511, 3300, 1957, 8024, 3300, 2399, 6768, 782, 8024, 738, 3300, 704, 2399, 782, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.953 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.953 | INFO     | __main__:<module>:167 - label: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.953 | INFO     | __main__:<module>:164 - 第7条文本: "沙特队教练佩雷拉:两支队都想胜,因此都作出了最大的努力。"
2021-12-24 00:06:44.953 | INFO     | __main__:<module>:165 - word2id: [101, 3763, 4294, 7339, 3136, 5298, 877, 7440, 2861, 8038, 697, 3118, 7339, 6963, 2682, 5526, 8024, 1728, 3634, 6963, 868, 1139, 749, 3297, 1920, 4638, 1222, 1213, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.953 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.953 | INFO     | __main__:<module>:167 - label: [0, 3, 4, 4, 0, 0, 5, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.953 | INFO     | __main__:<module>:164 - 第8条文本: "这种混乱局面导致有些海域使用者的合法权益难以得到维护。"
2021-12-24 00:06:44.953 | INFO     | __main__:<module>:165 - word2id: [101, 6821, 4905, 3921, 744, 2229, 7481, 2193, 5636, 3300, 763, 3862, 1818, 886, 4500, 5442, 4638, 1394, 3791, 3326, 4660, 7410, 809, 2533, 1168, 5335, 2844, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.953 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.954 | INFO     | __main__:<module>:167 - label: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2021-12-24 00:06:44.954 | INFO     | __main__:<module>:164 - 第9条文本: "鲁宾明确指出,对政府的这种指控完全没有事实根据,美国政府不想也没有向中国转让敏感技术,事实真相总有一天会大白于天下;众议院的这种做法令人“非常失望”,将使美国的商业卫星产业受到威胁,使美国的竞争力受到损害。"
2021-12-24 00:06:44.954 | INFO     | __main__:<module>:165 - word2id: [101, 7826, 2161, 3209, 4802, 2900, 1139, 8024, 2190, 3124, 2424, 4638, 6821, 4905, 2900, 2971, 2130, 1059, 3766, 3300, 752, 2141, 3418, 2945, 8024, 5401, 1744, 3124, 2424, 679, 2682, 738, 3766, 3300, 1403, 704, 1744, 6760, 6375, 3130, 2697, 2825, 3318, 8024, 752, 2141, 4696, 4685, 2600, 3300, 671, 1921, 833, 1920, 4635, 754, 1921, 678, 8039, 830, 6379, 7368, 4638, 6821, 4905, 976, 3791, 808, 782, 100, 7478, 2382, 1927, 3307, 100, 8024, 2199, 886, 5401, 1744, 4638, 1555, 689, 1310, 3215, 772, 689, 1358, 1168, 2014, 5516, 8024, 886, 5401, 1744, 4638, 4993, 751, 1213, 1358, 1168, 2938, 2154, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.954 | INFO     | __main__:<module>:166 - mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2021-12-24 00:06:44.954 | INFO     | __main__:<module>:167 - label: [0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

说明:

  • example.train的部分数据格式如下:
海 O
钓 O
比 O
赛 O
地 O
点 O
在 O
厦 B-LOC
门 I-LOC
与 O
金 B-LOC
门 I-LOC
之 O
间 O
的 O
海 O
域 O
。 O

这 O
座 O
依 O
山 O
傍 O
水 O
的 O
博 O
物 O
馆 O
由 O
国 O
内 O
一 O
流 O
的 O
设 O
计 O
师 O
主 O
持 O
设 O
计 O
, O
整 O
个 O
建 O
筑 O
群 O
精 O
美 O
而 O
恢 O
宏 O
。 O
  • vocab.txt文件:大规模预训练模型的词典文件

2、基于R-drop的文本分类数据生成器

todo