-
Notifications
You must be signed in to change notification settings - Fork 0
/
mydataset.py
93 lines (72 loc) · 3.2 KB
/
mydataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- encoding: utf-8 -*-
'''
@File : mydataset.py
@Time : 2024/06/04 11:17:36
@Author : Feng zhixin
@Description : build dataset for classification
'''
# Library imports
from torch.utils.data import Dataset
import os
import json
import torch
class ClassificationDataset(Dataset):
    """Dataset of raw ``(cmd, cluster)`` pairs read from a JSON Lines file.

    Every non-blank line of the input file is parsed with ``json.loads`` and
    kept in memory. Items are returned untokenized; each record must provide
    a ``'cmd'`` key and a ``'cluster'`` key.
    """

    def __init__(self, file_path):
        """Load all records of ``file_path`` into ``self.data``.

        Args:
            file_path: Path to a file holding one JSON document per line.

        Raises:
            ValueError: If ``file_path`` is not an existing regular file.
        """
        if not os.path.isfile(file_path):
            raise ValueError(f"Input file path {file_path} not found")
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would make json.loads fail, so they are skipped.
            self.data = [json.loads(row) for row in f if row.strip()]

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(cmd, cluster)`` pair stored at ``index``."""
        record = self.data[index]
        label = record['cluster']
        cmd_block = record['cmd']
        return cmd_block, label
class Colab_ClassificationDataset(Dataset):
    """Dataset that tokenizes JSON Lines records into fixed-length tensors.

    Every non-blank line of the input file is parsed with ``json.loads`` and
    kept in memory. On access, the record's ``'cmd'`` entries are joined with
    ``' [SEP] '``, encoded by the supplied tokenizer, and returned as tensors
    together with the record's ``'cluster'`` value.
    """

    def __init__(self, file_path, tokenizer, max_length):
        """Load all records and remember the tokenizer settings.

        Args:
            file_path: Path to a file holding one JSON document per line.
            tokenizer: Object exposing a Hugging Face style ``encode_plus``.
            max_length: Length every encoded sequence is padded/truncated to.

        Raises:
            ValueError: If ``file_path`` is not an existing regular file.
        """
        if not os.path.isfile(file_path):
            raise ValueError(f"Input file path {file_path} not found")
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would make json.loads fail, so they are skipped.
            self.data = [json.loads(row) for row in f if row.strip()]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the record at ``index``.

        Returns:
            A dict with ``'ids'``, ``'mask'`` and ``'token_type_ids'`` as
            ``torch.long`` tensors and ``'targets'`` as a ``torch.float``
            tensor built from the record's ``'cluster'`` value.
        """
        record = self.data[index]
        label = record['cluster']  # cluster id of this sample
        cmd_block = record['cmd']  # commands of this sample
        # Join the individual commands with the [SEP] special token.
        # NOTE(review): assumes 'cmd' is a list of strings; a plain string
        # would be split into characters by join() - confirm against the data.
        cmd_text = ' [SEP] '.join(cmd_block)
        # padding='max_length' plus truncation=True replaces the deprecated
        # pad_to_max_length flag and guarantees every sample has exactly
        # max_length tokens, which batching with fixed-size tensors needs.
        cmd_inputs = self.tokenizer.encode_plus(
            cmd_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = cmd_inputs['input_ids']
        mask = cmd_inputs['attention_mask']
        token_type_ids = cmd_inputs['token_type_ids']
        # NOTE(review): targets are float as in the original code; a loss
        # expecting class indices would need long - confirm with the trainer.
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.float),
        }
if __name__ == "__main__":
    # Manual smoke test for Colab_ClassificationDataset: build a BERT
    # tokenizer from a local vocabulary file, wrap the test split and print
    # every encoded sample.
    from transformers import BertTokenizer

    vocab_path = '/home/log_generation/4_BERT/myBert/processed_data/alarm_test_vocab.txt'
    data_path = '/home/log_generation/4_BERT/myBert/processed_data/alarm_test.json'
    max_len = 128  # shared by the tokenizer limit and the dataset padding length

    tokenizer = BertTokenizer(
        vocab_file=vocab_path,
        do_lower_case=False,
        do_basic_tokenize=False,
    )
    tokenizer.model_max_length = max_len
    print("******** Tokenizer ********\n", tokenizer)

    dataset = Colab_ClassificationDataset(data_path, tokenizer, max_len)
    for sample in (dataset[pos] for pos in range(len(dataset))):
        print(sample)