forked from jinhojsk515/SPMM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
spmm_custom_dataset.py
125 lines (96 loc) · 4.96 KB
/
spmm_custom_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from torch.utils.data import Dataset
import torch
import random
import pandas as pd
from rdkit import Chem
import pickle
from rdkit import RDLogger
from calc_property import calculate_property
RDLogger.DisableLog('rdApp.*')
class SMILESDataset_SHIN_MLM(Dataset):
    """SMILES dataset that reports MLM first and HLM second.

    Rows come from a CSV that must contain a ``SMILES`` column. When a row
    also has both ``MLM`` and ``HLM`` entries, items carry those values (and
    ``HLM - MLM``) as tensors; otherwise an item is only the SMILES string.

    Train/validation splits are positional: a fold's validation set is the
    half-open row range ``validation_ranges[fold_num]`` and its training set
    is every other row of the file.
    """

    def __init__(
        self,
        data_path,
        mode: str = 'train',
        fold_num: int = 0,
        shuffle: bool = False,
    ) -> None:
        """Load the CSV and select the rows belonging to ``mode``.

        Args:
            data_path: CSV source, passed unchanged to ``pandas.read_csv``.
            mode: ``'train'``, ``'val'`` or ``'test'``.
            fold_num: Index into ``validation_ranges``; unused for ``'test'``.
            shuffle: If true, shuffle the selected rows once using the
                module-level ``random`` generator.

        Raises:
            AssertionError: If ``mode`` is not one of the accepted values.
            IndexError: If ``fold_num`` does not index ``validation_ranges``.
        """
        assert mode in ['train', 'val', 'test'], "Mode should be either 'train', 'val', or 'test'"
        super().__init__()

        frame = pd.read_csv(data_path)
        # One Series per row, in file order.
        # NOTE(review): __getitem__ calls .item() on cell values, so keep the
        # iloc-based row access; other row iterators may hand back plain
        # Python scalars -- verify before changing.
        self.data = [frame.iloc[i] for i in range(len(frame))]

        # Half-open (start, end) row ranges used as validation folds.
        self.validation_ranges = [(0, 400), (800, 1200), (1600, 2000), (2400, 2800), (3000, 3400)]

        if mode == 'val':
            val_start, val_end = self.validation_ranges[fold_num]
            self.current_data = self.data[val_start:val_end]
        elif mode == 'train':
            val_start, val_end = self.validation_ranges[fold_num]
            self.current_data = self.data[:val_start] + self.data[val_end:]
        elif mode == 'test':
            # Test mode uses every row. Copy the list so that shuffling below
            # never reorders ``self.data`` (train/val already get new lists).
            self.current_data = list(self.data)

        if shuffle:
            random.shuffle(self.current_data)

    def __len__(self):
        """Return the number of rows selected for the current mode."""
        return len(self.current_data)

    def __getitem__(self, index):
        """Return the item at ``index``.

        Returns:
            ``('[CLS]' + smiles, MLM, HLM, HLM - MLM)`` with the three values
            as tensors when the row has both ``MLM`` and ``HLM``; otherwise
            just ``'[CLS]' + smiles``. ``smiles`` is the canonical,
            non-isomeric form produced by RDKit.

        Raises:
            ValueError: If RDKit cannot parse the row's SMILES string.
        """
        row = self.current_data[index]
        raw_smiles = row['SMILES']

        mol = Chem.MolFromSmiles(raw_smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending input instead of inside MolToSmiles.
            raise ValueError(f"Could not parse SMILES {raw_smiles!r} at index {index}")
        smiles = Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)
        text = '[CLS]' + smiles

        if 'MLM' in row and 'HLM' in row:
            mlm = row['MLM'].item()
            hlm = row['HLM'].item()
            return text, torch.tensor(mlm), torch.tensor(hlm), torch.tensor(hlm - mlm)
        return text
class SMILESDataset_SHIN_HLM(Dataset):
    """SMILES dataset that reports HLM first and MLM second.

    Rows come from a CSV that must contain a ``SMILES`` column. When a row
    also has both ``HLM`` and ``MLM`` entries, items carry those values (and
    ``MLM - HLM``) as tensors; otherwise an item is only the SMILES string.

    Train/validation splits are positional: a fold's validation set is the
    half-open row range ``validation_ranges[fold_num]`` and its training set
    is every other row of the file.
    """

    def __init__(
        self,
        data_path,
        mode: str = 'train',
        fold_num: int = 0,
        shuffle: bool = False,
    ) -> None:
        """Load the CSV and select the rows belonging to ``mode``.

        Args:
            data_path: CSV source, passed unchanged to ``pandas.read_csv``.
            mode: ``'train'``, ``'val'`` or ``'test'``.
            fold_num: Index into ``validation_ranges``; unused for ``'test'``.
            shuffle: If true, shuffle the selected rows once using the
                module-level ``random`` generator.

        Raises:
            AssertionError: If ``mode`` is not one of the accepted values.
            IndexError: If ``fold_num`` does not index ``validation_ranges``.
        """
        assert mode in ['train', 'val', 'test'], "Mode should be either 'train', 'val', or 'test'"
        super().__init__()

        frame = pd.read_csv(data_path)
        # One Series per row, in file order.
        # NOTE(review): __getitem__ calls .item() on cell values, so keep the
        # iloc-based row access; other row iterators may hand back plain
        # Python scalars -- verify before changing.
        self.data = [frame.iloc[i] for i in range(len(frame))]

        # Defining validation index ranges: half-open (start, end) row spans.
        self.validation_ranges = [(0, 400), (800, 1200), (1600, 2000), (2400, 2800), (3000, 3400)]

        if mode == 'val':
            val_start, val_end = self.validation_ranges[fold_num]
            self.current_data = self.data[val_start:val_end]
        elif mode == 'train':
            val_start, val_end = self.validation_ranges[fold_num]
            self.current_data = self.data[:val_start] + self.data[val_end:]
        elif mode == 'test':
            # Test mode uses every row. Copy the list so that shuffling below
            # never reorders ``self.data`` (train/val already get new lists).
            self.current_data = list(self.data)

        if shuffle:
            random.shuffle(self.current_data)

    def __len__(self):
        """Return the number of rows selected for the current mode."""
        return len(self.current_data)

    def __getitem__(self, index):
        """Return the item at ``index``.

        Returns:
            ``('[CLS]' + smiles, HLM, MLM, MLM - HLM)`` with the three values
            as tensors when the row has both ``HLM`` and ``MLM``; otherwise
            just ``'[CLS]' + smiles``. ``smiles`` is the canonical,
            non-isomeric form produced by RDKit.

        Raises:
            ValueError: If RDKit cannot parse the row's SMILES string.
        """
        row = self.current_data[index]
        raw_smiles = row['SMILES']

        mol = Chem.MolFromSmiles(raw_smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending input instead of inside MolToSmiles.
            raise ValueError(f"Could not parse SMILES {raw_smiles!r} at index {index}")
        smiles = Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)
        text = '[CLS]' + smiles

        if 'HLM' in row and 'MLM' in row:
            hlm = row['HLM'].item()
            mlm = row['MLM'].item()
            return text, torch.tensor(hlm), torch.tensor(mlm), torch.tensor(mlm - hlm)
        return text
class FEATUREDataset(Dataset):
    """Dataset that yields each CSV row as a 1-D ``float32`` tensor.

    In ``'train'`` and ``'val'`` modes the first CSV column is dropped from
    every item; in ``'test'`` mode all columns are kept.

    Train/validation splits are positional: a fold's validation set is the
    half-open row range ``validation_ranges[fold_num]`` and its training set
    is every other row of the file.
    """

    def __init__(
        self,
        data_path,
        mode: str = 'train',
        fold_num: int = 0,
        shuffle: bool = False,
    ) -> None:
        """Load the CSV and select the rows belonging to ``mode``.

        Args:
            data_path: CSV source, passed unchanged to ``pandas.read_csv``.
            mode: ``'train'``, ``'val'`` or ``'test'``.
            fold_num: Index into ``validation_ranges``; unused for ``'test'``.
            shuffle: If true, shuffle the selected rows once using the
                module-level ``random`` generator.

        Raises:
            AssertionError: If ``mode`` is not one of the accepted values.
            IndexError: If ``fold_num`` does not index ``validation_ranges``.
        """
        assert mode in ['train', 'val', 'test'], "Mode should be either 'train', 'val', or 'test'"
        super().__init__()

        frame = pd.read_csv(data_path)
        # One Series per row, in file order.
        self.data = [frame.iloc[i] for i in range(len(frame))]

        # Defining validation index ranges: half-open (start, end) row spans.
        self.validation_ranges = [(0, 400), (800, 1200), (1600, 2000), (2400, 2800), (3000, 3400)]

        if mode == 'val':
            val_start, val_end = self.validation_ranges[fold_num]
            self.current_data = self.data[val_start:val_end]
        elif mode == 'train':
            val_start, val_end = self.validation_ranges[fold_num]
            self.current_data = self.data[:val_start] + self.data[val_end:]
        elif mode == 'test':
            # Test mode uses every row. Copy the list so that shuffling below
            # never reorders ``self.data`` (train/val already get new lists).
            self.current_data = list(self.data)

        if shuffle:
            random.shuffle(self.current_data)

        self.mode = mode

    def __len__(self) -> int:
        """Return the number of rows selected for the current mode."""
        return len(self.current_data)

    def __getitem__(self, index) -> torch.Tensor:
        """Return row ``index`` as a ``torch.float`` tensor.

        For train and validation the first column is excluded; for test all
        columns are included.
        """
        row = self.current_data[index]
        values = row.values if self.mode == 'test' else row.values[1:]

        # A non-numeric first column (e.g. an ID string) makes the whole row
        # object-typed, and torch.tensor rejects object ndarrays even after
        # that column has been sliced off. Cast the remaining cells to
        # float32 first; purely numeric rows take the original path unchanged.
        if values.dtype == object:
            values = values.astype('float32')

        return torch.tensor(values, dtype=torch.float)