-
Notifications
You must be signed in to change notification settings - Fork 56
/
Copy pathhf_dataset_example.py
77 lines (68 loc) · 2.42 KB
/
hf_dataset_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#
from PIL import Image
import requests
from io import BytesIO
from datasets import load_dataset
import json
import numpy as np
import torch
from training.dr.transforms import compose_from_config
if __name__ == '__main__':
    # Example script: stream one sample from the DataCompDR-12M dataset,
    # download its image, re-apply one of the stored augmentations, pick a
    # synthetic caption, and print the shapes of the image and embeddings.

    # Augmentation pipeline configuration handed to compose_from_config.
    rconfig_aug = {
        "normalize": {
            "mean": [0.48145466, 0.4578275, 0.40821073],
            "std": [0.26862954, 0.26130258, 0.27577711]
        },
        "rand_augment": {"enable": True, "p": 1.0},
        "random_resized_crop": {"interpolation": "bicubic", "size": 224},
        "to_rgb": {"enable": True},
        "to_tensor": {"enable": True}
    }
    dr_transforms = compose_from_config(rconfig_aug)

    # streaming=True iterates the dataset lazily; only the first sample is
    # consumed here.
    dataset = load_dataset("apple/DataCompDR-12M", split="train", streaming=True)
    sample = next(iter(dataset))

    # Load image from URL.
    # A timeout keeps the script from hanging forever on an unresponsive
    # host, and raise_for_status() surfaces HTTP errors (dead links, 403s)
    # directly instead of letting PIL fail on an HTML error page.
    url = sample['url.txt']
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    sample["image"] = img

    # Preprocess image
    # Sample one of the stored image augmentations at random.
    param_augs = json.loads(sample["paug.json"]["param_aug"])
    aug_idx = np.random.randint(0, len(param_augs))
    params = param_augs[aug_idx]
    params = dr_transforms.decompress(params)
    image = sample["image"].convert('RGB')
    image, _ = dr_transforms.reapply(image, params)

    # Preprocess synthetic text: pick one synthetic caption at random.
    scapi = np.random.randint(0, len(sample["syn.json"]["syn_text"]))
    syn_text = sample["syn.json"]["syn_text"][scapi]

    # Preprocess embeddings. They are read from either an "npz" or a
    # "pth.gz" entry; the image embedding is selected with the same index
    # as the sampled augmentation.
    if "npz" in sample:
        image_emb = sample["npz"]["image_emb"][aug_idx]
        text_emb_all = sample["npz"]["text_emb"]
    elif "pth.gz" in sample:
        image_emb = sample["pth.gz"]["image_emb"][aug_idx]
        text_emb_all = sample["pth.gz"]["text_emb"]
    else:
        # Fail with an explicit message rather than a NameError on the
        # first use of text_emb_all below.
        raise KeyError(
            "sample has neither 'npz' nor 'pth.gz' embeddings; "
            f"available keys: {sorted(sample.keys())}"
        )

    # Slot 0 is used for the caption embedding and slots 1.. for the
    # synthetic captions (presumably slot 0 is the original caption's
    # embedding -- confirm against the dataset card).
    capi = 0
    text_emb = text_emb_all[capi]
    syn_text_emb = text_emb_all[1+scapi]

    # Convert to torch tensors when the stored embeddings are not already
    # tensors; the check on image_emb decides for all three.
    if not isinstance(image_emb, torch.Tensor):
        image_emb = torch.tensor(image_emb)
        text_emb = torch.tensor(text_emb)
        syn_text_emb = torch.tensor(syn_text_emb)
    image_emb = image_emb.type(torch.float32)
    text_emb = text_emb.type(torch.float32)
    syn_text_emb = syn_text_emb.type(torch.float32)

    print(
        {
            'image': image.shape,
            'image_emb': image_emb.shape,
            'text_emb': text_emb.shape,
            "syn_text": syn_text,
            'syn_text_emb': syn_text_emb.shape,
        }
    )