-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataset.py
128 lines (97 loc) · 5.85 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os, argparse
import numpy as np
import tqdm
import openslide
import utils.dataset as dataset
def retrieve_file(dir, name_without_extension):
    """Return the path of the file in *dir* whose name, minus extension, matches.

    Args:
        dir: directory to search (only its direct entries are examined).
        name_without_extension: file name stem to look for, e.g. ``"slide_01"``
            matches ``"slide_01.svs"``.

    Returns:
        ``os.path.join(dir, entry)`` for the first matching entry (entries are
        visited in sorted order so the result is deterministic when several
        files share the same stem), or ``None`` when nothing matches.
    """
    for file_ in sorted(os.listdir(dir)):
        # splitext() strips the extension; split() would return the (always
        # empty) directory part of a bare file name and never match a stem.
        if os.path.splitext(file_)[0] == name_without_extension:
            return os.path.join(dir, file_)
    return None
def find_region_patches(slide, region_mask, bbox, patch_level, mask_level,
                        patch_size, min_tissue_ratio):
    """Collect level-0 locations of grid patches that fall inside a region.

    The slide is tiled at ``patch_level`` with non-overlapping squares of
    ``patch_size``. A patch is kept when its upper-left corner (expressed at
    level 0) lies strictly inside ``bbox`` and the fraction of set pixels in
    the matching window of ``region_mask`` is at least ``min_tissue_ratio``.

    Args:
        slide: opened slide object; ``level_dimensions`` is read from it and
            it is forwarded to ``dataset.scale_coordinates``.
        region_mask: 2-D boolean mask at ``mask_level``, indexed ``[y, x]``.
        bbox: ``(x_min, y_min, x_max, y_max)`` of the region at level 0.
        patch_level: pyramid level on which the patch grid is laid out.
        mask_level: pyramid level of ``region_mask``.
        patch_size: side of a (square) patch, in pixels at ``patch_level``.
        min_tissue_ratio: minimum fraction of mask pixels set within a patch.

    Returns:
        List of ``(x, y)`` upper-left corners at level 0.
    """
    x_min, y_min, x_max, y_max = bbox
    width, height = slide.level_dimensions[patch_level]
    locations = []
    # "+ 1" keeps the last full patch when a dimension is an exact multiple
    # of patch_size.
    for px in range(0, width - patch_size + 1, patch_size):
        for py in range(0, height - patch_size + 1, patch_size):
            # Upper-left corner at level 0. Dedicated names are used so the
            # grid variables (px, py) are never overwritten: reusing the loop
            # variable would re-scale an already scaled x on the next row.
            x_l0, y_l0 = dataset.scale_coordinates(slide, (px, py), patch_level, 0)
            # check patch upper-left corner is within region
            if not (x_min < x_l0 < x_max and y_min < y_l0 < y_max):
                continue
            # patch window at mask level
            mx, my = dataset.scale_coordinates(slide, (px, py), patch_level, mask_level)
            mx_br, my_br = dataset.scale_coordinates(
                slide, (px + patch_size, py + patch_size), patch_level, mask_level)
            patch_mask = region_mask[my:my_br, mx:mx_br]
            if patch_mask.size == 0:
                # window collapsed to nothing at mask level: no ratio to compute
                continue
            tissue_ratio = patch_mask.sum() / patch_mask.size
            if tissue_ratio >= min_tissue_ratio:
                # Store the level-0 corner computed straight from the grid
                # rather than converting the mask-level corner back to level 0,
                # which would presumably quantize it to the mask resolution.
                locations.append((x_l0, y_l0))
    return locations


def split_into_sequences(patch_locations, max_length):
    """Group patch locations into sequences of exactly ``max_length`` items.

    The locations are cut into ``max_length`` contiguous chunks of (nearly)
    equal size, each chunk is shuffled, and sequences are built by taking one
    location from every chunk until any chunk runs out. Leftover locations are
    dropped, so fewer than ``max_length`` locations yield no sequence.

    Args:
        patch_locations: list of ``(x, y)`` locations.
        max_length: number of chunks, hence the length of every sequence.

    Returns:
        List of sequences; each sequence is a list of ``[x, y]`` lists.
    """
    chunks = np.array_split(patch_locations, max_length)
    chunks = [np.random.permutation(chunk).tolist() for chunk in chunks]
    sequences = []
    # all() is False as soon as one chunk is empty
    while all(chunks):
        sequences.append([chunk.pop() for chunk in chunks])
    return sequences


def save_sequences(slide, sequences, output_dir, patch_level, patch_size, min_length):
    """Read the patches of every sequence and save each sequence as ``<i>.npy``.

    Args:
        slide: opened slide object providing ``read_region``.
        sequences: list of sequences of level-0 ``(x, y)`` locations.
        output_dir: existing directory receiving the ``.npy`` files.
        patch_level: pyramid level the patches are read from.
        patch_size: side of a (square) patch in pixels.
        min_length: sequences shorter than this are skipped (their index is
            still consumed, so file names follow the position in ``sequences``).
    """
    size = (patch_size, patch_size)
    for i, locations in enumerate(sequences):
        # check there are enough patches
        if len(locations) < min_length:
            continue
        # extract and save the patches in a numpy file
        images = [
            np.asarray(slide.read_region(tuple(loc), patch_level, size).convert('RGB'))
            for loc in locations
        ]
        np.save(os.path.join(output_dir, '{}.npy'.format(i)), np.stack(images, axis=0))


def process_slide(slide, slide_name, stain, annot_path, args):
    """Extract and save patch sequences for every annotated region of a slide.

    Output layout: ``<output_path>/<slide_name>/<region id>/<stain>/<i>.npy``.

    Args:
        slide: opened slide object.
        slide_name: slide file name without extension (output sub-directory).
        stain: name of the stain sub-directory being processed.
        annot_path: path to the JSON annotation file of the slide.
        args: parsed command-line arguments.
    """
    # dir to save patches
    slide_dir = os.path.join(args.output_path, slide_name)
    os.makedirs(slide_dir, exist_ok=True)
    # tissue mask of the whole slide
    tissue_mask = dataset.get_tissue_mask(slide, args.mask_level)
    # load regions
    regions = dataset.load_contours_from_json(annot_path)
    for id_, ctr in regions.items():
        # makedirs also creates the intermediate region directory
        stain_dir = os.path.join(slide_dir, id_, stain)
        os.makedirs(stain_dir, exist_ok=True)
        # coordinates of region's bounding-box at level 0
        # (assumes the last axis of ctr holds (x, y) -- TODO confirm)
        bbox = (ctr[..., 0].min(), ctr[..., 1].min(),
                ctr[..., 0].max(), ctr[..., 1].max())
        # region mask restricted to tissue
        region_mask = dataset.create_object_mask(slide, ctr, args.mask_level)
        region_mask = np.logical_and(region_mask, tissue_mask)
        # patches within the region
        region_patches = find_region_patches(
            slide, region_mask, bbox, args.patch_level, args.mask_level,
            args.patch_size, args.min_tissue_ratio)
        # split region list to max_length * (equal length lists) and shuffle
        samples = split_into_sequences(region_patches, args.max_length)
        # save sequences on disk
        save_sequences(slide, samples, stain_dir, args.patch_level,
                       args.patch_size, args.min_length)


def main():
    """Parse the command line and build patch sequences for all annotations.

    ``annotations_path`` is expected to contain one sub-directory per stain,
    each holding annotation files named after the slide they describe.
    Annotations with no matching file in ``wsi_path`` are skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotations_path", required=True, type=str, help="path to directory with regions annotations")
    parser.add_argument("--wsi_path", required=True, type=str, help="path to directory with WSI files")
    parser.add_argument("--output_path", required=True, type=str, help="path to output directory")
    parser.add_argument("--mask_level", required=True, type=int, help="level to create tissue mask")
    parser.add_argument("--patch_level", required=True, type=int, help="level to extract patches")
    parser.add_argument("--patch_size", required=True, type=int, help="size of patches (width and heigth)")
    parser.add_argument("--max_length", default=256, type=int, help="maximum length of a sequence")
    parser.add_argument("--min_length", default=200, type=int, help="minimum length of a sequence")
    parser.add_argument("--min_tissue_ratio", required=True, type=float, help="threshold of tissue in patch")
    args = parser.parse_args()

    for stain in os.listdir(args.annotations_path):
        stain_annotations = os.path.join(args.annotations_path, stain)
        if not os.path.isdir(stain_annotations):
            # stray files next to the stain directories are ignored
            continue
        print(stain)
        for annot_file in tqdm.tqdm(os.listdir(stain_annotations), ncols=50):
            # splitext() strips the extension; split() would return the
            # (always empty) directory part of the bare file name.
            slide_name = os.path.splitext(annot_file)[0]
            slide_path = retrieve_file(args.wsi_path, slide_name)
            if slide_path is None:
                continue
            # open slide file; always release the handle, even on error
            slide = openslide.OpenSlide(slide_path)
            try:
                process_slide(slide, slide_name, stain,
                              os.path.join(stain_annotations, annot_file), args)
            finally:
                slide.close()
    print("done")


if __name__ == "__main__":
    main()