Camelyon16数据集切块批量预处理

news/2025/1/15 22:00:43/

参考自: Camelyon16数据集切块预处理
区别是这里做了批量处理

数据集目录格式:
(原文此处为数据集目录结构的示意图,图片未能保留)

**main.py**

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2024/9/4 20:21
# @Author  : 猫娜Lisa
# @File    : camelyon16_get_patch.py
# @Software: PyCharm

import json
import os
import shutil
import xml.etree.ElementTree as ET
from multiprocessing import Pool

import cv2
import numpy as np
import openslide  # installing this one is a bit of a hassle
from pathos.multiprocessing import ProcessingPool as paPool
from PIL import Image
from skimage.color import rgb2hsv
from skimage.filters import threshold_otsu

from config import train_root, test_root
from config import root, wsi_dir
from config import xml_dir, json_dir
from config import tumor_npy_dir, tissue_npy_dir, no_tumor_npy_dir
from config import tumor_txt_dir, no_tumor_txt_dir, normal_txt_dir
from config import tumor_patch_dir, no_tumor_patch_dir, normal_patch_dir
from config import level, RGB_min, patch_number, patch_size, level_patch, num_process
from config import if_trans_xml_to_json, if_get_tumor_mask_npy, if_get_tissue_mask_npy, if_get_no_tumor_mask_npy, if_get_sample_txt
from config import (if_get_train_tumor_patches, if_get_train_no_tumor_patches, if_get_train_normal_patches,
                    if_get_test_tumor_patches, if_get_test_no_tumor_patches, if_get_test_normal_patches)


# Convert the XML annotations to JSON format
# 每个注释都是多边形列表,其中每个多边形都由其顶点表示。阳性多边形表示肿瘤区域,阴性多边形表示正常区域。在本阶段,将标注格式转换成更简单的 .json 格式。
# xml_dir, json_dir, wsi_names
def _annotation_to_polygon(annotation):
    """Return ``{'name': ..., 'vertices': [[x, y], ...]}`` for one <Annotation> element."""
    # Look the coordinates up once and reuse them for both axes.
    coordinates = annotation.findall('./Coordinates/Coordinate')
    xs = [float(c.get('X')) for c in coordinates]
    ys = [float(c.get('Y')) for c in coordinates]
    # Round to integer pixel positions and pair them up as [x, y] rows.
    vertices = np.round([xs, ys]).astype(int).transpose().tolist()
    return {'name': annotation.attrib['Name'], 'vertices': vertices}


def camelyon16xml2json(xml_dir, json_dir, wsi_names):
    """Convert CAMELYON16 XML annotations into a simpler JSON format.

    For every name in ``wsi_names`` the file ``xml_dir + name + '.xml'`` is
    parsed and ``json_dir + name + '.json'`` is written. Both directory
    arguments are used as plain string prefixes, so they must end with a path
    separator.

    The JSON document has two keys, ``'positive'`` and ``'negative'``, each
    holding a list of ``{'name': str, 'vertices': [[x, y], ...]}`` polygons.
    Annotations whose ``PartOfGroup`` is ``Tumor``, ``_0`` or ``_1`` are
    written as positive polygons, those in group ``_2`` as negative ones.

    Arguments:
        xml_dir: string, directory prefix of the input xml files
        json_dir: string, directory prefix of the output json files
        wsi_names: iterable of slide names without extension
    """
    group_query = './Annotations/Annotation[@PartOfGroup="{}"]'
    positive_groups = ('Tumor', '_0', '_1')
    negative_group = '_2'

    for wsi_name in wsi_names:
        inxml = xml_dir + wsi_name + '.xml'
        outjson = json_dir + wsi_name + '.json'

        # Named tree_root so it cannot be confused with the config value `root`.
        tree_root = ET.parse(inxml).getroot()

        annotations_positive = [
            annotation
            for group in positive_groups
            for annotation in tree_root.findall(group_query.format(group))
        ]
        annotations_negative = tree_root.findall(group_query.format(negative_group))

        json_dict = {
            'positive': [_annotation_to_polygon(a) for a in annotations_positive],
            'negative': [_annotation_to_polygon(a) for a in annotations_negative],
        }

        with open(outjson, 'w') as f:
            json.dump(json_dict, f, indent=1)


# Obtain the mask of the tumor region
# 本阶段利用json标注得到tumor区域的mask文件,格式为 .npy 。
def get_tumor_mask(wsi_dir, level, json_dir, tumor_npy_dir, wsi_names):
    """Rasterise the positive JSON polygons of each slide into a boolean mask.

    For every name in ``wsi_names`` this reads ``wsi_dir + name + '.tif'`` and
    ``json_dir + name + '.json'`` and writes ``tumor_npy_dir + name + '.npy'``.
    The directory arguments are plain string prefixes and must end with a
    path separator.

    The mask has the dimensions of pyramid level ``level`` and is saved
    transposed, i.e. with shape ``(width, height)``.

    Arguments:
        wsi_dir: string, directory prefix of the .tif slides
        level: int, pyramid level at which the mask is built
        json_dir: string, directory prefix of the json annotations
        tumor_npy_dir: string, directory prefix of the output .npy masks
        wsi_names: iterable of slide names without extension
    """
    for wsi_name in wsi_names:
        wsi_path = wsi_dir + wsi_name + '.tif'
        json_path = json_dir + wsi_name + '.json'
        tumor_npy_path = tumor_npy_dir + wsi_name + '.npy'

        # Only the level geometry is needed, so release the slide handle
        # right away instead of leaking one handle per slide.
        slide = openslide.OpenSlide(wsi_path)
        try:
            w, h = slide.level_dimensions[level]
            # Downsample factor of the level, e.g. level 6 is 2^6
            factor = slide.level_downsamples[level]
        finally:
            slide.close()

        # Initial mask, all zeros. uint8 is enough for the 0/255 fill values
        # and the mask is thresholded to bool below.
        mask_tumor = np.zeros((h, w), dtype=np.uint8)

        with open(json_path) as f:
            dicts = json.load(f)
        tumor_polygons = dicts['positive']

        for tumor_polygon in tumor_polygons:
            # Scale the polygon to the mask level and fill it
            vertices = np.array(tumor_polygon["vertices"]) / factor
            vertices = vertices.astype(np.int32)
            cv2.fillPoly(mask_tumor, [vertices], 255)

        mask_tumor = mask_tumor > 127
        mask_tumor = np.transpose(mask_tumor)

        # e.g. the tumor-region mask of Tumor_001.tif at level 6
        np.save(tumor_npy_path, mask_tumor)


# Obtain the mask of the tissue region
# 使用大津算法进行图像分割即可获得组织区域。RGB_min可以手动调整,确定最低阈值。可以将tissue_mask转化为二值图像保存下来看看效果.
def get_tissue_mask(wsi_dir, level, tissue_npy_dir, RGB_min, wsi_names):
    """Segment the tissue region of each slide with Otsu thresholding.

    For every name in ``wsi_names`` the whole image of pyramid level ``level``
    is read from ``wsi_dir + name + '.tif'`` and a boolean mask is written to
    ``tissue_npy_dir + name + '.npy'``. The directory arguments are plain
    string prefixes and must end with a path separator.

    A pixel counts as tissue when all of the following hold:
      * its saturation is above the Otsu threshold of the saturation channel,
      * it is not above the per-channel Otsu threshold in all of R, G and B,
      * each of its R, G and B values is greater than ``RGB_min``.

    The image is transposed before processing, so the saved mask has shape
    ``(width, height)``.

    Arguments:
        wsi_dir: string, directory prefix of the .tif slides
        level: int, pyramid level at which the mask is built
        tissue_npy_dir: string, directory prefix of the output .npy masks
        RGB_min: lower bound every RGB channel must exceed; tune by hand
        wsi_names: iterable of slide names without extension
    """
    for wsi_name in wsi_names:
        wsi_path = wsi_dir + wsi_name + '.tif'
        tissue_npy_path = tissue_npy_dir + wsi_name + '.npy'

        # Close the slide as soon as the pixels are in memory so that no
        # handle is leaked per slide.
        slide = openslide.OpenSlide(wsi_path)
        try:
            region = slide.read_region((0, 0), level, slide.level_dimensions[level])
            img_RGB = np.transpose(np.array(region.convert('RGB')), axes=[1, 0, 2])
        finally:
            slide.close()

        img_HSV = rgb2hsv(img_RGB)

        background_R = img_RGB[:, :, 0] > threshold_otsu(img_RGB[:, :, 0])
        background_G = img_RGB[:, :, 1] > threshold_otsu(img_RGB[:, :, 1])
        background_B = img_RGB[:, :, 2] > threshold_otsu(img_RGB[:, :, 2])
        tissue_RGB = np.logical_not(background_R & background_G & background_B)
        tissue_S = img_HSV[:, :, 1] > threshold_otsu(img_HSV[:, :, 1])

        min_R = img_RGB[:, :, 0] > RGB_min
        min_G = img_RGB[:, :, 1] > RGB_min
        min_B = img_RGB[:, :, 2] > RGB_min

        tissue_mask = tissue_S & tissue_RGB & min_R & min_G & min_B

        # e.g. the tissue mask of Tumor_001.tif at level 6
        np.save(tissue_npy_path, tissue_mask)
        # To check the result visually, the mask can be saved as a binary image:
        #   Image.fromarray(tissue_mask).save('tumor_001_tissue.png')


# Obtain the mask of the no_tumor region
# tissue区域包含了tumor和no_tumor,所以只需要通过tissue_mask和tumor_mask做一下逻辑运算即可得到no_tumor区域的mask。
def get_no_tumor_mask(tumor_npy_dir, tissue_npy_dir, no_tumor_npy_dir, wsi_names):
    """Build the no_tumor mask of each slide as ``tissue AND NOT tumor``.

    For every name in ``wsi_names`` the masks ``tumor_npy_dir + name + '.npy'``
    and ``tissue_npy_dir + name + '.npy'`` are loaded and the result is saved
    to ``no_tumor_npy_dir + name + '.npy'``. The directory arguments are plain
    string prefixes and must end with a path separator.

    Both inputs are interpreted as truth values (non-zero means set), so the
    saved mask is always boolean.

    Arguments:
        tumor_npy_dir: string, directory prefix of the tumor masks
        tissue_npy_dir: string, directory prefix of the tissue masks
        no_tumor_npy_dir: string, directory prefix of the output masks
        wsi_names: iterable of slide names without extension
    """
    for wsi_name in wsi_names:
        tumor_npy_path = tumor_npy_dir + wsi_name + '.npy'
        tissue_npy_path = tissue_npy_dir + wsi_name + '.npy'
        no_tumor_npy_path = no_tumor_npy_dir + wsi_name + '.npy'

        # Coerce to bool first: on an integer mask `~` is a bitwise inversion
        # rather than a logical NOT, and the result would not be boolean.
        tumor_mask = np.load(tumor_npy_path).astype(bool)
        tissue_mask = np.load(tissue_npy_path).astype(bool)

        no_tumor_mask = np.logical_and(tissue_mask, np.logical_not(tumor_mask))
        np.save(no_tumor_npy_path, no_tumor_mask)


# Randomly sample points from each tissue region (tumor, no_tumor).
# 一张WSI就可以切出来成千上万块patch,但并不需要全部的,只需要在每张WSI中采样出一定数量就可以了。
# 采样原理比较简单,由于前面拿到的都是WSI 在level 6 下的mask,大概1k * 2k的分辨率,直接在低分辨率的mask中采样一些点,
# 得到采样点在level 6下的坐标,再乘以缩放倍数就能算出他们在level 0 下的坐标(patch的中心点坐标)。得到采样坐标txt文件。
def sample_from_mask(npy_dir, patch_number, level, txt_dir, wsi_names):
    """Sample patch centre points from each slide's mask and append them to a txt file.

    For every name in ``wsi_names`` the mask ``npy_dir + name + '.npy'`` is
    loaded and up to ``patch_number`` of its set pixels are picked at random,
    without replacement, so no point is written twice. If the mask has no more
    than ``patch_number`` set pixels, all of them are used.

    The mask coordinates are multiplied by ``2 ** level`` and appended to
    ``txt_dir + name + '.txt'``, one ``name,x,y`` line per point. The directory
    arguments are plain string prefixes and must end with a path separator.

    Arguments:
        npy_dir: string, directory prefix of the input masks
        patch_number: int, maximum number of points per slide
        level: int, pyramid level of the masks
        txt_dir: string, directory prefix of the output txt files
        wsi_names: iterable of slide names without extension
    """
    for wsi_name in wsi_names:
        npy_path = npy_dir + wsi_name + '.npy'
        txt_path = txt_dir + wsi_name + '.txt'

        mask_tissue = np.load(npy_path)
        X_idcs, Y_idcs = np.where(mask_tissue)
        # One (x, y) row per set mask pixel
        centre_points = np.column_stack((X_idcs, Y_idcs))

        candidate_count = centre_points.shape[0]
        if candidate_count > patch_number:
            # replace=False: drawing with replacement would yield duplicate
            # centre points and therefore duplicate patches.
            chosen = np.random.choice(candidate_count, size=patch_number, replace=False)
            sampled_points = centre_points[chosen, :]
        else:
            # Not enough points: take all of them
            sampled_points = centre_points

        # NOTE(review): assumes the downsample factor of `level` is exactly
        # 2 ** level — confirm against slide.level_downsamples.
        sampled_points = (sampled_points * 2 ** level).astype(np.int32)

        mask_only_name = os.path.split(npy_path)[-1].split(".")[0]
        lines = [
            '%s,%d,%d\n' % (mask_only_name, x, y)
            for x, y in sampled_points
        ]
        # Append mode is kept from the original: an existing file is extended.
        with open(txt_path, "a") as f:
            f.writelines(lines)


# Build the patch dataset
# 根据采样点的坐标,在level 0 下切割WSI即可得到patch。需要对tumor和no_tumor分别操作,
# 得到两类patch。还需要对测试集切块,都是一样的流程。仅以训练集的tumor切块举例。
def process(opts):
    """Cut one patch out of a slide and save it as a PNG file.

    ``opts`` is a tuple
    ``(j, pid, x_center, y_center, wsi_path, patch_size, level_patch, patch_dir)``.
    It is a single argument so that the function can be used with ``Pool.map``.

    The patch is a ``patch_size`` x ``patch_size`` region read at pyramid level
    ``level_patch``. Its top-left corner lies half a patch size before
    ``(x_center, y_center)``. The file is written to
    ``patch_dir/<pid>_<100000 + j>.png``.
    """
    j, pid, x_center, y_center, wsi_path, patch_size, level_patch, patch_dir = opts

    # Top-left corner of the patch around the sampled centre point
    x = int(int(x_center) - patch_size / 2)
    y = int(int(y_center) - patch_size / 2)

    # This runs once per patch, so the slide must be closed again;
    # otherwise every patch leaks one open slide handle.
    slide = openslide.OpenSlide(wsi_path)
    try:
        img = slide.read_region((x, y), level_patch, (patch_size, patch_size)).convert('RGB')
    finally:
        slide.close()

    img.save(os.path.join(patch_dir, pid + '_' + str(100000 + j) + '.png'))


# Build the patch dataset
def get_patches(txt_dir, wsi_dir, num_process, patch_size, level_patch, patch_dir, wsi_names):
    """Cut the sampled patches of every slide using a pool of worker processes.

    For every name in ``wsi_names`` the file ``txt_dir + name + '.txt'`` is
    read; each line must have the form ``pid,x_center,y_center``. One call of
    ``process`` is made per line, cutting the patch from
    ``wsi_dir + name + '.tif'`` into ``patch_dir``.

    Arguments:
        txt_dir: string, directory prefix of the sample txt files
        wsi_dir: string, directory prefix of the .tif slides
        num_process: int, number of worker processes
        patch_size: int, edge length of a patch
        level_patch: int, pyramid level the patches are read from
        patch_dir: string, output directory of the patches
        wsi_names: iterable of slide names without extension
    """
    # One pool serves all slides and is shut down when the block is left.
    # Creating a new pool per slide without closing it leaks worker processes.
    with Pool(processes=num_process) as pool:
        for wsi_name in wsi_names:
            txt_path = txt_dir + wsi_name + '.txt'
            wsi_path = wsi_dir + wsi_name + '.tif'

            opt_list = []
            with open(txt_path) as f:
                for j, line in enumerate(f):
                    # pid is the file name without extension, e.g. tumor_001
                    pid, x_center, y_center = line.strip('\n').split(',')
                    opt_list.append((j, pid, x_center, y_center, wsi_path,
                                     patch_size, level_patch, patch_dir))

            pool.map(process, opt_list)


def _reset_dir(path):
    """Delete the directory *path* if it exists, then create it empty."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    # makedirs also creates missing parent directories
    os.makedirs(path, exist_ok=True)


def camelyon16_process(wsi_names):
    """Run the preprocessing steps enabled in config for the given slides.

    Depending on the ``if_*`` switches this converts the XML annotations to
    JSON, builds the tumor, tissue and no_tumor masks, samples centre points
    and finally cuts the patches. The output directory of every executed step
    is emptied first.

    Sampling and patch cutting run for exactly one category: the first one
    among tumor, no_tumor and normal whose train or test switch is set.

    Raises:
        ValueError: if none of the ``if_get_*_patches`` switches is set.
    """
    # Convert the XML annotations to JSON format
    if if_trans_xml_to_json:
        _reset_dir(json_dir)
        camelyon16xml2json(xml_dir, json_dir, wsi_names)

    # Obtain the mask of the tumor region
    if if_get_tumor_mask_npy:
        _reset_dir(tumor_npy_dir)
        get_tumor_mask(wsi_dir, level, json_dir, tumor_npy_dir, wsi_names)

    # Obtain the mask of the tissue region
    if if_get_tissue_mask_npy:
        _reset_dir(tissue_npy_dir)
        get_tissue_mask(wsi_dir, level, tissue_npy_dir, RGB_min, wsi_names)

    # Obtain the mask of the no_tumor region
    if if_get_no_tumor_mask_npy:
        _reset_dir(no_tumor_npy_dir)
        get_no_tumor_mask(tumor_npy_dir, tissue_npy_dir, no_tumor_npy_dir, wsi_names)

    # Pick the mask, txt and patch directories of the requested category
    if if_get_train_tumor_patches or if_get_test_tumor_patches:
        npy_dir = tumor_npy_dir
        patch_dir = tumor_patch_dir
        txt_dir = tumor_txt_dir
    elif if_get_train_no_tumor_patches or if_get_test_no_tumor_patches:
        npy_dir = no_tumor_npy_dir
        patch_dir = no_tumor_patch_dir
        txt_dir = no_tumor_txt_dir
    elif if_get_train_normal_patches or if_get_test_normal_patches:
        npy_dir = tissue_npy_dir
        patch_dir = normal_patch_dir
        txt_dir = normal_txt_dir
    else:
        # A real exception instead of `assert False`, which is removed under -O
        raise ValueError('no if_get_*_patches switch is enabled in config')

    # Randomly sample points from the selected region
    if if_get_sample_txt:
        _reset_dir(txt_dir)
        sample_from_mask(npy_dir, patch_number, level, txt_dir, wsi_names)

    # Build the patch dataset
    _reset_dir(patch_dir)
    get_patches(txt_dir, wsi_dir, num_process, patch_size, level_patch, patch_dir, wsi_names)


def _list_stems(directory, extension):
    """Return the names (text before the first dot) of the files in *directory* ending in *extension*."""
    return [
        file_name.split('.')[0]
        for file_name in os.listdir(directory)
        if os.path.splitext(file_name)[1] == extension
    ]


def _select_wsi_names():
    """Choose the slide names to process according to the patch switches in config.

    * any train switch set: every .tif file in ``wsi_dir``
    * test tumor or test no_tumor set: the .tif files that also have an .xml
      annotation in ``xml_dir``
    * otherwise: the .tif files without an .xml annotation
    """
    slide_names = _list_stems(wsi_dir, '.tif')
    if if_get_train_tumor_patches or if_get_train_no_tumor_patches or if_get_train_normal_patches:
        return slide_names

    annotated_names = set(_list_stems(xml_dir, '.xml'))
    # Sorted so that the processing order is the same on every run
    if if_get_test_tumor_patches or if_get_test_no_tumor_patches:
        return sorted(set(slide_names) & annotated_names)
    return sorted(set(slide_names) - annotated_names)


if __name__ == '__main__':
    # Collect the names of the tif files in the wsi folder
    wsi_names = _select_wsi_names()
    print(wsi_names)
    camelyon16_process(wsi_names)

**config.py**

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2024/9/5 20:18
# @Author  : 猫娜Lisa
# @File    : config.py
# @Software: PyCharm

# Settings of the CAMELYON16 patch preprocessing, imported by main.py.
# All *_dir values except the patch directories end with a path separator
# because main.py builds file paths by plain string concatenation.

train_root = 'xxx\\CAMELYON16\\train\\'
test_root = 'xxx\\CAMELYON16\\test\\'

# train
root = train_root
# tumor
wsi_dir = train_root + 'tumor\\'
# normal
# wsi_dir = train_root + 'normal\\'

# test
# root = test_root
# wsi_dir = test_root + 'image\\'

xml_dir = root + 'util_annotations\\lesion_annotations\\'
json_dir = root + 'util_annotations\\json_annotations\\'

tumor_npy_dir = root + 'util_tumor\\tumor_npy\\'
no_tumor_npy_dir = root + 'util_no_tumor\\no_tumor_npy\\'
tissue_npy_dir = root + 'util_tissue_npy\\'  # output path of the mask files

tumor_txt_dir = root + 'util_tumor\\tumor_txt\\'
no_tumor_txt_dir = root + 'util_no_tumor\\no_tumor_txt\\'
normal_txt_dir = root + 'util_normal\\normal_txt\\'

tumor_patch_dir = root + 'train_patch\\tumor'  # patch output folder
no_tumor_patch_dir = root + 'train_patch\\no_tumor'  # patch output folder
normal_patch_dir = root + 'train_patch\\normal'  # patch output folder

level = 6  # at which WSI level to obtain the mask
RGB_min = 50  # min value for RGB channel
patch_number = 10  # number of sampled points (1000)
patch_size = 224  # patch size (the default was 256*256)
level_patch = 0  # cut the WSI at level 0 by default
num_process = 2  # number of processes; multi-process cutting is much faster (16)

# Switches of the preprocessing steps
if_trans_xml_to_json = True
if_get_tumor_mask_npy = True
if_get_tissue_mask_npy = True
if_get_no_tumor_mask_npy = True
if_get_sample_txt = True

# Switches selecting which patch category is produced
if_get_train_tumor_patches = True
if_get_train_no_tumor_patches = False
if_get_train_normal_patches = False
if_get_test_tumor_patches = False
if_get_test_no_tumor_patches = False
if_get_test_normal_patches = False

http://www.ppmy.cn/news/1521473.html

相关文章

java 根据给定的子网掩码和网关计算起始IP和结束IP

java 根据给定的子网掩码和网关计算起始IP和结束IP 以下是一个Java工具类,用于根据给定的子网掩码和网关计算起始IP和结束IP。 import java.net.InetAddress; import java.net.UnknownHostException;public class IPUtils {public static void main(String[] args…

力扣-9. 回文数

文章目录 力扣题目代码工程方法1:方法2: 力扣题目 给你一个整数 x ,如果 x 是一个回文整数,返回 true ;否则,返回 false 。 回文数 是指正序(从左向右)和倒序(从右向左…

Linux基础网络编程-Socket通信

本文使用C语言,在Centos实现Socket两种通信类型(TCP和UDP) 文章目录 一、安装gcc二、使用TCP协议,实现Socket(SOCKE_STREAM)流式通信1. 编写TCP_server.c函数和参数解释 2.编写TCP_client.c函数和参数解释 3. 编译并运行上述两个文件3.1 编译3.2 运行(启…

MyBatis的动态SQL---组合条件查询与传递集合和数组(超详细)

MyBatis的动态SQL—组合条件查询与传递集合和数组(超详细) 文章目录 MyBatis的动态SQL---组合条件查询与传递集合和数组(超详细)一、 定义二、mybatis提供的几大标签1. if 标签2. choose、when和otherwise标签3. trim标签4. forea…

微信小程序垃圾回收的前景方向

在当今这个环保意识日渐增强的时代,如何有效处理日常生活产生的垃圾已成为亟待解决的社会问题。微信小程序凭借其便捷性和广泛的用户基础,在推广垃圾分类与回收方面展现出巨大潜力。作为一款集智能化分类指导、在线预约回收、环保知识普及于一体的微信小…

OSPFV3 华为

1 OSPFV3简介 (IPV6) 定义 OSPF(Open Shortest Path First)是IETF组织开发的一个基于链路状态的内部网关协议(Interior Gateway Protocol)。 目前针对IPv4协议使用的是OSPF Version 2,针对IPv6协议使用OSPF Version 3。 OSPFv3是OSPF Version 3的简称。OSPFv3是运行于I…

opencv 实现两个图片的拼接去重功能

基础知识介绍 cv::Mat 是OpenCV库中用来表示图像和矩阵数据的核心类之一。它是一个多维数组,可以存储图像像素数据、矩阵数据以及其他类型的数据。以下是关于 cv::Mat 类的一些详细解释: 构造函数:cv::Mat 类有多个构造函数,可以用…

计算机的错误计算(八十四)

摘要 讨论双曲余割函数 csch(x)的计算精度问题。 例1. 计算 csch(320.97) . 不妨在 LibreOffice的电子表格中计算,则有: 若利用 csch(x) 1/sinh(x) 在Java中计算: import java.lang.Math; public class Csch{public static void main(S…