# (removed extraction residue: duplicated file metadata "285 lines / 9.7 KiB / Python")
import base64
import collections
import datetime
import hashlib
import json
import logging
import os
import shelve
import shutil
import tempfile

import dicom2nifti
import pydicom
|
# Patient hashes (see hashptid()) to skip entirely during processing.
EXCLUDED_HASH = [
    # "GWJU7LPC",
]

# If non-empty, ONLY patients whose hash appears here are processed
# (handy for re-running a single patient); empty means no restriction.
INCLUDED_HASH = [
    # "GWJU7LPC",
]
|
|
|
|
|
|
# Only include studies dated on or before this cut-off date.
LAST_DAY = datetime.datetime.strptime("2025-11-01", "%Y-%m-%d")

# Cap on exported patients per pathology directory.
# NOTE(review): 10//3 == 3 — presumably a quick way to dial the count; confirm intent.
MAX_PATIENT = 10//3

# Source tree of raw DICOM studies: SRC_ROOT/<patho>/<patient>/<study>/...
SRC_ROOT = "/mnt/t24/Public/kimo/TSHA"
# Root for all anonymized output.
RAW_DIR = "/mnt/t24/Public/kimo/raw/"

# Anonymized DICOM copies land here, mirrored as <patho>/<hash>/<study>.
DST_ROOT = os.path.join(RAW_DIR, "DICOM")
# nnU-Net-style test-images folder receiving the converted NIfTI volumes.
imagesTs_DIR = os.path.join(RAW_DIR, "Dataset2602_BraTS-CK/imagesTs/")
# Persistent JSON map: NIfTI stem -> relative DICOM directory (traceability).
NII_JSON_PATH = os.path.join(RAW_DIR, 'nii.json')

# Load the previously saved stem->directory map so reruns extend it.
if os.path.exists(NII_JSON_PATH):
    with open(NII_JSON_PATH, 'r') as f:
        NII_DICT = json.load(f)
else:
    NII_DICT = {}
|
|
|
|
# Reference data from an earlier full scan of the dataset:
# {'', 'PELVISLOWEXTREM', 'BRAIN', 'CSPINE', 'KNEE', 'TSPINE', 'CAROTID', 'NECK', 'ABDOMEN', 'ORBIT', 'HEAD', 'CHEST', 'IAC', 'WHOLEBODY', 'WHOLESPINE', 'ABDOMENPELVIS', 'PELVIS', 'LSPINE', 'SPINE', 'CIRCLEOFWILLIS'}
# BodyPartExamined: Counter({'BRAIN': 152087, 'ABDOMEN': 14101, 'HEAD': 11806, 'ABDOMENPELVIS': 10905, 'SPINE': 9277, 'CHEST': 3746, 'PELVIS': 3208, 'NECK': 3205, 'CSPINE': 1527, 'CAROTID': 1186,
# 'HEART': 1122, 'LSPINE': 1080, 'KNEE': 591, 'PELVISLOWEXTREM': 496, '': 385, 'ORBIT': 360, 'CIRCLEOFWILLIS': 322, 'HUMERUS': 320, 'ARM': 304, 'IAC': 291,
# 'EXTREMITY': 287, 'SHOULDER': 242, 'WHOLEBODY': 190, 'TSPINE': 150, 'HEADNECK': 48, 'WHOLESPINE': 45})

# Tally of BodyPartExamined values seen during this run; logged at end of main().
BodyPartExamined = collections.Counter()
# BodyPartExamined values treated as brain imaging when selecting series.
BodyPartIncluded = set([
    'BRAIN',
    'CIRCLEOFWILLIS',
    'HEAD',
    'IAC',
    # 'ORBIT',
])
|
|
|
|
def is_axial(o):
    """Return True when a rounded ImageOrientationPatient 6-tuple is axial.

    An axial slice has row/column direction cosines (+-1, 0, 0) and
    (0, +-1, 0), so the y/z components of the row vector (indices 1, 2)
    and the x/z components of the column vector (indices 3, 5) are all 0.
    """
    return all(o[idx] == 0 for idx in (1, 2, 3, 5))
|
|
|
|
def check_study(study_dir):
    """Scan one study directory and pick the best axial contrast-enhanced T1 series.

    Walks ``study_dir`` for ``.dcm`` files, groups them by SeriesInstanceUID,
    keeps series whose BodyPartExamined is in ``BodyPartIncluded`` (falling
    back to all series when no BodyPartExamined tag was seen but the
    StudyDescription mentions "brain"), filters to contrast T1 series
    ("+" or "gd" in the description, excluding FLAIR/T2/perfusion), keeps
    axial series only, and returns the series dict with the most files
    (ties broken by the lowest SeriesNumber). Returns None when no suitable
    series exists.

    Side effects: updates the module-level ``BodyPartExamined`` counter and
    reports progress via ``logging``.
    """
    # BUGFIX: StudyDescription was only assigned when a file carried the tag;
    # studies without it (or with no .dcm files) raised UnboundLocalError below.
    StudyDescription = ""
    SeriesDescription = set()
    series = {}
    for root, dirs, files in os.walk(study_dir):
        # NOTE(review): the sort key assumes every file name ends in
        # "_<int>.<ext>"; a stray file would raise ValueError here — confirm
        # the upstream naming convention.
        for file in sorted(files, key=lambda x: int(x.split("_")[-1].split(".")[0])):
            if file.endswith(".dcm"):
                dcm_file = os.path.join(root, file)
                # Header-only read; pixel data is not needed for selection.
                ds = pydicom.dcmread(dcm_file, force=True, stop_before_pixels=True)

                if 'BodyPartExamined' in ds:
                    BodyPartExamined[ds.BodyPartExamined] += 1
                if 'StudyDescription' in ds:
                    StudyDescription = ds.StudyDescription

                # Objects without orientation (e.g. derived/secondary captures)
                # cannot be classified as axial — skip them.
                if 'ImageOrientationPatient' not in ds:
                    continue

                series_instance_uid = ds.SeriesInstanceUID
                SeriesDescription.add(ds.SeriesDescription)

                if series_instance_uid not in series:
                    series[series_instance_uid] = {
                        'FileDataset': ds,      # header of the first file seen
                        '1st_file': dcm_file,
                        'orientations': [],
                        'files': [],
                    }

                series[series_instance_uid]['files'].append(dcm_file)
                series[series_instance_uid]['orientations'].append(tuple(ds.ImageOrientationPatient))

    brain_list = []
    body_parts = set()

    for uid, s in series.items():
        if 'BodyPartExamined' in s['FileDataset']:
            if s['FileDataset'].BodyPartExamined in BodyPartIncluded:
                brain_list.append(s)
            else:
                body_parts.add(s['FileDataset'].BodyPartExamined)

    if not brain_list:
        if body_parts:
            # Everything was tagged, just not with a brain-related body part.
            logging.info(f"no brain, BodyPartExamined: {body_parts}")
            return None
        else:
            logging.info(f"BodyPartExamined is empty")
            # Fallback: trust the study-level description when series tags are missing.
            if 'brain' in StudyDescription.lower():
                logging.info(f"brain in {StudyDescription}, adding all series")
                brain_list = list(series.values())
            else:
                logging.info(f"no brain in {StudyDescription}")
                return None

    t1c = []

    for s in brain_list:
        sd = s['FileDataset'].SeriesDescription.lower()

        # Contrast series are marked with "+" (e.g. "T1+C") or "gd" (gadolinium).
        if not ('+' in sd or 'gd' in sd):
            continue

        # Drop non-T1 contrast series (FLAIR/T2/perfusion) unless explicitly T1.
        if 't1' not in sd and (
            'flair' in sd or
            't2' in sd or
            'perf' in sd  # perfusion series (ep2d_perf)
        ):
            continue

        t1c.append(s)

    if not t1c:
        logging.info(f"no t1c in {StudyDescription}")
        for s in brain_list:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {len(s['files'])}")
        return None

    t1c_axial = []

    for s in t1c:
        # Classify by the most common per-file orientation, rounded to ints.
        c = collections.Counter(s['orientations'])
        orientation_str = c.most_common(1)[0][0]
        orientation_float = tuple(float(f) for f in orientation_str)
        orientation = tuple(round(f) for f in orientation_float)

        s['Orientation'] = orientation
        if is_axial(orientation):
            logging.info(f"--- {s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])}")
            t1c_axial.append(s)

    if not t1c_axial:
        logging.info(f"no axial t1c in {study_dir}")
        for s in t1c:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])} {StudyDescription}")
        return None

    # Most slices wins; on a tie, prefer the smaller SeriesNumber.
    best_series = max(t1c_axial, key=lambda x: (len(x['files']), -x['FileDataset'].SeriesNumber))
    logging.info(f"{best_series['FileDataset'].SeriesNumber} {best_series['FileDataset'].SeriesDescription} {best_series['Orientation']} {len(best_series['files'])}")

    return best_series
|
|
|
|
def hashptid(mrn, hosp='NTUH'):
    """Derive anonymized patient identifiers from a medical record number.

    The MRN is concatenated with the hospital code, upper-cased, and MD5
    hashed. Returns ``(md5_hex, short_hash)`` where ``short_hash`` is the
    first 8 characters of the base32-encoded digest (used as the public
    anonymized patient ID).
    """
    digest = hashlib.md5((mrn + hosp).upper().encode())
    short_hash = base64.b32encode(digest.digest())[:8].decode()
    return digest.hexdigest(), short_hash
|
|
|
|
def anonymize_series_to_nifti(series_files, dst_dir):
    """Anonymize one DICOM series into ``dst_dir``, then convert it to NIfTI.

    Every dataset gets all group-0x0010 (patient) elements blanked and its
    PatientID replaced by the 8-character hash from hashptid(); the scrubbed
    files are written to ``dst_dir``. The directory is then converted with
    dicom2nifti and the resulting volume is copied into ``imagesTs_DIR`` as
    ``<hash>-<StudyDate>_0000.nii.gz``. Records stem -> relative DICOM
    directory in the module-level ``NII_DICT``.

    NOTE(review): ``hash`` and ``ds`` leak out of the loop and are reused
    below — an empty ``series_files`` would raise NameError; confirm callers
    always pass a non-empty list.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for f in series_files:
        ds = pydicom.dcmread(f)
        md5, hash = hashptid(ds.PatientID)
        # Blank every element in the patient identity group (0010,xxxx).
        for elem in ds:
            if elem.tag.group == 0x0010:
                elem.value = ''

        ds.PatientID = hash
        # Keep only the trailing "_<n>.<ext>" fragment of the path as the
        # output name. NOTE(review): assumes no "_" elsewhere breaks this —
        # verify against the source naming convention.
        dst_file = os.path.join(dst_dir, os.path.basename(f.split("_")[-1]))
        ds.save_as(dst_file)

    # Convert the anonymized series; only the .nii.gz result is kept.
    with tempfile.TemporaryDirectory() as tmpdirname:
        dicom2nifti.convert_directory(dst_dir, tmpdirname, compression=True)
        for e in os.scandir(tmpdirname):
            if e.is_file() and e.name.endswith(".nii.gz"):
                stem = f"{hash}-{ds.StudyDate}"
                dst_file = os.path.join(imagesTs_DIR, f"{stem}_0000.nii.gz")
                logging.info(f"copying to {dst_file}")
                shutil.copyfile(e.path, dst_file)
                NII_DICT[stem] = os.path.relpath(dst_dir, DST_ROOT)
|
|
|
|
def main():
    """Walk SRC_ROOT, pick the best axial T1c series per study, anonymize it.

    Expected layout: SRC_ROOT/<pathology>/<patient MRN>/<YYYYMMDD_...>/...
    For each patient (capped at MAX_PATIENT per pathology) every study dated
    up to LAST_DAY is scanned with check_study(); the winning series is
    anonymized and converted by anonymize_series_to_nifti(). A
    "<hash>.complete" marker file under DST_ROOT makes reruns skip finished
    patients. At the end, NII_DICT is persisted to NII_JSON_PATH.
    """

    FORMAT = '%(asctime)s [%(filename)s:%(lineno)d] %(message)s'
    # Log to both the console and a sibling .log file next to this script.
    logging.basicConfig(
        level=logging.INFO,
        format=FORMAT,
        handlers=[
            logging.StreamHandler(),
            # logging.FileHandler(__file__.replace('.py','.%s.log'%str(datetime.datetime.now()).replace(':','')), encoding='utf-8')
            logging.FileHandler(__file__.replace('.py','.log'), encoding='utf-8')
        ]
    )

    # Start each run with a clean NIfTI output folder.
    shutil.rmtree(imagesTs_DIR, ignore_errors=True)
    os.makedirs(imagesTs_DIR, exist_ok=True)

    for patho in sorted(os.listdir(SRC_ROOT)):
        patho_dir = os.path.join(SRC_ROOT, patho)

        num_patient = 0

        for patient in sorted(os.listdir(patho_dir)):
            md5, hash = hashptid(patient)

            # Optional allow-list: when non-empty, process only listed hashes.
            if INCLUDED_HASH:
                if hash not in INCLUDED_HASH:
                    continue

            if hash in EXCLUDED_HASH:
                continue

            patient_dir = os.path.join(patho_dir, patient)
            if not os.path.isdir(patient_dir):
                continue

            # Only process patients whose upstream copy finished.
            if not os.path.isfile(os.path.join(patho_dir, f"{patient}.complete")):
                logging.info(f"skip {patient_dir}")
                continue

            # NOTE(review): duplicate of the hashptid() call above — harmless.
            md5, hash = hashptid(patient)

            dst_patient_dir = os.path.join(DST_ROOT, patho, hash)
            complete_file = os.path.join(DST_ROOT, patho, f'{hash}.complete')

            # Skip patients already fully processed in a previous run.
            if os.path.exists(complete_file):
                logging.info(f"skip {patient_dir}")
                continue

            num_study = 0

            # Newest studies first (directory names start with YYYYMMDD).
            for study in sorted(os.listdir(patient_dir), reverse=True):
                study_date = study.split('_')[0]
                if datetime.datetime.strptime(study_date, "%Y%m%d") > LAST_DAY:
                    logging.info(f"skip {study_date}")
                    continue
                study_dir = os.path.join(patient_dir, study)
                if not os.path.isdir(study_dir):
                    continue

                logging.info(study_dir)
                best_series = check_study(study_dir)
                if not best_series:
                    continue

                dst_dir = os.path.join(dst_patient_dir, study)

                anonymize_series_to_nifti(best_series['files'], dst_dir)
                num_study += 1

            # Only patients with at least one exported study count toward
            # MAX_PATIENT and get a completion marker.
            if num_study > 0:
                with open(complete_file, 'w') as f:
                    f.write('done')
                num_patient += 1
                if num_patient >= MAX_PATIENT:
                    break
        # break

    print(NII_DICT)
    logging.info(f"BodyPartExamined: {BodyPartExamined}")

    # Persist the stem -> DICOM-directory map for traceability.
    with open(NII_JSON_PATH, 'w') as f:
        json.dump(NII_DICT, f, indent=1)


if __name__ == '__main__':
    main()
|