"""Select, anonymize, and convert brain T1c axial DICOM series to NIfTI.

Walks SRC_ROOT (pathology/patient/study layout), picks the best axial
contrast-enhanced T1 series per study via ``check_study``, anonymizes the
DICOM files (patient group 0x0010 blanked, PatientID replaced by a salted
hash), converts to compressed NIfTI, and records the output mapping in
``nii.json``.
"""
import base64
import collections
import datetime
import hashlib
import json
import logging
import os
import shelve
import shutil
import tempfile

import dicom2nifti
import pydicom

# Hash-based allow/deny lists for patients (see hashptid()).
EXCLUDED_HASH = [
    # "GWJU7LPC",
]
INCLUDED_HASH = [
    # "GWJU7LPC",
]

# Studies dated after this are skipped.
LAST_DAY = datetime.datetime.strptime("2025-11-01", "%Y-%m-%d")

SRC_ROOT = "/mnt/t24/Public/kimo/TSHA"
RAW_DIR = "/mnt/t24/Public/kimo/raw/"
DST_ROOT = os.path.join(RAW_DIR, "DICOM")
imagesTs_DIR = os.path.join(RAW_DIR, "Dataset2602_BraTS-CK/imagesTs/")
NII_JSON_PATH = os.path.join(RAW_DIR, 'nii.json')

# Resume support: reload the stem -> relative-DICOM-dir mapping if present.
if os.path.exists(NII_JSON_PATH):
    with open(NII_JSON_PATH, 'r') as f:
        NII_DICT = json.load(f)
else:
    NII_DICT = {}

# Observed values of the BodyPartExamined tag across the corpus, for reference:
# {'', 'PELVISLOWEXTREM', 'BRAIN', 'CSPINE', 'KNEE', 'TSPINE', 'CAROTID', 'NECK', 'ABDOMEN', 'ORBIT', 'HEAD', 'CHEST', 'IAC', 'WHOLEBODY', 'WHOLESPINE', 'ABDOMENPELVIS', 'PELVIS', 'LSPINE', 'SPINE', 'CIRCLEOFWILLIS'}
# BodyPartExamined: Counter({'BRAIN': 152087, 'ABDOMEN': 14101, 'HEAD': 11806, 'ABDOMENPELVIS': 10905, 'SPINE': 9277, 'CHEST': 3746, 'PELVIS': 3208, 'NECK': 3205, 'CSPINE': 1527, 'CAROTID': 1186,
# 'HEART': 1122, 'LSPINE': 1080, 'KNEE': 591, 'PELVISLOWEXTREM': 496, '': 385, 'ORBIT': 360, 'CIRCLEOFWILLIS': 322, 'HUMERUS': 320, 'ARM': 304, 'IAC': 291,
# 'EXTREMITY': 287, 'SHOULDER': 242, 'WHOLEBODY': 190, 'TSPINE': 150, 'HEADNECK': 48, 'WHOLESPINE': 45})

# Global tally of every BodyPartExamined value seen, reported at end of main().
BodyPartExamined = collections.Counter()
# Body parts accepted as "brain" for series selection.
BodyPartIncluded = set([
    'BRAIN',
    'CIRCLEOFWILLIS',
    'HEAD',
    'IAC',
    # 'ORBIT',
])


def is_axial(o):
    """Return True if a rounded 6-tuple ImageOrientationPatient is axial.

    An axial orientation has row/column cosines (1,0,0)/(0,1,0) up to sign,
    i.e. the y/z components of the row vector and the x/z components of the
    column vector are zero.
    """
    return o[1] == 0 and o[2] == 0 and o[3] == 0 and o[5] == 0


def check_study(study_dir):
    """Scan one study directory and pick the best axial T1c series.

    Groups DICOM files by SeriesInstanceUID, keeps series whose
    BodyPartExamined is in BodyPartIncluded (falling back to the
    StudyDescription containing 'brain' when the tag is absent everywhere),
    filters to contrast-enhanced T1 series, and among the axial ones returns
    the series dict with the most files (SeriesNumber breaks ties, lower
    wins). Returns None when no suitable series exists.
    """
    SeriesDescription = set()
    series = {}
    # FIX: was only assigned when the tag was present, causing a NameError
    # in the logging/fallback paths below when no file carried it.
    StudyDescription = ''
    for root, dirs, files in os.walk(study_dir):
        # FIX: filter to .dcm BEFORE applying the numeric sort key; the key
        # crashed with ValueError on any non-conforming filename.
        dcm_names = [name for name in files if name.endswith(".dcm")]
        for file in sorted(dcm_names, key=lambda x: int(x.split("_")[-1].split(".")[0])):
            dcm_file = os.path.join(root, file)
            # Headers only; pixel data is not needed for selection.
            ds = pydicom.dcmread(dcm_file, force=True, stop_before_pixels=True)
            if 'BodyPartExamined' in ds:
                BodyPartExamined[ds.BodyPartExamined] += 1
            if 'StudyDescription' in ds:
                StudyDescription = ds.StudyDescription
            if 'ImageOrientationPatient' not in ds:
                continue
            # print(f"{dcm_file}")
            series_instance_uid = ds.SeriesInstanceUID
            SeriesDescription.add(ds.SeriesDescription)
            # print(body_part_examined, series_description)
            if series_instance_uid not in series:
                series[series_instance_uid] = {
                    'FileDataset': ds,   # headers of the first file seen
                    '1st_file': dcm_file,
                    'orientations': [],
                    'files': [],
                }
            series[series_instance_uid]['files'].append(dcm_file)
            series[series_instance_uid]['orientations'].append(tuple(ds.ImageOrientationPatient))
            # print(ds.ImageOrientationPatient)
            # exit()

    brain_list = []
    body_parts = set()
    for uid, s in series.items():
        # logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].BodyPartExamined} {s['FileDataset'].SeriesDescription} {len(s['files'])} {s['1st_file']}")
        if 'BodyPartExamined' in s['FileDataset']:
            if s['FileDataset'].BodyPartExamined in BodyPartIncluded:
                brain_list.append(s)
            else:
                body_parts.add(s['FileDataset'].BodyPartExamined)
    if not brain_list:
        if body_parts:
            # Tags exist but none is a brain part: definitely not a brain study.
            logging.info(f"no brain, BodyPartExamined: {body_parts}")
            return None
        else:
            logging.info(f"BodyPartExamined is empty")
            # Fall back to the study description when the tag is missing.
            if 'brain' in StudyDescription.lower():
                logging.info(f"brain in {StudyDescription}, adding all series")
                brain_list = list(series.values())
                # print(series)
                # print(brain_list)
                # exit()
            else:
                logging.info(f"no brain in {StudyDescription}")
                return None

    t1c = []
    for s in brain_list:
        sd = s['FileDataset'].SeriesDescription.lower()
        # Contrast-enhanced series are marked with '+' or 'gd' in the description.
        if not ('+' in sd or 'gd' in sd):
            continue
        # Reject non-T1 contrast series unless 't1' is explicitly present.
        if 't1' not in sd and (
            'flair' in sd or
            't2' in sd or
            'perf' in sd  # perfusion series (ep2d_perf)
        ):
            continue
        t1c.append(s)
    if not t1c:
        logging.info(f"no t1c in {StudyDescription}")
        for s in brain_list:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {len(s['files'])}")
        return None

    t1c_axial = []
    for s in t1c:
        # Use the most common orientation within the series (some series mix
        # localizer slices in), rounded to integers for the axial test.
        c = collections.Counter(s['orientations'])
        orientation_str = c.most_common(1)[0][0]
        orientation_float = tuple(float(f) for f in orientation_str)
        orientation = tuple(round(f) for f in orientation_float)
        s['Orientation'] = orientation
        if is_axial(orientation):
            logging.info(f"--- {s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])}")
            t1c_axial.append(s)
    if not t1c_axial:
        logging.info(f"no axial t1c in {study_dir}")
        for s in t1c:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])} {StudyDescription}")
        return None

    # Most files wins; for equal counts prefer the LOWER SeriesNumber.
    best_series = max(t1c_axial, key=lambda x: (len(x['files']), -x['FileDataset'].SeriesNumber))
    # best_series = min(t1c_axial, key=lambda x: len(x['files'], x['FileDataset'].SeriesNumber)))
    logging.info(f"{best_series['FileDataset'].SeriesNumber} {best_series['FileDataset'].SeriesDescription} {best_series['Orientation']} {len(best_series['files'])}")
    return best_series


def hashptid(mrn, hosp='NTUH'):
    """Derive a stable pseudonym from a patient MRN salted with the hospital.

    Returns (md5_hexdigest, hash) where hash is the first 8 characters of the
    base32-encoded MD5 digest — the short ID used for directory names and
    PatientID replacement.
    """
    ptsalt = (mrn + hosp).upper().encode()
    hash_in_bytes = hashlib.md5(ptsalt)
    md5 = hash_in_bytes.hexdigest()
    hash = base64.b32encode(hash_in_bytes.digest())[:8].decode()
    return md5, hash


def anonymize_series_to_nifti(series_files, dst_dir):
    """Anonymize one DICOM series into dst_dir and convert it to NIfTI.

    Every element in the patient group (0x0010) is blanked and PatientID is
    replaced with the salted hash. The anonymized series is converted with
    dicom2nifti; resulting .nii.gz files are copied into imagesTs_DIR as
    '{hash}-{StudyDate}_0000.nii.gz' and the stem is recorded in NII_DICT.
    """
    # FIX: guard empty input — `hash` and `ds` below are the values from the
    # last loop iteration and would raise NameError without any files.
    if not series_files:
        return
    os.makedirs(dst_dir, exist_ok=True)
    for f in series_files:
        ds = pydicom.dcmread(f)
        md5, hash = hashptid(ds.PatientID)
        # Blank every attribute in the patient information group.
        for elem in ds:
            if elem.tag.group == 0x0010:
                elem.value = ''
        ds.PatientID = hash
        # Keep only the trailing numeric part of the original filename.
        dst_file = os.path.join(dst_dir, os.path.basename(f.split("_")[-1]))
        ds.save_as(dst_file)
    with tempfile.TemporaryDirectory() as tmpdirname:
        dicom2nifti.convert_directory(dst_dir, tmpdirname, compression=True)
        for e in os.scandir(tmpdirname):
            if e.is_file() and e.name.endswith(".nii.gz"):
                stem = f"{hash}-{ds.StudyDate}"
                dst_file = os.path.join(imagesTs_DIR, f"{stem}_0000.nii.gz")
                logging.info(f"copying to {dst_file}")
                shutil.copyfile(e.path, dst_file)
                # Remember where the anonymized DICOMs live, relative to DST_ROOT.
                NII_DICT[stem] = os.path.relpath(dst_dir, DST_ROOT)


def main():
    """Walk SRC_ROOT, process each completed patient, and write nii.json."""
    FORMAT = '%(asctime)s [%(filename)s:%(lineno)d] %(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=FORMAT,
        handlers=[
            logging.StreamHandler(),
            # logging.FileHandler(__file__.replace('.py','.%s.log'%str(datetime.datetime.now()).replace(':','')), encoding='utf-8')
            logging.FileHandler(__file__.replace('.py', '.log'), encoding='utf-8'),
        ],
    )
    # imagesTs is rebuilt from scratch on every run.
    shutil.rmtree(imagesTs_DIR, ignore_errors=True)
    os.makedirs(imagesTs_DIR, exist_ok=True)
    for patho in sorted(os.listdir(SRC_ROOT)):
        patho_dir = os.path.join(SRC_ROOT, patho)
        for patient in sorted(os.listdir(patho_dir)):
            # FIX: hashptid(patient) was computed twice per patient; once is enough.
            md5, hash = hashptid(patient)
            if INCLUDED_HASH:
                if hash not in INCLUDED_HASH:
                    continue
            if hash in EXCLUDED_HASH:
                continue
            patient_dir = os.path.join(patho_dir, patient)
            if not os.path.isdir(patient_dir):
                continue
            # Only process patients whose upload is marked complete.
            if not os.path.isfile(os.path.join(patho_dir, f"{patient}.complete")):
                logging.info(f"skip {patient_dir}")
                continue
            dst_patient_dir = os.path.join(DST_ROOT, patho, hash)
            complete_file = os.path.join(DST_ROOT, patho, f'{hash}.complete')
            # Skip patients already processed on a previous run.
            if os.path.exists(complete_file):
                logging.info(f"skip {patient_dir}")
                continue
            num_study = 0
            for study in sorted(os.listdir(patient_dir), reverse=True):
                # Study folder names start with the date as YYYYMMDD.
                study_date = study.split('_')[0]
                if datetime.datetime.strptime(study_date, "%Y%m%d") > LAST_DAY:
                    logging.info(f"skip {study_date}")
                    continue
                study_dir = os.path.join(patient_dir, study)
                if not os.path.isdir(study_dir):
                    continue
                logging.info(study_dir)
                best_series = check_study(study_dir)
                if not best_series:
                    continue
                dst_dir = os.path.join(dst_patient_dir, study)
                anonymize_series_to_nifti(best_series['files'], dst_dir)
                num_study += 1
            # Mark the patient done only if at least one study was converted.
            if num_study > 0:
                with open(complete_file, 'w') as f:
                    f.write('done')
            # break
        # break
    print(NII_DICT)
    logging.info(f"BodyPartExamined: {BodyPartExamined}")
    with open(NII_JSON_PATH, 'w') as f:
        json.dump(NII_DICT, f, indent=1)


if __name__ == '__main__':
    main()