tsha-mri-tumor-labeling/kimo/2-check-series.py

281 lines
9.6 KiB
Python
Raw Normal View History

import base64
import collections
import datetime
import hashlib
import json
import logging
import os
import shelve
import shutil
import tempfile
import dicom2nifti
import pydicom
# Patient hashes to skip entirely (takes precedence over processing).
EXCLUDED_HASH = [
    # "GWJU7LPC",
]
# When non-empty, ONLY these patient hashes are processed (allow-list for reruns).
INCLUDED_HASH = [
    # "GWJU7LPC",
]
# Studies dated strictly after this cutoff are skipped in main().
LAST_DAY = datetime.datetime.strptime("2025-11-01", "%Y-%m-%d")
# Source tree of raw DICOM studies, organized as <patho>/<patient>/<study>.
SRC_ROOT = "/mnt/t24/Public/kimo/TSHA"
RAW_DIR = "/mnt/t24/Public/kimo/raw/"
# Destination for anonymized DICOM copies.
DST_ROOT = os.path.join(RAW_DIR, "DICOM")
# nnU-Net-style test-images folder that receives the converted NIfTI files.
imagesTs_DIR = os.path.join(RAW_DIR, "Dataset2602_BraTS-CK/imagesTs/")
NII_JSON_PATH = os.path.join(RAW_DIR, 'nii.json')
# NII_DICT maps NIfTI stem -> DICOM dir (relative to DST_ROOT); reloaded
# from disk when present so repeated runs accumulate into one mapping.
if os.path.exists(NII_JSON_PATH):
    with open(NII_JSON_PATH, 'r') as f:
        NII_DICT = json.load(f)
else:
    NII_DICT = {}
# {'', 'PELVISLOWEXTREM', 'BRAIN', 'CSPINE', 'KNEE', 'TSPINE', 'CAROTID', 'NECK', 'ABDOMEN', 'ORBIT', 'HEAD', 'CHEST', 'IAC', 'WHOLEBODY', 'WHOLESPINE', 'ABDOMENPELVIS', 'PELVIS', 'LSPINE', 'SPINE', 'CIRCLEOFWILLIS'}
# BodyPartExamined: Counter({'BRAIN': 152087, 'ABDOMEN': 14101, 'HEAD': 11806, 'ABDOMENPELVIS': 10905, 'SPINE': 9277, 'CHEST': 3746, 'PELVIS': 3208, 'NECK': 3205, 'CSPINE': 1527, 'CAROTID': 1186,
# 'HEART': 1122, 'LSPINE': 1080, 'KNEE': 591, 'PELVISLOWEXTREM': 496, '': 385, 'ORBIT': 360, 'CIRCLEOFWILLIS': 322, 'HUMERUS': 320, 'ARM': 304, 'IAC': 291,
# 'EXTREMITY': 287, 'SHOULDER': 242, 'WHOLEBODY': 190, 'TSPINE': 150, 'HEADNECK': 48, 'WHOLESPINE': 45})
# Global tally of every BodyPartExamined tag seen, reported at the end of main().
BodyPartExamined = collections.Counter()
# BodyPartExamined values treated as brain/head regions worth keeping.
BodyPartIncluded = set([
    'BRAIN',
    'CIRCLEOFWILLIS',
    'HEAD',
    'IAC',
    # 'ORBIT',
])
def is_axial(o):
    """Return True when a rounded ImageOrientationPatient 6-tuple is axial.

    An axial slice has all cross components of its row/column direction
    cosines (indices 1, 2, 3 and 5) equal to zero.
    """
    return all(o[i] == 0 for i in (1, 2, 3, 5))
def check_study(study_dir):
    """Scan one study directory and pick the best axial contrast-enhanced T1 series.

    Walks every ``*.dcm`` under *study_dir*, groups slices by
    SeriesInstanceUID, then filters in three stages: brain/head body part,
    contrast-enhanced T1 description, axial orientation.  Among surviving
    candidates, the series with the most slices wins (ties broken by the
    smaller SeriesNumber).

    Returns the winning series dict (keys 'FileDataset', '1st_file',
    'orientations', 'files', 'Orientation') or None when no suitable
    series is found.
    """
    SeriesDescription = set()
    # Fix: StudyDescription was previously only bound inside the walk loop,
    # so a study with no DICOM carrying that tag (or an empty directory)
    # raised UnboundLocalError at the fallback checks below.
    StudyDescription = ''
    series = {}
    for root, dirs, files in os.walk(study_dir):
        # NOTE(review): sort key assumes names end in "_<number>.dcm";
        # a non-conforming file name would raise ValueError -- TODO confirm.
        for file in sorted(files, key=lambda x: int(x.split("_")[-1].split(".")[0])):
            if file.endswith(".dcm"):
                dcm_file = os.path.join(root, file)
                ds = pydicom.dcmread(dcm_file, force=True, stop_before_pixels=True)
                if 'BodyPartExamined' in ds:
                    BodyPartExamined[ds.BodyPartExamined] += 1
                if 'StudyDescription' in ds:
                    StudyDescription = ds.StudyDescription
                if 'ImageOrientationPatient' not in ds:
                    # Orientation is required for the axial check below.
                    continue
                series_instance_uid = ds.SeriesInstanceUID
                SeriesDescription.add(ds.SeriesDescription)
                if series_instance_uid not in series:
                    series[series_instance_uid] = {
                        'FileDataset': ds,      # first dataset of the series
                        '1st_file': dcm_file,
                        'orientations': [],
                        'files': [],
                    }
                series[series_instance_uid]['files'].append(dcm_file)
                series[series_instance_uid]['orientations'].append(tuple(ds.ImageOrientationPatient))
    # Stage 1: keep only series whose BodyPartExamined is a head/brain region.
    brain_list = []
    body_parts = set()
    for uid, s in series.items():
        if 'BodyPartExamined' in s['FileDataset']:
            if s['FileDataset'].BodyPartExamined in BodyPartIncluded:
                brain_list.append(s)
            else:
                body_parts.add(s['FileDataset'].BodyPartExamined)
    if not brain_list:
        if body_parts:
            logging.info(f"no brain, BodyPartExamined: {body_parts}")
            return None
        else:
            # No usable BodyPartExamined tag anywhere; fall back to the
            # study description to decide whether this is a brain study.
            logging.info(f"BodyPartExamined is empty")
            if 'brain' in StudyDescription.lower():
                logging.info(f"brain in {StudyDescription}, adding all series")
                brain_list = list(series.values())
            else:
                logging.info(f"no brain in {StudyDescription}")
                return None
    # Stage 2: contrast-enhanced T1 -- description mentions '+' or 'gd',
    # and is not FLAIR / T2 / perfusion unless it also mentions 't1'.
    t1c = []
    for s in brain_list:
        sd = s['FileDataset'].SeriesDescription.lower()
        if not ('+' in sd or 'gd' in sd):
            continue
        if 't1' not in sd and (
            'flair' in sd or
            't2' in sd or
            'perf' in sd  # perfusion series (ep2d_perf)
        ):
            continue
        t1c.append(s)
    if not t1c:
        logging.info(f"no t1c in {StudyDescription}")
        for s in brain_list:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {len(s['files'])}")
        return None
    # Stage 3: keep only axially-oriented candidates.  The series orientation
    # is the most common per-slice orientation, rounded to integers.
    t1c_axial = []
    for s in t1c:
        c = collections.Counter(s['orientations'])
        orientation_str = c.most_common(1)[0][0]
        orientation_float = tuple(float(f) for f in orientation_str)
        orientation = tuple(round(f) for f in orientation_float)
        s['Orientation'] = orientation
        if is_axial(orientation):
            logging.info(f"--- {s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])}")
            t1c_axial.append(s)
    if not t1c_axial:
        logging.info(f"no axial t1c in {study_dir}")
        for s in t1c:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])} {StudyDescription}")
        return None
    # Most slices wins; on a tie, prefer the smaller SeriesNumber.
    best_series = max(t1c_axial, key=lambda x: (len(x['files']), -x['FileDataset'].SeriesNumber))
    logging.info(f"{best_series['FileDataset'].SeriesNumber} {best_series['FileDataset'].SeriesDescription} {best_series['Orientation']} {len(best_series['files'])}")
    return best_series
def hashptid(mrn, hosp='NTUH'):
    """Derive an anonymized patient identifier from a medical record number.

    The MRN is salted with the hospital code, upper-cased, and MD5-hashed.
    Returns a tuple of (full hex digest, 8-character base32 short hash).
    """
    salted = f"{mrn}{hosp}".upper().encode()
    digest = hashlib.md5(salted)
    hex_id = digest.hexdigest()
    short_id = base64.b32encode(digest.digest())[:8].decode()
    return hex_id, short_id
def anonymize_series_to_nifti(series_files, dst_dir):
    """Anonymize the DICOM slices of one series and convert them to NIfTI.

    Writes de-identified copies of *series_files* into *dst_dir*, then runs
    dicom2nifti on that directory and copies the resulting .nii.gz into
    imagesTs_DIR under an anonymized stem, recording the stem -> source-dir
    mapping in the module-level NII_DICT.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for f in series_files:
        ds = pydicom.dcmread(f)
        md5, hash = hashptid(ds.PatientID)
        # Blank every element in DICOM group 0x0010 (patient information),
        # then replace PatientID with the salted short hash.
        for elem in ds:
            if elem.tag.group == 0x0010:
                elem.value = ''
        ds.PatientID = hash
        dst_file = os.path.join(dst_dir, os.path.basename(f.split("_")[-1]))
        ds.save_as(dst_file)
    # NOTE(review): `hash` and `ds` below hold values from the LAST loop
    # iteration; an empty *series_files* would raise NameError here --
    # callers currently always pass a non-empty file list.
    with tempfile.TemporaryDirectory() as tmpdirname:
        dicom2nifti.convert_directory(dst_dir, tmpdirname, compression=True)
        for e in os.scandir(tmpdirname):
            if e.is_file() and e.name.endswith(".nii.gz"):
                # Stem combines the anonymized patient hash with the study
                # date; "_0000" is the image-channel suffix.
                stem = f"{hash}-{ds.StudyDate}"
                dst_file = os.path.join(imagesTs_DIR, f"{stem}_0000.nii.gz")
                logging.info(f"copying to {dst_file}")
                shutil.copyfile(e.path, dst_file)
                NII_DICT[stem] = os.path.relpath(dst_dir, DST_ROOT)
def main():
    """Walk SRC_ROOT, pick the best series per study, anonymize and convert.

    For each pathology/patient marked complete on the source side, scans
    every study up to LAST_DAY, converts the best axial T1c series to NIfTI,
    and writes a ``<hash>.complete`` marker so reruns skip finished patients.
    Finally persists the stem->dir mapping to NII_JSON_PATH.
    """
    FORMAT = '%(asctime)s [%(filename)s:%(lineno)d] %(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=FORMAT,
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler(__file__.replace('.py', '.log'), encoding='utf-8')
        ]
    )
    # Output NIfTI folder is rebuilt from scratch on every run.
    shutil.rmtree(imagesTs_DIR, ignore_errors=True)
    os.makedirs(imagesTs_DIR, exist_ok=True)
    for patho in sorted(os.listdir(SRC_ROOT)):
        patho_dir = os.path.join(SRC_ROOT, patho)
        for patient in sorted(os.listdir(patho_dir)):
            # Fix: hashptid(patient) was computed twice per patient; compute once.
            md5, hash = hashptid(patient)
            if INCLUDED_HASH:
                if hash not in INCLUDED_HASH:
                    continue
            if hash in EXCLUDED_HASH:
                continue
            patient_dir = os.path.join(patho_dir, patient)
            if not os.path.isdir(patient_dir):
                continue
            # Only process patients whose source transfer is marked complete.
            if not os.path.isfile(os.path.join(patho_dir, f"{patient}.complete")):
                logging.info(f"skip {patient_dir}")
                continue
            dst_patient_dir = os.path.join(DST_ROOT, patho, hash)
            complete_file = os.path.join(DST_ROOT, patho, f'{hash}.complete')
            # Skip patients already finished in a previous run.
            if os.path.exists(complete_file):
                logging.info(f"skip {patient_dir}")
                continue
            num_study = 0
            for study in sorted(os.listdir(patient_dir), reverse=True):
                # Study folder names start with "YYYYMMDD_".
                study_date = study.split('_')[0]
                if datetime.datetime.strptime(study_date, "%Y%m%d") > LAST_DAY:
                    logging.info(f"skip {study_date}")
                    continue
                study_dir = os.path.join(patient_dir, study)
                if not os.path.isdir(study_dir):
                    continue
                logging.info(study_dir)
                best_series = check_study(study_dir)
                if not best_series:
                    continue
                dst_dir = os.path.join(dst_patient_dir, study)
                anonymize_series_to_nifti(best_series['files'], dst_dir)
                num_study += 1
            # Only mark the patient done when at least one study converted.
            if num_study > 0:
                with open(complete_file, 'w') as f:
                    f.write('done')
    print(NII_DICT)
    logging.info(f"BodyPartExamined: {BodyPartExamined}")
    with open(NII_JSON_PATH, 'w') as f:
        json.dump(NII_DICT, f, indent=1)
if __name__ == '__main__':
    main()