tsha-mri-tumor-labeling/kimo/2-check-series.py

281 lines
9.6 KiB
Python
Raw Normal View History

import base64
import collections
import datetime
import hashlib
import json
import logging
import os
import shelve
import shutil
import tempfile
import dicom2nifti
import pydicom
# Patient hashes to skip entirely (takes precedence over processing).
EXCLUDED_HASH = [
    # "GWJU7LPC",
]
# When non-empty, ONLY these patient hashes are processed (allow-list for reruns).
INCLUDED_HASH = [
    # "GWJU7LPC",
]
# Studies dated strictly after this cutoff are skipped in main().
LAST_DAY = datetime.datetime.strptime("2025-11-01", "%Y-%m-%d")
# Source tree of raw DICOM studies, organized as <patho>/<patient>/<study>.
SRC_ROOT = "/mnt/t24/Public/kimo/TSHA"
RAW_DIR = "/mnt/t24/Public/kimo/raw/"
# Destination for anonymized DICOM copies.
DST_ROOT = os.path.join(RAW_DIR, "DICOM")
# nnU-Net-style test-images folder that receives the converted NIfTI files.
imagesTs_DIR = os.path.join(RAW_DIR, "Dataset2602_BraTS-CK/imagesTs/")
NII_JSON_PATH = os.path.join(RAW_DIR, 'nii.json')
# NII_DICT maps NIfTI stem -> DICOM dir (relative to DST_ROOT); reloaded
# from disk when present so repeated runs accumulate into one mapping.
if os.path.exists(NII_JSON_PATH):
    with open(NII_JSON_PATH, 'r') as f:
        NII_DICT = json.load(f)
else:
    NII_DICT = {}
# {'', 'PELVISLOWEXTREM', 'BRAIN', 'CSPINE', 'KNEE', 'TSPINE', 'CAROTID', 'NECK', 'ABDOMEN', 'ORBIT', 'HEAD', 'CHEST', 'IAC', 'WHOLEBODY', 'WHOLESPINE', 'ABDOMENPELVIS', 'PELVIS', 'LSPINE', 'SPINE', 'CIRCLEOFWILLIS'}
# BodyPartExamined: Counter({'BRAIN': 152087, 'ABDOMEN': 14101, 'HEAD': 11806, 'ABDOMENPELVIS': 10905, 'SPINE': 9277, 'CHEST': 3746, 'PELVIS': 3208, 'NECK': 3205, 'CSPINE': 1527, 'CAROTID': 1186,
# 'HEART': 1122, 'LSPINE': 1080, 'KNEE': 591, 'PELVISLOWEXTREM': 496, '': 385, 'ORBIT': 360, 'CIRCLEOFWILLIS': 322, 'HUMERUS': 320, 'ARM': 304, 'IAC': 291,
# 'EXTREMITY': 287, 'SHOULDER': 242, 'WHOLEBODY': 190, 'TSPINE': 150, 'HEADNECK': 48, 'WHOLESPINE': 45})
# Global tally of every BodyPartExamined tag seen, reported at the end of main().
BodyPartExamined = collections.Counter()
# BodyPartExamined values treated as brain/head regions worth keeping.
BodyPartIncluded = set([
    'BRAIN',
    'CIRCLEOFWILLIS',
    'HEAD',
    'IAC',
    # 'ORBIT',
])
def is_axial(o):
    """Return True when a rounded ImageOrientationPatient 6-tuple is axial.

    An axial slice has all cross components of its row/column direction
    cosines (indices 1, 2, 3 and 5) equal to zero.
    """
    return all(o[i] == 0 for i in (1, 2, 3, 5))
def check_study(study_dir):
    """Scan one study directory and pick the best axial contrast-enhanced T1 series.

    Walks every ``*.dcm`` under *study_dir*, groups slices by
    SeriesInstanceUID, then filters in three stages: brain/head body part,
    contrast-enhanced T1 description, axial orientation.  Among surviving
    candidates, the series with the most slices wins (ties broken by the
    smaller SeriesNumber).

    Returns the winning series dict (keys 'FileDataset', '1st_file',
    'orientations', 'files', 'Orientation') or None when no suitable
    series is found.
    """
    SeriesDescription = set()
    # Fix: StudyDescription was previously only bound inside the walk loop,
    # so a study with no DICOM carrying that tag (or an empty directory)
    # raised UnboundLocalError at the fallback checks below.
    StudyDescription = ''
    series = {}
    for root, dirs, files in os.walk(study_dir):
        # NOTE(review): sort key assumes names end in "_<number>.dcm";
        # a non-conforming file name would raise ValueError -- TODO confirm.
        for file in sorted(files, key=lambda x: int(x.split("_")[-1].split(".")[0])):
            if file.endswith(".dcm"):
                dcm_file = os.path.join(root, file)
                ds = pydicom.dcmread(dcm_file, force=True, stop_before_pixels=True)
                if 'BodyPartExamined' in ds:
                    BodyPartExamined[ds.BodyPartExamined] += 1
                if 'StudyDescription' in ds:
                    StudyDescription = ds.StudyDescription
                if 'ImageOrientationPatient' not in ds:
                    # Orientation is required for the axial check below.
                    continue
                series_instance_uid = ds.SeriesInstanceUID
                SeriesDescription.add(ds.SeriesDescription)
                if series_instance_uid not in series:
                    series[series_instance_uid] = {
                        'FileDataset': ds,      # first dataset of the series
                        '1st_file': dcm_file,
                        'orientations': [],
                        'files': [],
                    }
                series[series_instance_uid]['files'].append(dcm_file)
                series[series_instance_uid]['orientations'].append(tuple(ds.ImageOrientationPatient))
    # Stage 1: keep only series whose BodyPartExamined is a head/brain region.
    brain_list = []
    body_parts = set()
    for uid, s in series.items():
        if 'BodyPartExamined' in s['FileDataset']:
            if s['FileDataset'].BodyPartExamined in BodyPartIncluded:
                brain_list.append(s)
            else:
                body_parts.add(s['FileDataset'].BodyPartExamined)
    if not brain_list:
        if body_parts:
            logging.info(f"no brain, BodyPartExamined: {body_parts}")
            return None
        else:
            # No usable BodyPartExamined tag anywhere; fall back to the
            # study description to decide whether this is a brain study.
            logging.info(f"BodyPartExamined is empty")
            if 'brain' in StudyDescription.lower():
                logging.info(f"brain in {StudyDescription}, adding all series")
                brain_list = list(series.values())
            else:
                logging.info(f"no brain in {StudyDescription}")
                return None
    # Stage 2: contrast-enhanced T1 -- description mentions '+' or 'gd',
    # and is not FLAIR / T2 / perfusion unless it also mentions 't1'.
    t1c = []
    for s in brain_list:
        sd = s['FileDataset'].SeriesDescription.lower()
        if not ('+' in sd or 'gd' in sd):
            continue
        if 't1' not in sd and (
            'flair' in sd or
            't2' in sd or
            'perf' in sd  # perfusion series (ep2d_perf)
        ):
            continue
        t1c.append(s)
    if not t1c:
        logging.info(f"no t1c in {StudyDescription}")
        for s in brain_list:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {len(s['files'])}")
        return None
    # Stage 3: keep only axially-oriented candidates.  The series orientation
    # is the most common per-slice orientation, rounded to integers.
    t1c_axial = []
    for s in t1c:
        c = collections.Counter(s['orientations'])
        orientation_str = c.most_common(1)[0][0]
        orientation_float = tuple(float(f) for f in orientation_str)
        orientation = tuple(round(f) for f in orientation_float)
        s['Orientation'] = orientation
        if is_axial(orientation):
            logging.info(f"--- {s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])}")
            t1c_axial.append(s)
    if not t1c_axial:
        logging.info(f"no axial t1c in {study_dir}")
        for s in t1c:
            logging.info(f"{s['FileDataset'].SeriesNumber} {s['FileDataset'].SeriesDescription} {s['Orientation']} {len(s['files'])} {StudyDescription}")
        return None
    # Most slices wins; on a tie, prefer the smaller SeriesNumber.
    best_series = max(t1c_axial, key=lambda x: (len(x['files']), -x['FileDataset'].SeriesNumber))
    logging.info(f"{best_series['FileDataset'].SeriesNumber} {best_series['FileDataset'].SeriesDescription} {best_series['Orientation']} {len(best_series['files'])}")
    return best_series
def hashptid(mrn, hosp='NTUH'):
    """Derive an anonymized patient identifier from a medical record number.

    The MRN is salted with the hospital code, upper-cased, and MD5-hashed.
    Returns a tuple of (full hex digest, 8-character base32 short hash).
    """
    salted = f"{mrn}{hosp}".upper().encode()
    digest = hashlib.md5(salted)
    hex_id = digest.hexdigest()
    short_id = base64.b32encode(digest.digest())[:8].decode()
    return hex_id, short_id
def anonymize_series_to_nifti(series_files, dst_dir):
    """Anonymize the DICOM slices of one series and convert them to NIfTI.

    Writes de-identified copies of *series_files* into *dst_dir*, then runs
    dicom2nifti on that directory and copies the resulting .nii.gz into
    imagesTs_DIR under an anonymized stem, recording the stem -> source-dir
    mapping in the module-level NII_DICT.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for f in series_files:
        ds = pydicom.dcmread(f)
        md5, hash = hashptid(ds.PatientID)
        # Blank every element in DICOM group 0x0010 (patient information),
        # then replace PatientID with the salted short hash.
        for elem in ds:
            if elem.tag.group == 0x0010:
                elem.value = ''
        ds.PatientID = hash
        dst_file = os.path.join(dst_dir, os.path.basename(f.split("_")[-1]))
        ds.save_as(dst_file)
    # NOTE(review): `hash` and `ds` below hold values from the LAST loop
    # iteration; an empty *series_files* would raise NameError here --
    # callers currently always pass a non-empty file list.
    with tempfile.TemporaryDirectory() as tmpdirname:
        dicom2nifti.convert_directory(dst_dir, tmpdirname, compression=True)
        for e in os.scandir(tmpdirname):
            if e.is_file() and e.name.endswith(".nii.gz"):
                # Stem combines the anonymized patient hash with the study
                # date; "_0000" is the image-channel suffix.
                stem = f"{hash}-{ds.StudyDate}"
                dst_file = os.path.join(imagesTs_DIR, f"{stem}_0000.nii.gz")
                logging.info(f"copying to {dst_file}")
                shutil.copyfile(e.path, dst_file)
                NII_DICT[stem] = os.path.relpath(dst_dir, DST_ROOT)
def main():
    """Walk SRC_ROOT, pick the best series per study, anonymize and convert.

    For each pathology/patient marked complete on the source side, scans
    every study up to LAST_DAY, converts the best axial T1c series to NIfTI,
    and writes a ``<hash>.complete`` marker so reruns skip finished patients.
    Finally persists the stem->dir mapping to NII_JSON_PATH.
    """
    FORMAT = '%(asctime)s [%(filename)s:%(lineno)d] %(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=FORMAT,
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler(__file__.replace('.py', '.log'), encoding='utf-8')
        ]
    )
    # Output NIfTI folder is rebuilt from scratch on every run.
    shutil.rmtree(imagesTs_DIR, ignore_errors=True)
    os.makedirs(imagesTs_DIR, exist_ok=True)
    for patho in sorted(os.listdir(SRC_ROOT)):
        patho_dir = os.path.join(SRC_ROOT, patho)
        for patient in sorted(os.listdir(patho_dir)):
            # Fix: hashptid(patient) was computed twice per patient; compute once.
            md5, hash = hashptid(patient)
            if INCLUDED_HASH:
                if hash not in INCLUDED_HASH:
                    continue
            if hash in EXCLUDED_HASH:
                continue
            patient_dir = os.path.join(patho_dir, patient)
            if not os.path.isdir(patient_dir):
                continue
            # Only process patients whose source transfer is marked complete.
            if not os.path.isfile(os.path.join(patho_dir, f"{patient}.complete")):
                logging.info(f"skip {patient_dir}")
                continue
            dst_patient_dir = os.path.join(DST_ROOT, patho, hash)
            complete_file = os.path.join(DST_ROOT, patho, f'{hash}.complete')
            # Skip patients already finished in a previous run.
            if os.path.exists(complete_file):
                logging.info(f"skip {patient_dir}")
                continue
            num_study = 0
            for study in sorted(os.listdir(patient_dir), reverse=True):
                # Study folder names start with "YYYYMMDD_".
                study_date = study.split('_')[0]
                if datetime.datetime.strptime(study_date, "%Y%m%d") > LAST_DAY:
                    logging.info(f"skip {study_date}")
                    continue
                study_dir = os.path.join(patient_dir, study)
                if not os.path.isdir(study_dir):
                    continue
                logging.info(study_dir)
                best_series = check_study(study_dir)
                if not best_series:
                    continue
                dst_dir = os.path.join(dst_patient_dir, study)
                anonymize_series_to_nifti(best_series['files'], dst_dir)
                num_study += 1
            # Only mark the patient done when at least one study converted.
            if num_study > 0:
                with open(complete_file, 'w') as f:
                    f.write('done')
    print(NII_DICT)
    logging.info(f"BodyPartExamined: {BodyPartExamined}")
    with open(NII_JSON_PATH, 'w') as f:
        json.dump(NII_DICT, f, indent=1)
if __name__ == '__main__':
    main()