Merge pull request #3 from czyjtu/feature/data-utils

Initial
czyjtu · Apr 8, 2023 · 696d2c6 · 696d2c6
2 parents 59ad812 4015687
commit 696d2c6
Show file tree

Hide file tree

Showing 404 changed files with 3,166 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -127,3  127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
 .idea/
diff --git a/coronaryx/__init__.py b/coronaryx/__init__.py
@@ -0,0  1,3 @@
 from coronaryx.data import CoronagraphyScan, VesselBranch
 from coronaryx.plot import plot_scan, plot_branch
 from coronaryx.utils import read_dataset
diff --git a/coronaryx/algorithms/__init__.py b/coronaryx/algorithms/__init__.py
@@ -0,0  1,3 @@
 from coronaryx.algorithms.traverse import traverse_branch_edges, traverse_branch_nodes
 from coronaryx.algorithms.structure import split_into_branches
 from coronaryx.algorithms.extraction import sweep
diff --git a/coronaryx/algorithms/extraction.py b/coronaryx/algorithms/extraction.py
@@ -0,0  1,48 @@
 import numpy as np
 import numpy.typing as npt
 
 import coronaryx.functions as cxf
 from coronaryx.data import VesselBranch
 from coronaryx.algorithms.traverse import traverse_branch_edges
 
 
 # TODO do we really need width? maybe we should consider masks too? do we need to pass the extrapolation argument?
 # TODO rename to sweep_along_branch?
 def sweep(
         branch: VesselBranch,
         matrix: np.ndarray['n, m', float],
         sampling_rate: int = 4,
         width: int = 50,
         return_roi_positives: bool = False
 ) -> np.ndarray['n, width', float] | tuple[np.ndarray['n, width', float], np.ndarray['n', float]]:
 
     sweeps = []
     positives = []
     for v, w in traverse_branch_edges(branch):
         v = np.array(v, dtype=np.float32)
         w = np.array(w, dtype=np.float32)
         # TODO vvv
         # we could normalize this and omit `sampling_rate`
         # maybe sampling_rate is good? maybe we should get another argument - sampling_strategy
         # it will be either 'norm' or 'edge', norm means sampling_rate is times per unit vector,
         # and edge is times per edge, which will likely result in inconsistent distances between samples
         vessel_direction = w - v
         orthonormal_vector = cxf.orthonormal_2d_vector(vessel_direction)  # TODO make direction deterministic? is it now?
 
         multipliers = np.arange(width) - width / 2   0.5
         sweep_vector = multipliers[:, np.newaxis] * orthonormal_vector
 
         for i in range(sampling_rate):
             anchor = v   (vessel_direction * i) / sampling_rate
             sweep_points = anchor   sweep_vector
             sweep_cut = cxf.interp_matrix(matrix, sweep_points[:, 0], sweep_points[:, 1])
 
             sweeps.append(sweep_cut)
             positives.append(branch.scan.in_any_roi(anchor))
 
     if return_roi_positives:
         return np.array(sweeps), np.array(positives)
     return np.array(sweeps)
 
 
 # TODO define alias functions for unraveling, width calculation and so on? should we?
diff --git a/coronaryx/algorithms/structure.py b/coronaryx/algorithms/structure.py
@@ -0,0  1,27 @@
 import networkx as nx
 
 from coronaryx.data import CoronagraphyScan, VesselBranch
 
 
 def split_into_branches(scan: CoronagraphyScan) -> list[VesselBranch]:
     branches = []
     branching_nodes = []
 
     G: nx.Graph = scan.centerline.copy()
     for node in G.nodes:
         if len(G[node]) >= 3:
             branching_nodes.append(node)
 
     G.remove_nodes_from(branching_nodes)
 
     for component in nx.connected_components(G):
         subgraph = G.subgraph(component).copy()
         branches.append(subgraph)
 
         # adding the branching nodes to each branch, if they were connected
         for branching_node in branching_nodes:
             for bn_neighbor in scan.centerline[branching_node]:
                 if bn_neighbor in component:
                     subgraph.add_edge(branching_node, bn_neighbor)
 
     return [VesselBranch(scan, branch) for branch in branches]
diff --git a/coronaryx/algorithms/traverse.py b/coronaryx/algorithms/traverse.py
@@ -0,0  1,15 @@
 from typing import Any
 
 import networkx as nx
 
 from coronaryx.data import VesselBranch
 
 
 def traverse_branch_nodes(branch: VesselBranch) -> list[Any]:
     start_node, end_node = branch.boundary_nodes
     return list(nx.dfs_preorder_nodes(branch.branch, source=start_node))
 
 
 def traverse_branch_edges(branch: VesselBranch) -> list[tuple[Any, Any]]:
     nodes = traverse_branch_nodes(branch)
     return list(zip(nodes[:-1], nodes[1:]))
diff --git a/coronaryx/data.py b/coronaryx/data.py
@@ -0,0  1,46 @@
 from dataclasses import dataclass
 from typing import Any
 
 import numpy as np
 import numpy.typing as npt
 import networkx as nx
 
 
 @dataclass
 class ROI:
     start_x: float
     start_y: float
     end_x: float
     end_y: float
 
     def __contains__(self, item: tuple[int, int] | np.ndarray['2', float]) -> bool:
         y, x = item
         return self.start_x <= x < self.end_x and self.start_y <= y < self.end_y
 
 
 @dataclass
 class CoronagraphyScan:
     name: str
     scan: np.ndarray['scan_height, scan_width', np.uint8]
     vessel_mask: np.ndarray['scan_height, scan_width', bool]
     centerline: nx.Graph
     rois: list[ROI]
 
     def in_any_roi(self, item: tuple[int, int] | np.ndarray['2', float]) -> bool:
         return any([item in roi for roi in self.rois])
 
 
 @dataclass
 class VesselBranch:
     scan: CoronagraphyScan
     branch: nx.Graph
 
     def __post_init__(self):
         # TODO some kind of deterministic sorting? preferably from the vessel start to the end
         self._boundary_nodes: tuple[Any, Any] = \
             tuple([node for node in self.branch.nodes if len(self.branch[node]) == 1])
         assert len(self._boundary_nodes) == 2
 
     @property
     def boundary_nodes(self) -> tuple[Any, Any]:
         return self._boundary_nodes
diff --git a/coronaryx/functions.py b/coronaryx/functions.py
@@ -0,0  1,58 @@
 import numpy as np
 import numpy.typing as npt
 from scipy.interpolate import interp2d
 
 
 def interp_matrix(
         matrix: np.ndarray,
         xs: float | np.ndarray['n', float],
         ys: float | np.ndarray['n', float],
         kind: str = 'linear'
 ) -> float | np.ndarray:
     """
     Interpolates matrix values and allows using float indices.
 
     :param matrix: 2d numpy ndarray
     :param xs: float index or a vector of float indices (first dimension)
     :param ys: float index or a vector of float indices (second dimension)
     :param kind: interpolation method: {'linear', 'cubic', 'quintic'}
     :return: interpolated float value or a vector of interpolated float values
     """
 
     if matrix.ndim != 2:
         raise ValueError('unable to process non-2d arrays')
 
     xs = np.array(xs)
     ys = np.array(ys)
 
     if xs.shape != ys.shape:
         raise ValueError('xs and ys must have the same shape')
 
     x_idxs = np.arange(matrix.shape[0])
     y_idxs = np.arange(matrix.shape[1])
 
     # scipy considers x, y in reverse
     interp = interp2d(y_idxs, x_idxs, matrix, kind=kind)
 
     if xs.shape == tuple():
         return interp(ys, xs)
 
     interpolated = []
     for x, y in zip(xs, ys):
         interpolated.append(interp(y, x))
 
     # FIXME vectorized way sorts the input, and I don't know how to map it to the original
     # interpolated = np.diag(interp(ys, xs)).reshape(xs.shape)
 
     return np.array(interpolated).reshape(xs.shape)
 
 
 def orthonormal_2d_vector(v: np.ndarray['2', float]) -> np.ndarray['2', float]:
     """
     Computes a normalized orthogonal (orthonormal) vector to the specified in the 2d space
     :param v: input vector
     :return: orthonormal vector to `v`
     """
 
     ov = np.array([v[1], -v[0]], dtype=v.dtype)
     return ov / np.linalg.norm(ov)
diff --git a/coronaryx/plot.py b/coronaryx/plot.py
@@ -0,0  1,56 @@
 from copy import deepcopy
 
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 
 from coronaryx.data import CoronagraphyScan, VesselBranch
 
 
 def plot_dataset(dataset: list[CoronagraphyScan], figsize: tuple[int, int] = (8, 8)) -> None:
     n_items = len(dataset)
     n_rows = 10
     n_cols = n_items // 10   1
     fig, axes = plt.subplots(n_rows, n_cols)
 
     i = 0
     for axes_row in axes:
         for ax in axes_row:
             if i < n_items:
                 ax.imshow(dataset[i].scan, cmap='gray', extent=None)
             else:
                 ax.imshow(np.zeros_like(dataset[0].scan), cmap='gray', extent=None)
             ax.axis("off")
             i  = 1
 
     fig.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.05, hspace=0.05)
     fig.set_size_inches(*figsize)
     plt.show()
 
 
 def plot_scan(scan: CoronagraphyScan):
     fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 8))
 
     ax1.imshow(scan.scan, cmap='gray')
     ax2.imshow(scan.vessel_mask, cmap='gray')
 
     nodes = np.array(scan.centerline.nodes)
     ax2.scatter(nodes[:, 1], nodes[:, 0], s=10, c='blue')
 
     for (x1, y1), (x2, y2) in scan.centerline.edges:
         ax2.plot([y1, y2], [x1, x2], c='pink')
 
     for roi in scan.rois:
         rect = patches.Rectangle(
             (roi.start_x, roi.start_y), roi.end_x - roi.start_x, roi.end_y - roi.start_y,
             linewidth=2, edgecolor='r', facecolor='none'
         )
         ax2.add_patch(rect)
 
     plt.show()
 
 
 def plot_branch(branch: VesselBranch):
     scan_copy = deepcopy(branch.scan)
     scan_copy.centerline = branch.branch
     plot_scan(scan_copy)
diff --git a/coronaryx/utils.py b/coronaryx/utils.py
@@ -0,0  1,48 @@
 from pathlib import Path
 import pickle
 
 import numpy as np
 from PIL import Image
 
 from coronaryx.data import CoronagraphyScan, ROI
 
 
 def read_dataset(dataset_dir: str):
     dataset_dir = Path(dataset_dir)
 
     items = []
     for item_dir in dataset_dir.iterdir():
         if not item_dir.is_dir() and not item_dir.name.startswith('seg'):
             continue
 
         # scan
         with open(item_dir / 'representative_frame.p', 'rb') as f:
             scan = pickle.load(f)
 
         # centerline
         with open(item_dir / 'centreline.p', 'rb') as f:
             centerline = pickle.load(f)
 
         # vessel mask
         mask_img = Image.open(item_dir / 'segmentation.png')
         mask = np.array(mask_img)[:, :, 0] == 253
 
         # roi
         with open(item_dir / 'roi.p', 'rb') as f:
             roi_list = pickle.load(f)
 
         rois = []
         for d in roi_list:
             roi = ROI(d['start']['x'], d['start']['y'], d['end']['x'], d['end']['y'])
             rois.append(roi)
 
         # adding
         item = CoronagraphyScan(
             item_dir.name,
             scan,
             mask,
             centerline,
             rois
         )
         items.append(item)
     return items
diff --git a/data/data_desc.md b/data/data_desc.md
@@ -0,0  1,53 @@
 # File desciption
 
 Each directory contains two files:
 1. centreline.p - Pickle file containing the graph representation of the vessel.
 2. representative_frame.p - Pickle file containing representative frame.
 3. roi.py - Pickle file containing lesion markings.
 4. segmentation.png - PNG file containing binary vessel segmentation mask.
 
 # Centreline file
 
 In order to read the Pickle file, use the built-in pickle library in Python. You can use the following code:
 ```Python
 import pickle
 
 with open('centreline.p', 'rb') as file:
     vessel_graph = pickle.load(file)
 ```
 This will get you a NetworkX ([another Python library for graph manipulation](https://networkx.org/)) graph object. The graph's nodes are points on the vessel centreline. To obtain the centreline, you will have to connect the dots :). You can do this simply and less accurately - by drawing straight lines between the nodes - or by employing some kind of an interpolation method.
 
 The graphs were generated automatically. They were also automatically cleaned of small branches and detached segments. They can contain some erroneous nodes and edges. If you encounter any issues, please report them back. This will help me develop a better version of the algorithm.
 
 # Representative frame
 Similarly to the centreline file, this one also has to be opened via the pickle library. It contains a numpy array with the frame on top of which the vessel segmentation was performed. Read the file the same way you read the centreline file.
 
 # Stenosis marking
 Again, a pickle file, containing a Python array with stenosis markings. Each array element is a region of interest designated by a start (upper left) and end point (lower right). It may contain additional attributes under the *form* key. These attributes are used for the calculation of [SyntaxScore](https://www.syntaxscore.org/). If the ROI array is empty, then there was no ROI marked. An example array:
 ```JSON
 [ {
     "start" : {
     "x" : 158.89999389648438,
     "y" : 122.59999084472656
     },
     "end" : {
     "x" : 214.89999389648438,
     "y" : 170.59999084472656
     },
     "form" : {
     "bifurcation" : true,
     "bifurcationType" : "B",
     "bifurcationAngle" : "LT70"
     },
     "thumbnail" : "..."
 } ]
 ```
 
 # Segmentation file
 As mentioned above, this is a PNG file that contains a binary mask. This is the segmentation of the vessel structure. You can use matplotlib to read it into an numpy array:
 ```Python
 import maplotlib.pyplot as plt
 
 segmentation = plt.imread('segmentation.png')
 ```
 The segmentations all have dimensions of 512x512 pixels and were hand-drawn by medical specialists. **It is important to note that multiple MDs might have prepared the segmentation of a single vessel file!** There is a slight chance that some masks were used for testing and do not properly depict the vessel structure. However, I did my best to eliminate them (hence the missing numbers in the 'seg' folders). I hope the data works for you!
diff --git a/data/seg0/centreline.p b/data/seg0/centreline.p
diff --git a/data/seg0/representative_frame.p b/data/seg0/representative_frame.p
diff --git a/data/seg0/roi.p b/data/seg0/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg0/segmentation.png b/data/seg0/segmentation.png
diff --git a/data/seg1/centreline.p b/data/seg1/centreline.p
diff --git a/data/seg1/representative_frame.p b/data/seg1/representative_frame.p
diff --git a/data/seg1/roi.p b/data/seg1/roi.p
diff --git a/data/seg1/segmentation.png b/data/seg1/segmentation.png
diff --git a/data/seg100/centreline.p b/data/seg100/centreline.p
diff --git a/data/seg100/representative_frame.p b/data/seg100/representative_frame.p
diff --git a/data/seg100/roi.p b/data/seg100/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg100/segmentation.png b/data/seg100/segmentation.png
diff --git a/data/seg101/centreline.p b/data/seg101/centreline.p
diff --git a/data/seg101/representative_frame.p b/data/seg101/representative_frame.p
diff --git a/data/seg101/roi.p b/data/seg101/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg101/segmentation.png b/data/seg101/segmentation.png
diff --git a/data/seg102/centreline.p b/data/seg102/centreline.p
diff --git a/data/seg102/representative_frame.p b/data/seg102/representative_frame.p
diff --git a/data/seg102/roi.p b/data/seg102/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg102/segmentation.png b/data/seg102/segmentation.png
diff --git a/data/seg103/centreline.p b/data/seg103/centreline.p
diff --git a/data/seg103/representative_frame.p b/data/seg103/representative_frame.p
diff --git a/data/seg103/roi.p b/data/seg103/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg103/segmentation.png b/data/seg103/segmentation.png
diff --git a/data/seg104/centreline.p b/data/seg104/centreline.p
diff --git a/data/seg104/representative_frame.p b/data/seg104/representative_frame.p
diff --git a/data/seg104/roi.p b/data/seg104/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg104/segmentation.png b/data/seg104/segmentation.png
diff --git a/data/seg105/centreline.p b/data/seg105/centreline.p
diff --git a/data/seg105/representative_frame.p b/data/seg105/representative_frame.p
diff --git a/data/seg105/roi.p b/data/seg105/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg105/segmentation.png b/data/seg105/segmentation.png
diff --git a/data/seg106/centreline.p b/data/seg106/centreline.p
diff --git a/data/seg106/representative_frame.p b/data/seg106/representative_frame.p
diff --git a/data/seg106/roi.p b/data/seg106/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg106/segmentation.png b/data/seg106/segmentation.png
diff --git a/data/seg107/centreline.p b/data/seg107/centreline.p
diff --git a/data/seg107/representative_frame.p b/data/seg107/representative_frame.p
diff --git a/data/seg107/roi.p b/data/seg107/roi.p
@@ -0,0  1 @@
 �]�.
diff --git a/data/seg107/segmentation.png b/data/seg107/segmentation.png
diff --git a/data/seg108/centreline.p b/data/seg108/centreline.p
diff --git a/data/seg108/representative_frame.p b/data/seg108/representative_frame.p
Original file line number	Diff line number	Diff line change
Expand Up		@@ -127,3 127,4 @@ dmypy.json

		# Pyre type checker
		.pyre/
		.idea/