Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions deep_ancestry/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from flan import *
56 changes: 39 additions & 17 deletions flan/nn/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@ def astype(self, new_type):
return new_y


def load_phenotype(phenotype_path: str, out_type = numpy.float32, encode = False) -> numpy.ndarray:
def load_phenotype(phenotype_path: str, out_type = numpy.float32, encode = False, keep_iids = None) -> numpy.ndarray:
"""
:param phenotype_path: Phenotypes location
:param out_type: convert to type
:param encode: whether phenotypes are strings and we want to code them as ints)
"""
data = pandas.read_table(phenotype_path)
# Highlighted Fix: If a list of aligned IIDs is provided, filter and order by them
if keep_iids is not None:
data = data.set_index('IID').reindex(keep_iids).reset_index()
Comment on lines +31 to +40

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (bug_risk): NaNs introduced by keep_iids are silently encoded into integers, hiding missing phenotypes.

When keep_iids is used, set_index(...).reindex(keep_iids).reset_index() will produce NaNs for IIDs present in keep_iids but missing from the phenotype file. Because numpy.unique(..., return_inverse=True) runs before any NaN check, those NaNs are treated as a category and converted to integers, so a later numpy.isnan(phenotype) check can never detect them. Please either (a) validate for NaNs immediately after the reindex and fail if any are found, or (b) perform the NaN check on the unencoded data before calling numpy.unique.


data = data.iloc[:, -1].values
if encode:
_, data = numpy.unique(data, return_inverse=True)
Expand All @@ -48,8 +52,9 @@ def load_plink_pcs(path, order_as_in_file=None):

if order_as_in_file is not None:
y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
assert len(df) == len(y)
df = df.reindex(y.index)
# Highlighted Fix: Drop the strict assert check and intersect valid indices instead
common_ids = y.index.intersection(df.index)
df = df.reindex(common_ids)

return df
Comment on lines 53 to 59

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): Silently dropping non-common IDs in load_plink_pcs can hide serious data issues.

Using common_ids = y.index.intersection(df.index) and reindexing means mismatched phenotype/PCA IDs now result in silently dropped samples. If overlap is very small (or zero), downstream code may unknowingly run on an unrepresentative or empty dataset. Please add a sanity check (e.g., ensure len(common_ids) > 0 and that the overlap is close to len(y)), and raise or log if the overlap is unexpectedly small.

Suggested change
if order_as_in_file is not None:
y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
assert len(df) == len(y)
df = df.reindex(y.index)
# Highlighted Fix: Drop the strict assert check and intersect valid indices instead
common_ids = y.index.intersection(df.index)
df = df.reindex(common_ids)
return df
if order_as_in_file is not None:
y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
# Drop the strict assert check but enforce sanity checks on the overlap
common_ids = y.index.intersection(df.index)
# Sanity checks on overlap between phenotype/order file and PCA data
n_expected = len(y)
n_overlap = len(common_ids)
if n_overlap == 0:
raise ValueError(
f"No overlapping IIDs between order file ({order_as_in_file}) "
f"and PCA data (0 / {n_expected} overlapped)."
)
overlap_ratio = n_overlap / n_expected
if overlap_ratio < 0.95:
logging.getLogger(__name__).warning(
"Only %d/%d (%.1f%%) IIDs overlap between order file (%s) and PCA data; "
"samples with mismatched IDs will be dropped.",
n_overlap,
n_expected,
overlap_ratio * 100.0,
order_as_in_file,
)
df = df.reindex(common_ids)
return df


Expand All @@ -58,38 +63,55 @@ class LocalDataLoader:
def __init__(self) -> None:
self.logger = logging.getLogger()

def _load_phenotype(self, path: str) -> numpy.ndarray:
phenotype = load_phenotype(path, out_type=numpy.int64, encode=True)
def _load_phenotype(self, path: str, keep_iids = None) -> numpy.ndarray:
phenotype = load_phenotype(path, out_type=numpy.int64, encode=True, keep_iids=keep_iids)
print(f'Phenotype dtype is {phenotype.dtype}')
if numpy.isnan(phenotype).sum() > 0:
raise ValueError(f'There are {numpy.isnan(phenotype).sum()} nan values in phenotype from {path}')
else:
return phenotype

def load(self, cache: FileCache, fold: int) -> Tuple[X, Y]:

y_train = self._load_phenotype(cache.phenotype_path(fold, 'train'))
y_val = self._load_phenotype(cache.phenotype_path(fold, 'val'))
y_test = self._load_phenotype(cache.phenotype_path(fold, 'test'))
# Highlighted Fix: Dynamically read available sample IIDs from the generated sscore files
iids_train = pandas.read_csv(cache.pca_path(fold, 'train', 'sscore'), sep='\t').rename(columns={'#IID': 'IID'})['IID'].values
iids_val = pandas.read_csv(cache.pca_path(fold, 'val', 'sscore'), sep='\t').rename(columns={'#IID': 'IID'})['IID'].values
iids_test = pandas.read_csv(cache.pca_path(fold, 'test', 'sscore'), sep='\t').rename(columns={'#IID': 'IID'})['IID'].values

# Load features matching the safe intersections
x = self._load_pcs(cache, fold, iids_train, iids_val, iids_test)

# Load phenotypes safely aligned with those exact feature IDs
y_train = self._load_phenotype(cache.phenotype_path(fold, 'train'), keep_iids=iids_train)
y_val = self._load_phenotype(cache.phenotype_path(fold, 'val'), keep_iids=iids_val)
y_test = self._load_phenotype(cache.phenotype_path(fold, 'test'), keep_iids=iids_test)
y = Y(y_train, y_val, y_test)

x = self._load_pcs(cache, fold)
return x, y

def _load_pcs(self, cache: FileCache, fold: int) -> X:
def _load_pcs(self, cache: FileCache, fold: int, iids_train=None, iids_val=None, iids_test=None) -> X:
X_train = load_plink_pcs(path=cache.pca_path(fold, 'train', 'sscore'),
order_as_in_file=cache.phenotype_path(fold, 'train')).values.astype(numpy.float32)
order_as_in_file=cache.phenotype_path(fold, 'train'))
if iids_train is not None:
X_train = X_train.reindex(iids_train)
X_train = X_train.values.astype(numpy.float32)

X_val = load_plink_pcs(path=cache.pca_path(fold, 'val', 'sscore'),
order_as_in_file=cache.phenotype_path(fold, 'val')).values.astype(numpy.float32)
order_as_in_file=cache.phenotype_path(fold, 'val'))
if iids_val is not None:
Comment on lines +91 to +100

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (bug_risk): Reindexing PCs with IIDs can create all-NaN rows when PCA IIDs and phenotype IIDs differ.

In _load_pcs, load_plink_pcs already intersects PCA sscore IIDs with the phenotype IIDs from order_as_in_file. When you then reindex on iids_train / iids_val / iids_test, any IIDs present only in the sscore file will be reintroduced as all-NaN rows. These NaNs persist through .values.astype(numpy.float32), so you can end up with feature rows that are empty or misaligned with your phenotype filtering. Consider intersecting iids_* with the DataFrame index before reindexing, or at least asserting that the number of non-NaN rows matches expectations.

X_val = X_val.reindex(iids_val)
X_val = X_val.values.astype(numpy.float32)

X_test = load_plink_pcs(path=cache.pca_path(fold, 'test', 'sscore'),
order_as_in_file=cache.phenotype_path(fold, 'test')).values.astype(numpy.float32)
order_as_in_file=cache.phenotype_path(fold, 'test'))
if iids_test is not None:
X_test = X_test.reindex(iids_test)
X_test = X_test.values.astype(numpy.float32)

return X(X_train, X_val, X_test)

def load_for_prediction(self, cache: FileCache) -> Tuple[numpy.ndarray, numpy.ndarray]:
X_pred = load_plink_pcs(path=cache.pca_path(None, 'pred', 'sscore')).values.astype(numpy.float32)
# TODO: if fold 0 train dataset does not contain all possible target values, then we are in trouble
data = pandas.read_table(cache.phenotype_path(0, 'train'))
data = data.iloc[:, -1].values
unique, _ = numpy.unique(data, return_inverse=True)
return X_pred, unique

return X_pred, unique
12 changes: 8 additions & 4 deletions flan/pca/local_plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,30 @@ def fit(self, cache: FileCache) -> None:
def transform(self, cache: FileCache) -> None:
for fold in trange(cache.num_folds, desc='PCA projection on fold', unit='fold'):
for part in ['train', 'val', 'test']:
# Kept the original clean arguments list
run_plink(args_list=['--score', str(cache.pca_path(fold, 'train', 'allele')),
'2', '5',
'header-read', 'no-mean-imputation', 'variance-standardize'],
args_dict={'--pfile': str(cache.pfile_path(fold, part)),
'--read-freq': str(cache.pca_path(fold, 'train', 'counts')),
# Removed '--read-freq' to prevent loading training set NaN/0 frequencies
'--mac': '1', # Filters out 0-frequency variants locally within the split part
'--score-col-nums': f'6-{6+self.args.n_components - 1}',
'--out': cache.pfile_path(fold, part)})

self.pc_scatterplot(cache, fold, part)

def predict(self, cache: FileCache) -> None:
# Kept the original clean arguments list here too
run_plink(args_list=['--score', str(cache.pca_path(0, 'train', 'allele')),
'2', '5',
'header-read', 'no-mean-imputation', 'variance-standardize'],
args_dict={'--pfile': str(cache.pfile_path(part='pred')),
'--read-freq': str(cache.pca_path(0, 'train', 'counts')),
# Removed '--read-freq'
'--mac': '1',
'--score-col-nums': f'6-{6+self.args.n_components - 1}',
'--out': cache.pfile_path(part='pred')})


def pc_scatterplot(self, cache: FileCache, fold: int, part: str) -> None:
""" Visualises eigenvector with scatterplot [matrix] """
eigenvec = pandas.read_table(cache.pca_path(fold, part, 'sscore'))[['#IID', 'PC1_AVG', 'PC2_AVG']]
Expand Down
22 changes: 17 additions & 5 deletions flan/preprocess/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,23 @@ def __init__(self, qc_config: Dict) -> None:
self.qc_config = qc_config

def fit_transform(self, cache: FileCache) -> None:
run_plink(args_list=['--pfile', str(cache.pfile_path()), 'vzs', '--make-pgen'],
args_dict={**{'--out': str(cache.pfile_path()), # Merging dicts here
'--set-missing-var-ids': '@:#'},
**self.qc_config})

# Create a new output path for QC-processed data
qc_path = str(cache.pfile_path()) + "_qc"

run_plink(
args_list=[
'--pfile', str(cache.pfile_path()),
'--make-pgen'
],
args_dict={
'--out': qc_path,
'--set-missing-var-ids': '@:#:$r:$a',
**self.qc_config
}
)

# ✅ VERY IMPORTANT: update cache to point to QC output
cache._pfile_path = qc_path
Comment on lines +34 to +35

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): Directly mutating cache._pfile_path and using string concatenation may break expectations about pfile_path type and structure.

Assigning qc_path (built as str(cache.pfile_path()) + "_qc") back into cache._pfile_path changes its type from a Path-like object to a plain string. Any downstream code using pfile_path() as a Path (e.g. with / or .with_suffix) may then fail unexpectedly. The same pattern appears in SampleSplitter.fit_transform. Please construct qc_path as a Path (for example via with_name) and update the cache via a public method rather than mutating the private _pfile_path attribute directly.

Suggested implementation:

            args_dict={
                '--out': str(qc_path),
                '--set-missing-var-ids': '@:#:$r:$a',
                **self.qc_config
            }
        )

        # ✅ VERY IMPORTANT: update cache to point to QC output
        cache.set_pfile_path(qc_path)
  1. Ensure qc_path is constructed as a Path-like object rather than via string concatenation. For example, wherever qc_path is currently defined (likely as str(cache.pfile_path()) + "_qc"), replace it with something like:
    • qc_path = cache.pfile_path().with_name(cache.pfile_path().name + "_qc")
      or, if you need to preserve suffixes differently, use an appropriate pathlib.Path method.
  2. Implement the new public method set_pfile_path on the cache object’s class (if it does not already exist). It should:
    • Accept a Path-like object.
    • Store it internally (e.g. in _pfile_path) without changing its type.
    • Maintain any invariants or side effects previously associated with updating _pfile_path.
  3. Update SampleSplitter.fit_transform to follow the same pattern:
    • Construct its QC (or split) path using Path operations, not string concatenation.
    • Use cache.set_pfile_path(...) (or the equivalent public API) instead of mutating cache._pfile_path directly.


def transform(self, source_path: str, dest_path: str) -> None:
run_plink(args_list=['--make-pgen', '--pfile', str(source_path)],
Expand Down
17 changes: 14 additions & 3 deletions flan/preprocess/sample_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ def _split_ids(self,
y: y can be passed to trigger StratifiedKFold instead of KFold
random_state (int): Fixed random_state for train_test_split sklearn function
"""
# adding min 5 folds
num_folds = getattr(self.args, "num_folds", 5)
self.args.num_folds = num_folds
Comment on lines +32 to +34

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (bug_risk): Forcing a minimum of 5 folds on self.args.num_folds can desync from cache.num_folds and the on-disk layout.

Here self.args.num_folds is forced to ≥5, but other code (e.g. _split_genotypes, PCA) iterates over range(cache.num_folds). With configs like scripts/configs/cache/node1.yaml (num_folds: 1), _split_ids will create 5 folds of IDs while only 1 fold of genotypes/artifacts exists. This can break downstream logic for folds > 0. Instead of enforcing a minimum here, keep self.args.num_folds aligned with cache.num_folds (or validate and update them together).


ids = pandas.read_table(cache.ids_path()).rename(columns={'#IID': 'IID'}).filter(['FID', 'IID'])
indices = numpy.arange(ids.shape[0])
if self.args.num_folds == 1:
Expand Down Expand Up @@ -67,12 +71,17 @@ def _split_ids(self,
ids.iloc[indices, :].to_csv(out_path, sep='\t', index=False)

def _split_genotypes(self, cache: FileCache) -> None:
# 🔥 Force use of QC-processed genotype
base_path = str(cache.pfile_path())
if not base_path.endswith("_qc"):
base_path = base_path + "_qc"

for fold_index, part in product(range(cache.num_folds), ['train', 'val', 'test']):
run_plink(
args_dict={
'--pfile': str(cache.pfile_path()),
'--pfile': base_path, # ✅ FIXED: use QC data
'--keep': str(cache.ids_path(fold_index, part)),
'--out': str(cache.pfile_path(fold_index, part))
'--out': str(cache.pfile_path(fold_index, part))
},
args_list=['--make-pgen']
)
Expand All @@ -89,7 +98,9 @@ def _split_phenotypes(self, cache: FileCache) -> None:
)

def fit_transform(self, cache: FileCache) -> None:

# Force splitter to use QC output
if not str(cache.pfile_path()).endswith("_qc"):
cache._pfile_path = str(cache.pfile_path()) + "_qc"
self._split_ids(cache)
self._split_genotypes(cache)
self._split_phenotypes(cache)
Expand Down
2 changes: 1 addition & 1 deletion scripts/configs/cache/node1.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
path: /data/flan/.cache/deep_ancestry/node1
path: ./data/flan/.cache/deep_ancestry/node1
num_folds: 1
2 changes: 1 addition & 1 deletion scripts/configs/source/node1_50.yaml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
link: /data/flan/node1_50
link: ./data/flan/node1_50