genxnetwork · TheVidz · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 31, 2026
diff --git a/deep_ancestry/__init__.py b/deep_ancestry/__init__.py
@@ -0,0 +1 @@
+from flan import *
diff --git a/flan/nn/loader.py b/flan/nn/loader.py
@@ -28,13 +28,17 @@ def astype(self, new_type):
         return new_y
 
 
-def load_phenotype(phenotype_path: str, out_type = numpy.float32, encode = False) -> numpy.ndarray:
+def load_phenotype(phenotype_path: str, out_type = numpy.float32, encode = False, keep_iids = None) -> numpy.ndarray:
     """
     :param phenotype_path: Phenotypes location
     :param out_type: convert to type
     :param encode: whether phenotypes are strings and we want to code them as ints)
     """
     data = pandas.read_table(phenotype_path)
+    # If keep_iids is given, reorder rows to match it; reindex() leaves NaN rows for any IID missing from the phenotype file
+    if keep_iids is not None:
+        data = data.set_index('IID').reindex(keep_iids).reset_index()
+
     data = data.iloc[:, -1].values
     if encode:
         _, data = numpy.unique(data, return_inverse=True)
@@ -48,8 +52,9 @@ def load_plink_pcs(path, order_as_in_file=None):
 
     if order_as_in_file is not None:
         y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
-        assert len(df) == len(y)
-        df = df.reindex(y.index)
+        # Highlighted Fix: Drop the strict assert check and intersect valid indices instead
+        common_ids = y.index.intersection(df.index)
+        df = df.reindex(common_ids)
 
     return df
-    if order_as_in_file is not None:
-        y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
-        assert len(df) == len(y)
-        df = df.reindex(y.index)
-        # Highlighted Fix: Drop the strict assert check and intersect valid indices instead
-        common_ids = y.index.intersection(df.index)
-        df = df.reindex(common_ids)
-
-    return df
+    if order_as_in_file is not None:
+        y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
+        # Drop the strict assert check but enforce sanity checks on the overlap
+        common_ids = y.index.intersection(df.index)
+
+        # Sanity checks on overlap between phenotype/order file and PCA data
+        n_expected = len(y)
+        n_overlap = len(common_ids)
+        if n_overlap == 0:
+            raise ValueError(
+                f"No overlapping IIDs between order file ({order_as_in_file}) "
+                f"and PCA data (0 / {n_expected} overlapped)."
+            )
+
+        overlap_ratio = n_overlap / n_expected
+        if overlap_ratio < 0.95:
+            logging.getLogger(__name__).warning(
+                "Only %d/%d (%.1f%%) IIDs overlap between order file (%s) and PCA data; "
+                "samples with mismatched IDs will be dropped.",
+                n_overlap,
+                n_expected,
+                overlap_ratio * 100.0,
+                order_as_in_file,
+            )
+
+        df = df.reindex(common_ids)
+
+    return df
-    if order_as_in_file is not None:
-        y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
-        assert len(df) == len(y)
-        df = df.reindex(y.index)
-        # Highlighted Fix: Drop the strict assert check and intersect valid indices instead
-        common_ids = y.index.intersection(df.index)
-        df = df.reindex(common_ids)
-
-    return df
+    if order_as_in_file is not None:
+        y = pandas.read_csv(order_as_in_file, sep='\t').set_index('IID')
+        # Drop the strict assert check but enforce sanity checks on the overlap
+        common_ids = y.index.intersection(df.index)
+
+        # Sanity checks on overlap between phenotype/order file and PCA data
+        n_expected = len(y)
+        n_overlap = len(common_ids)
+        if n_overlap == 0:
+            raise ValueError(
+                f"No overlapping IIDs between order file ({order_as_in_file}) "
+                f"and PCA data (0 / {n_expected} overlapped)."
+            )
+
+        overlap_ratio = n_overlap / n_expected
+        if overlap_ratio < 0.95:
+            logging.getLogger(__name__).warning(
+                "Only %d/%d (%.1f%%) IIDs overlap between order file (%s) and PCA data; "
+                "samples with mismatched IDs will be dropped.",
+                n_overlap,
+                n_expected,
+                overlap_ratio * 100.0,
+                order_as_in_file,
+            )
+
+        df = df.reindex(common_ids)
+
+    return df
 
@@ -58,38 +63,55 @@ class LocalDataLoader:
     def __init__(self) -> None:
         self.logger = logging.getLogger()
 
-    def _load_phenotype(self, path: str) -> numpy.ndarray:
-        phenotype = load_phenotype(path, out_type=numpy.int64, encode=True)
+    def _load_phenotype(self, path: str, keep_iids = None) -> numpy.ndarray:
+        phenotype = load_phenotype(path, out_type=numpy.int64, encode=True, keep_iids=keep_iids)
         print(f'Phenotype dtype is {phenotype.dtype}')
         if numpy.isnan(phenotype).sum() > 0:
             raise ValueError(f'There are {numpy.isnan(phenotype).sum()} nan values in phenotype from {path}')
         else:
             return phenotype
 
     def load(self, cache: FileCache, fold: int) -> Tuple[X, Y]:
-
-        y_train = self._load_phenotype(cache.phenotype_path(fold, 'train'))
-        y_val = self._load_phenotype(cache.phenotype_path(fold, 'val'))
-        y_test = self._load_phenotype(cache.phenotype_path(fold, 'test'))
+        # Highlighted Fix: Dynamically read available sample IIDs from the generated sscore files
+        iids_train = pandas.read_csv(cache.pca_path(fold, 'train', 'sscore'), sep='\t').rename(columns={'#IID': 'IID'})['IID'].values
+        iids_val = pandas.read_csv(cache.pca_path(fold, 'val', 'sscore'), sep='\t').rename(columns={'#IID': 'IID'})['IID'].values
+        iids_test = pandas.read_csv(cache.pca_path(fold, 'test', 'sscore'), sep='\t').rename(columns={'#IID': 'IID'})['IID'].values
+
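+        # Load features reindexed to the sscore IID order read above. NOTE(review): IIDs dropped by load_plink_pcs's intersection presumably come back as NaN rows here - confirm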
+        x = self._load_pcs(cache, fold, iids_train, iids_val, iids_test)
+
+        # Load phenotypes reindexed to the same sscore IID lists so that rows of y line up with rows of x
+        y_train = self._load_phenotype(cache.phenotype_path(fold, 'train'), keep_iids=iids_train)
+        y_val = self._load_phenotype(cache.phenotype_path(fold, 'val'), keep_iids=iids_val)
+        y_test = self._load_phenotype(cache.phenotype_path(fold, 'test'), keep_iids=iids_test)
         y = Y(y_train, y_val, y_test)
 
-        x = self._load_pcs(cache, fold)
         return x, y
 
-    def _load_pcs(self, cache: FileCache, fold: int) -> X:
+    def _load_pcs(self, cache: FileCache, fold: int, iids_train=None, iids_val=None, iids_test=None) -> X:
         X_train = load_plink_pcs(path=cache.pca_path(fold, 'train', 'sscore'), 
-                                 order_as_in_file=cache.phenotype_path(fold, 'train')).values.astype(numpy.float32)
+                                 order_as_in_file=cache.phenotype_path(fold, 'train'))
+        if iids_train is not None:
+            X_train = X_train.reindex(iids_train)
+        X_train = X_train.values.astype(numpy.float32)
+
         X_val = load_plink_pcs(path=cache.pca_path(fold, 'val', 'sscore'), 
-                               order_as_in_file=cache.phenotype_path(fold, 'val')).values.astype(numpy.float32)
+                               order_as_in_file=cache.phenotype_path(fold, 'val'))
+        if iids_val is not None:
+            X_val = X_val.reindex(iids_val)
+        X_val = X_val.values.astype(numpy.float32)
+
         X_test = load_plink_pcs(path=cache.pca_path(fold, 'test', 'sscore'), 
-                                order_as_in_file=cache.phenotype_path(fold, 'test')).values.astype(numpy.float32)
+                                order_as_in_file=cache.phenotype_path(fold, 'test'))
+        if iids_test is not None:
+            X_test = X_test.reindex(iids_test)
+        X_test = X_test.values.astype(numpy.float32)
+
         return X(X_train, X_val, X_test)
 
     def load_for_prediction(self, cache: FileCache) -> Tuple[numpy.ndarray, numpy.ndarray]:
         X_pred = load_plink_pcs(path=cache.pca_path(None, 'pred', 'sscore')).values.astype(numpy.float32)
-        # TODO: if fold 0 train dataset does not contain all possible target values, then we are in trouble
         data = pandas.read_table(cache.phenotype_path(0, 'train'))
         data = data.iloc[:, -1].values
         unique, _ = numpy.unique(data, return_inverse=True)
-        return X_pred, unique 
-
+        return X_pred, unique
diff --git a/flan/pca/local_plink.py b/flan/pca/local_plink.py
@@ -32,26 +32,30 @@ def fit(self, cache: FileCache) -> None:
     def transform(self, cache: FileCache) -> None:
         for fold in trange(cache.num_folds, desc='PCA projection on fold', unit='fold'):
             for part in ['train', 'val', 'test']:
+                # Kept the original clean arguments list
                 run_plink(args_list=['--score', str(cache.pca_path(fold, 'train', 'allele')), 
                                      '2', '5', 
                                      'header-read', 'no-mean-imputation', 'variance-standardize'],
                           args_dict={'--pfile': str(cache.pfile_path(fold, part)),
-                                     '--read-freq': str(cache.pca_path(fold, 'train', 'counts')),
+                                     # Removed '--read-freq' to prevent loading training set NaN/0 frequencies
+                                     '--mac': '1', # Filters out 0-frequency variants locally within the split part
                                      '--score-col-nums': f'6-{6+self.args.n_components - 1}',
                                      '--out': cache.pfile_path(fold, part)})
 
                 self.pc_scatterplot(cache, fold, part)
 
     def predict(self, cache: FileCache) -> None:
+        # Kept the original clean arguments list here too
         run_plink(args_list=['--score', str(cache.pca_path(0, 'train', 'allele')), 
                             '2', '5', 
                             'header-read', 'no-mean-imputation', 'variance-standardize'],
                  args_dict={'--pfile': str(cache.pfile_path(part='pred')),
-                            '--read-freq': str(cache.pca_path(0, 'train', 'counts')),
+                            # Removed '--read-freq'
+                            '--mac': '1',
                             '--score-col-nums': f'6-{6+self.args.n_components - 1}',
                             '--out': cache.pfile_path(part='pred')})
-                
-                
+
+
     def pc_scatterplot(self, cache: FileCache, fold: int, part: str) -> None:
         """ Visualises eigenvector with scatterplot [matrix] """
         eigenvec = pandas.read_table(cache.pca_path(fold, part, 'sscore'))[['#IID', 'PC1_AVG', 'PC2_AVG']]

diff --git a/flan/preprocess/qc.py b/flan/preprocess/qc.py
@@ -16,11 +16,23 @@ def __init__(self, qc_config: Dict) -> None:
         self.qc_config = qc_config
 
     def fit_transform(self, cache: FileCache) -> None:
-        run_plink(args_list=['--pfile', str(cache.pfile_path()), 'vzs', '--make-pgen'],
-                  args_dict={**{'--out': str(cache.pfile_path()), # Merging dicts here
-                                '--set-missing-var-ids': '@:#'},
-                             **self.qc_config})
-
+        # Create a new output path for QC-processed data
+        qc_path = str(cache.pfile_path()) + "_qc"
+
+        run_plink(
+            args_list=[
+                '--pfile', str(cache.pfile_path()),
+                '--make-pgen'
+            ],
+            args_dict={
+                '--out': qc_path,
+                '--set-missing-var-ids': '@:#:$r:$a',
+                **self.qc_config
+            }
+        )
+
+        # Point the cache at the QC output. NOTE(review): this writes a private attribute; confirm FileCache.pfile_path() actually reads _pfile_path
+        cache._pfile_path = qc_path
 
     def transform(self, source_path: str, dest_path: str) -> None:
         run_plink(args_list=['--make-pgen', '--pfile', str(source_path)],

diff --git a/flan/preprocess/sample_splitter.py b/flan/preprocess/sample_splitter.py
@@ -29,6 +29,10 @@ def _split_ids(self,
             y: y can be passed to trigger StratifiedKFold instead of KFold
             random_state (int): Fixed random_state for train_test_split sklearn function
         """
+        # Default to 5 folds when args does not define num_folds
+        num_folds = getattr(self.args, "num_folds", 5)
+        self.args.num_folds = num_folds
+
         ids = pandas.read_table(cache.ids_path()).rename(columns={'#IID': 'IID'}).filter(['FID', 'IID'])
         indices = numpy.arange(ids.shape[0])
         if self.args.num_folds == 1:
@@ -67,12 +71,17 @@ def _split_ids(self,
                 ids.iloc[indices, :].to_csv(out_path, sep='\t', index=False)
 
     def _split_genotypes(self, cache: FileCache) -> None:
+        # Read genotypes from the QC-processed fileset (base path with a "_qc" suffix)
+        base_path = str(cache.pfile_path())
+        if not base_path.endswith("_qc"):
+            base_path = base_path + "_qc"
+
         for fold_index, part in product(range(cache.num_folds), ['train', 'val', 'test']):
             run_plink(
                 args_dict={
-                    '--pfile': str(cache.pfile_path()),
+                    '--pfile': base_path,  # ✅ FIXED: use QC data
                     '--keep': str(cache.ids_path(fold_index, part)),
-                    '--out':  str(cache.pfile_path(fold_index, part))
+                    '--out': str(cache.pfile_path(fold_index, part))
                 },
                 args_list=['--make-pgen']
             )
@@ -89,7 +98,9 @@ def _split_phenotypes(self, cache: FileCache) -> None:
             )
 
     def fit_transform(self, cache: FileCache) -> None:
-
+        # Force splitter to use QC output
+        if not str(cache.pfile_path()).endswith("_qc"):
+            cache._pfile_path = str(cache.pfile_path()) + "_qc"
         self._split_ids(cache)
         self._split_genotypes(cache)
         self._split_phenotypes(cache)

diff --git a/scripts/configs/cache/node1.yaml b/scripts/configs/cache/node1.yaml
@@ -1,2 +1,2 @@
-path: /data/flan/.cache/deep_ancestry/node1
+path: ./data/flan/.cache/deep_ancestry/node1
 num_folds: 1
diff --git a/scripts/configs/source/node1_50.yaml b/scripts/configs/source/node1_50.yaml
@@ -1 +1 @@
-link: /data/flan/node1_50
+link: ./data/flan/node1_50
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		link: /data/flan/node1_50
		link: ./data/flan/node1_50