From fbd937ed4f0ccfe064c40552daebde8485e16c2f Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Mon, 20 Apr 2026 12:42:47 +0000
Subject: [PATCH 01/22] initial rewrite of merge step from WDL to Polars

---
 mypolars/.python-version |   1 +
 mypolars/README.md       |   0
 mypolars/main.py         | 171 +++++++++++++++++++++++++
 mypolars/pyproject.toml  |  15 +++
 mypolars/uv.lock         | 263 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 450 insertions(+)
 create mode 100644 mypolars/.python-version
 create mode 100644 mypolars/README.md
 create mode 100644 mypolars/main.py
 create mode 100644 mypolars/pyproject.toml
 create mode 100644 mypolars/uv.lock

diff --git a/mypolars/.python-version b/mypolars/.python-version
new file mode 100644
index 0000000..6324d40
--- /dev/null
+++ b/mypolars/.python-version
@@ -0,0 +1 @@
+3.14
diff --git a/mypolars/README.md b/mypolars/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/mypolars/main.py b/mypolars/main.py
new file mode 100644
index 0000000..cd47118
--- /dev/null
+++ b/mypolars/main.py
@@ -0,0 +1,171 @@
+"""
+Merges the incoming Kanta Lab data from THL into one coherent file.
+
+Note: needed ~128GB memory to run on R14 data.
+"""
+import gzip
+from argparse import ArgumentParser
+from itertools import zip_longest
+from pathlib import Path
+
+import polars as pl
+pl.Config.set_verbose(True)  
+
+
+# TODO
+# 4. Validate that shared column match
+# 5. Post WDL sort-dup: subset columns, join SEX, sort, output unique/duplicates/error rows
+
+EXPECTED_COLUMNS_RESPONSES = [
+    "FINNGENID",
+    "EVENT_AGE",
+    "APPROX_EVENT_DAY",
+    "TIME",
+    "asiakirjaoid_pseudo",
+    "merkintaoid_pseudo",
+    "entryoid_pseudo",
+    "load_id_pseudo",
+    "file_name_pseudo",
+    "laboratoriotutkimusoid",
+    "laboratoriotutkimusnimike",
+    "paikallinentutkimusnimike_koodi",
+    "paikallinentutkimusnimike_selite",
+    "tutkimuskoodistonjarjestelma",
+    "tiedonlahde",
+    "tutkimusvastauksentila",
+    "tutkimustulosarvo",
+    "tutkimustulosyksikko",
+    "tutkimuksennaytelaatu",
+    "tutkimuksentekotapa",
+    "tuloksenpoikkeavuus",
+    "viitearvoryhma",
+    "viitevalialkuarvo",
+    "viitevalialkuyksikko",
+    "viitevaliloppuarvo",
+    "viitevaliloppuyksikko",
+]
+
+EXPECTED_COLUMNS_FREETEXT = [
+    "FINNGENID",
+    "EVENT_AGE",
+    "APPROX_EVENT_DAY",
+    "TIME",
+    "asiakirjaoid_pseudo",
+    "merkintaoid_pseudo",
+    "entryoid_pseudo",
+    "load_id_pseudo",
+    "file_name_pseudo",
+    "tutkimustulosteksti",
+]
+
+
+def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]:
+    pairs = []
+    with open(list_file) as fp:
+        for line in fp:
+            values = line.split(separator, maxsplit=2)
+
+            responses = validate_tsv_gz(values[0], list_file.parent)
+            freetext = validate_tsv_gz(values[1], list_file.parent)
+
+            pairs.append((responses, freetext))
+
+    for responses, freetext in pairs:
+        check_columns(responses, EXPECTED_COLUMNS_RESPONSES, "responses")
+        check_columns(freetext, EXPECTED_COLUMNS_FREETEXT, "freetext")
+
+    return pairs
+
+
+def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> None:
+    to_concat = []
+    for path_responses, path_freetext in pairs:
+        print(f"Processing {path_responses} & {path_freetext}")
+
+        df_resp = (
+            pl.scan_csv(path_responses, infer_schema=False, separator="\t")
+            .with_row_index(name="_rn", offset=1)
+        )
+
+        df_freetext = (
+            pl.scan_csv(path_freetext, infer_schema=False, separator="\t")
+            .with_row_index(name="_rn", offset=1)
+        )
+
+        df_merged = df_resp.join(df_freetext, on="_rn", how="full")
+        to_concat.append(df_merged)
+
+    pl.concat(to_concat).sink_parquet(parquet_output)
+
+
+def validate_tsv_gz(filename: str, in_dir: Path) -> Path:
+    """Check if path exists and is a proper TSV & gz"""
+    full_path = (in_dir / filename.strip()).resolve()
+
+    if not full_path.exists():
+        raise FileNotFoundError(f"File does not exist: {full_path}")
+
+    # Check it's readable as a gzip file
+    try:
+        with gzip.open(full_path, "rt", encoding="utf-8") as ff:
+            first_line = ff.readline()
+    except OSError as ee:
+        raise ValueError(f"File is not a valid gzip: {full_path}") from ee
+
+    # Check it's actual TSV
+    if "\t" not in first_line:
+        raise ValueError(
+            f"File does not appear to be TSV (no \\t on first line): {full_path}"
+        )
+
+    return full_path
+
+
+def check_columns(file_path: Path, expected_columns: list[str], label: str) -> None:
+    actual_columns = get_columns(file_path)
+
+    if actual_columns != expected_columns:
+        if len(actual_columns) == 0:
+            raise Exception(f"No columns in {file_path}")
+
+        if len(expected_columns) == 0:
+            raise Exception(
+                f"Misconfigured expected columns ({label}): no columns listed"
+            )
+
+        if set(actual_columns) != set(expected_columns):
+            message = f"Columns differ for {label}:\n"
+            message += f"Only in expected columns: {list(set(expected_columns) - set(actual_columns))}\n"
+            message += f"Only in actual columns: {list(set(actual_columns) - set(expected_columns))}"
+            raise Exception(message)
+
+        # Else it's the same columns but in different order
+        message = "Column order differ:\n"
+        for col_expected, col_actual in zip_longest(expected_columns, actual_columns):
+            comp = "==" if col_expected == col_actual else "=!=/!\\=!="
+            message += f"{col_expected} {comp} {col_actual}\n"
+        raise Exception(message)
+
+
+def get_columns(input_path: Path) -> list[str]:
+    # We checked that the file is a proper TSV gz beforehand, so we now explicitely specify the separator
+    df = pl.read_csv(
+        input_path, has_header=True, separator="\t", infer_schema=False, n_rows=0
+    )
+    return df.columns
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--list-file",
+        required=True,
+        type=Path,
+        help="File containing pair of paths to responses & freetext data, one pair per line (TSV without header).",
+    )
+
+    args = parser.parse_args()
+
+    pairs = validate_input_pairs(args.list_file)
+
+    merge_by_pair(pairs, "/tmp/out.parquet")
diff --git a/mypolars/pyproject.toml b/mypolars/pyproject.toml
new file mode 100644
index 0000000..26939d3
--- /dev/null
+++ b/mypolars/pyproject.toml
@@ -0,0 +1,15 @@
+[project]
+name = "mypolars"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.14"
+dependencies = [
+    "polars>=1.40.0",
+]
+
+[dependency-groups]
+dev = [
+    "ipython>=9.12.0",
+    "ty>=0.0.32",
+]
diff --git a/mypolars/uv.lock b/mypolars/uv.lock
new file mode 100644
index 0000000..00fb747
--- /dev/null
+++ b/mypolars/uv.lock
@@ -0,0 +1,263 @@
+version = 1
+revision = 3
+requires-python = ">=3.14"
+
+[[package]]
+name = "asttokens"
+version = "3.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/be/a5/8e3f9b6771b0b408517c82d97aed8f2036509bc247d46114925e32fe33f0/asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7", size = 62308, upload-time = "2025-11-15T16:43:48.578Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
+]
+
+[[package]]
+name = "decorator"
+version = "5.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" },
+]
+
+[[package]]
+name = "executing"
+version = "2.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" },
+]
+
+[[package]]
+name = "ipython"
+version = "9.12.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "decorator" },
+    { name = "ipython-pygments-lexers" },
+    { name = "jedi" },
+    { name = "matplotlib-inline" },
+    { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "prompt-toolkit" },
+    { name = "pygments" },
+    { name = "stack-data" },
+    { name = "traitlets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3a/73/7114f80a8f9cabdb13c27732dce24af945b2923dcab80723602f7c8bc2d8/ipython-9.12.0.tar.gz", hash = "sha256:01daa83f504b693ba523b5a407246cabde4eb4513285a3c6acaff11a66735ee4", size = 4428879, upload-time = "2026-03-27T09:42:45.312Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/59/22/906c8108974c673ebef6356c506cebb6870d48cedea3c41e949e2dd556bb/ipython-9.12.0-py3-none-any.whl", hash = "sha256:0f2701e8ee86e117e37f50563205d36feaa259d2e08d4a6bc6b6d74b18ce128d", size = 625661, upload-time = "2026-03-27T09:42:42.831Z" },
+]
+
+[[package]]
+name = "ipython-pygments-lexers"
+version = "1.1.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" },
+]
+
+[[package]]
+name = "jedi"
+version = "0.19.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "parso" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" },
+]
+
+[[package]]
+name = "matplotlib-inline"
+version = "0.2.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "traitlets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" },
+]
+
+[[package]]
+name = "mypolars"
+version = "0.1.0"
+source = { virtual = "." }
+dependencies = [
+    { name = "polars" },
+]
+
+[package.dev-dependencies]
+dev = [
+    { name = "ipython" },
+    { name = "ty" },
+]
+
+[package.metadata]
+requires-dist = [{ name = "polars", specifier = ">=1.40.0" }]
+
+[package.metadata.requires-dev]
+dev = [
+    { name = "ipython", specifier = ">=9.12.0" },
+    { name = "ty", specifier = ">=0.0.32" },
+]
+
+[[package]]
+name = "parso"
+version = "0.8.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/81/76/a1e769043c0c0c9fe391b702539d594731a4362334cdf4dc25d0c09761e7/parso-0.8.6.tar.gz", hash = "sha256:2b9a0332696df97d454fa67b81618fd69c35a7b90327cbe6ba5c92d2c68a7bfd", size = 401621, upload-time = "2026-02-09T15:45:24.425Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl", hash = "sha256:2c549f800b70a5c4952197248825584cb00f033b29c692671d3bf08bf380baff", size = 106894, upload-time = "2026-02-09T15:45:21.391Z" },
+]
+
+[[package]]
+name = "pexpect"
+version = "4.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "ptyprocess" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" },
+]
+
+[[package]]
+name = "polars"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "polars-runtime-32" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d9/1b/eea7d6fe6daafc1d784cc0f76c729b28051837ccb2d51ae64a0a3f798142/polars-1.40.0.tar.gz", hash = "sha256:711dd50dcbc35ba42a2625fcadc2a1349e2e9abf48e35631bdabafb90d89874b", size = 732943, upload-time = "2026-04-18T05:25:26.077Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b4/ad/d5ed79269b7fe59a3dbbfbdbecbe1e59a0b56e38d36491e57d2bfb5846c1/polars-1.40.0-py3-none-any.whl", hash = "sha256:60b1d677ca363e2fc6fdea8c3d16c0653fd52cc37f0249e0f29d9536d5aa45ef", size = 828012, upload-time = "2026-04-18T05:23:39.055Z" },
+]
+
+[[package]]
+name = "polars-runtime-32"
+version = "1.40.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fb/b2/eae6c1b3d16c7a64ff382f557985ff939cce13455e8c9d056ab8e1e0fc87/polars_runtime_32-1.40.0.tar.gz", hash = "sha256:e31bff8bd37492c714e155e2e1429ac2d9ddf2dd6ec6474cc1cc70ac0b2bd6af", size = 2935285, upload-time = "2026-04-18T05:25:28.038Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b0/e4/2325689d2af4f9e70699ff98e8a2543707bebc34af78a5fe0e654107d9ed/polars_runtime_32-1.40.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:cab3ac7ff5bc9e0f4b3b146015569e9417cf0eaff8d3fb71004d73d67b6f09c7", size = 52092528, upload-time = "2026-04-18T05:23:42.341Z" },
+    { url = "https://files.pythonhosted.org/packages/19/a6/82157b19c5c40b2c1ed0493b87b9eaf9b4863cdedca5575ee083488b45ba/polars_runtime_32-1.40.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d29624c75c4049253300786d00882fce620b3677ce495ebc4199292de8c2ba02", size = 46365073, upload-time = "2026-04-18T05:23:46.7Z" },
+    { url = "https://files.pythonhosted.org/packages/85/b5/5c4f1f2545f56c664cc57bbdd1aa66fcfcb129aa137ed72cc81d58eb480f/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a034dc0d8481fc1ca0456ab33e98e53a4c6d6cc6a2edb36246cc81c936b925dc", size = 50250561, upload-time = "2026-04-18T05:23:51.316Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/51/cb5eb75394f39c0ec14fddcc9b11adb707e1f28224a552ecbfa72d39b61b/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70e78c2f13a54a9d92ae30d2625bda759173cc4867ad6a39f85f140058d899c6", size = 56243695, upload-time = "2026-04-18T05:23:55.932Z" },
+    { url = "https://files.pythonhosted.org/packages/16/3a/be1437c0fbecbb07d81b151456089c3cf054eea5a791f849ed39b67611ca/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1843272c0ef49f4a07435888f0059eca08ec16ab9880219c457195a081df0281", size = 50427843, upload-time = "2026-04-18T05:24:00.159Z" },
+    { url = "https://files.pythonhosted.org/packages/be/c7/ea6449a2161816a13ed1d8aa02177d5a0594e011f0df5ddd2fad8e5bf20e/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:081237dba07f15d61fc151825f203165480e9503ebe72a474a8c99aa78021962", size = 54153077, upload-time = "2026-04-18T05:24:05.066Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/1a/0b239138afe8b80a1a0b4c95db3884e6afbbe82ec3318918ab03bc57f231/polars_runtime_32-1.40.0-cp310-abi3-win_amd64.whl", hash = "sha256:a916040e0b7f461ce987e4551fed9eea5914b4fbb5af907b1d9e80db71fadeb5", size = 51822748, upload-time = "2026-04-18T05:24:09.384Z" },
+    { url = "https://files.pythonhosted.org/packages/06/ce/c16ef8fd3030b7342032b040fab21a42f6fee57e47ee7f41e2f1a1e36f01/polars_runtime_32-1.40.0-cp310-abi3-win_arm64.whl", hash = "sha256:719c64eecde24a95aa3599eb9c8efc98c1499bab7ef9c01cbbe8939cd583e654", size = 45819617, upload-time = "2026-04-18T05:24:13.214Z" },
+]
+
+[[package]]
+name = "prompt-toolkit"
+version = "3.0.52"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "wcwidth" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" },
+]
+
+[[package]]
+name = "ptyprocess"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" },
+]
+
+[[package]]
+name = "pure-eval"
+version = "0.2.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
+]
+
+[[package]]
+name = "pygments"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
+]
+
+[[package]]
+name = "stack-data"
+version = "0.6.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "asttokens" },
+    { name = "executing" },
+    { name = "pure-eval" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" },
+]
+
+[[package]]
+name = "traitlets"
+version = "5.14.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" },
+]
+
+[[package]]
+name = "ty"
+version = "0.0.32"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/85/7e/2aa791c9ae7b8cd5024cd4122e92267f664ca954cea3def3211919fa3c1f/ty-0.0.32.tar.gz", hash = "sha256:8743174c5f920f6700a4a0c9de140109189192ba16226884cd50095b43b8a45c", size = 5522294, upload-time = "2026-04-20T19:29:01.626Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/62/eb/1075dc6a49d7acbe2584ae4d5b410c41b1f177a5adcc567e09eca4c69000/ty-0.0.32-py3-none-linux_armv6l.whl", hash = "sha256:dacbc2f6cd698d488ae7436838ff929570455bf94bfa4d9fe57a630c552aff83", size = 10902959, upload-time = "2026-04-20T19:28:31.907Z" },
+    { url = "https://files.pythonhosted.org/packages/33/d2/c35fc8bc66e98d1ee9b0f8ed319bf743e450e1f1e997574b178fab75670f/ty-0.0.32-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:914bbc4f605ce2a9e2a78982e28fae1d3359a169d141f9dc3b4c7749cd5eca81", size = 10726172, upload-time = "2026-04-20T19:28:44.765Z" },
+    { url = "https://files.pythonhosted.org/packages/96/32/c827da3ca480456fb02d8cea68a2609273b6c220fea0be9a4c8d8470b86e/ty-0.0.32-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4787ac9fe1f86b1f3133f5c6732adbe2df5668b50c679ac6e2d98cd284da812f", size = 10163701, upload-time = "2026-04-20T19:28:27.005Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/9e/2734478fbdb90c160cb2813a3916a16a2af5c1e231f87d635f6131d781fb/ty-0.0.32-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8ea0a728af99fe40dd744cba6441a2404f80b7f4bde17aa6da393810af5ea57", size = 10656220, upload-time = "2026-04-20T19:29:03.814Z" },
+    { url = "https://files.pythonhosted.org/packages/44/9f/0007da2d35e424debe7e9f86ffbc1ab7f60983cfbc5f0411324ab2de5292/ty-0.0.32-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2850561f9b018ae33d7e5bbfa0ac414d3c518513edcffe43877dc9801446b9c5", size = 10696086, upload-time = "2026-04-20T19:28:46.829Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/5e/ce5fd4ec803222ae3e69a76d2a2db2eed55e19f5b131702b9789ef45f93d/ty-0.0.32-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b5fa2fb3c614349ee211d36476b49d88c5ef79a687cdb91b2872ad023b94d2f8", size = 11184800, upload-time = "2026-04-20T19:28:42.57Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/46/ebcf67a5999421331214aac51a7464db42de2be15bbe929c612a3ed0b039/ty-0.0.32-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b89969307ab2417d41c9be8059dd79feea577234e1e10d35132f5495e0d42c6", size = 11718718, upload-time = "2026-04-20T19:28:36.433Z" },
+    { url = "https://files.pythonhosted.org/packages/18/2c/2141c86ed0ce0962b45cefb658a95e734f59759d47f20afdcd9c732910a1/ty-0.0.32-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b59868ede9b1d69a088f0d695df52a0061f95fa7baa1d5e0dc6fc9cf06e1334", size = 11346369, upload-time = "2026-04-20T19:28:48.967Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/da/ed6f772339cf29bd9a46def9d6db5084689eb574ee4d150ff704224c1ed8/ty-0.0.32-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8300caf35345498e9b9b03e550bba03cee8f5f5f8ab4c83c3b1ff1b7403b7d3a", size = 11280714, upload-time = "2026-04-20T19:28:51.516Z" },
+    { url = "https://files.pythonhosted.org/packages/da/9b/c6813987edf4816a40e0c8e408b555f97d3f267c7b3a1688c8bbdf65609c/ty-0.0.32-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:583c7094f4574b02f724db924f98b804d1387a0bd9405ecb5e078cc0f47fbcfb", size = 10638806, upload-time = "2026-04-20T19:28:29.651Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/d4/0cefcbd2ad0f3d51762ccf58e652ec7da146eb6ae34f87228f6254bbb8be/ty-0.0.32-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e44ebe1bb4143a5628bc4db67ac0dfebe14594af671e4ee66f6f2e983da56501", size = 10726106, upload-time = "2026-04-20T19:29:06.3Z" },
+    { url = "https://files.pythonhosted.org/packages/32/ad/2c8a97f91f06311f4367400f7d13534bbda2522c73c99a3e4c0757dff9b8/ty-0.0.32-py3-none-musllinux_1_2_i686.whl", hash = "sha256:06f17ada3e069cba6148342ef88e9929156beca8473e8d4f101b68f66c75643e", size = 10872951, upload-time = "2026-04-20T19:28:34.077Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/68/42293f9248106dd51875120971a5cc6ea315c2c4dcfb8e59aa063aa0af26/ty-0.0.32-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e96e60fa556cec04f15d7ea62d2ceee5982bd389233e961ab9fd42304e278175", size = 11363334, upload-time = "2026-04-20T19:28:54.036Z" },
+    { url = "https://files.pythonhosted.org/packages/df/92/be9abf4d3e589ad5023e2ea965b93e204ec856420d46adf73c5c36c04678/ty-0.0.32-py3-none-win32.whl", hash = "sha256:2ff2ebb4986b24aebcf1444db7db5ca41b36086040e95eea9f8fb851c11e805c", size = 10260689, upload-time = "2026-04-20T19:28:56.541Z" },
+    { url = "https://files.pythonhosted.org/packages/14/61/dc86acea899349d2579cb8419aecedd83dc504d7d6a10df65eef546c8300/ty-0.0.32-py3-none-win_amd64.whl", hash = "sha256:ba7284a4a954b598c1b31500352b3ec1f89bff533825592b5958848226fdc7ee", size = 11255371, upload-time = "2026-04-20T19:28:39.917Z" },
+    { url = "https://files.pythonhosted.org/packages/43/01/beffec56d71ca25b343ede63adb076456b5b3e211f1c066452a44cd120b3/ty-0.0.32-py3-none-win_arm64.whl", hash = "sha256:7e10aadbdbda989a7d567ee6a37f8b98d4d542e31e3b190a2879fd581f75d658", size = 10658087, upload-time = "2026-04-20T19:28:59.286Z" },
+]
+
+[[package]]
+name = "wcwidth"
+version = "0.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" },
+]

From a3fc44a356f2c42d81289b09bc10c0c9df11566e Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Wed, 22 Apr 2026 07:48:45 +0000
Subject: [PATCH 02/22] add check about horizontal merge consistency

---
 mypolars/main.py | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/mypolars/main.py b/mypolars/main.py
index cd47118..d8a92d6 100644
--- a/mypolars/main.py
+++ b/mypolars/main.py
@@ -3,13 +3,13 @@
 
 Note: needed ~128GB memory to run on R14 data.
 """
+
 import gzip
 from argparse import ArgumentParser
 from itertools import zip_longest
 from pathlib import Path
 
 import polars as pl
-pl.Config.set_verbose(True)  
 
 
 # TODO
@@ -59,6 +59,9 @@
 ]
 
 
+SUFFIX_JOIN_COL = "_right"
+
+
 def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]:
     pairs = []
     with open(list_file) as fp:
@@ -84,20 +87,45 @@ def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) ->
 
         df_resp = (
             pl.scan_csv(path_responses, infer_schema=False, separator="\t")
-            .with_row_index(name="_rn", offset=1)
+            .with_row_index(name="_rowid", offset=1)
+            .with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_responses.name)
         )
 
         df_freetext = (
             pl.scan_csv(path_freetext, infer_schema=False, separator="\t")
-            .with_row_index(name="_rn", offset=1)
+            .with_row_index(name="_rowid", offset=1)
+            .with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_freetext.name)
         )
 
-        df_merged = df_resp.join(df_freetext, on="_rn", how="full")
+        df_merged = df_resp.join(
+            df_freetext, on="_rowid", how="full", suffix=SUFFIX_JOIN_COL
+        )
         to_concat.append(df_merged)
 
     pl.concat(to_concat).sink_parquet(parquet_output)
 
 
+def check_merge_consistency(data_path: str | Path):
+    shared_cols = set(EXPECTED_COLUMNS_RESPONSES).intersection(
+        EXPECTED_COLUMNS_FREETEXT
+    )
+
+    all_check = (
+        pl.scan_parquet(data_path)
+        .select(
+            pl.all_horizontal(
+                pl.col(cc) == pl.col(cc + SUFFIX_JOIN_COL) for cc in shared_cols
+            ).all()
+        )
+        .collect()
+        .item()
+    )
+
+    assert all_check
+
+    return all_check
+
+
 def validate_tsv_gz(filename: str, in_dir: Path) -> Path:
     """Check if path exists and is a proper TSV & gz"""
     full_path = (in_dir / filename.strip()).resolve()
@@ -163,9 +191,19 @@ def get_columns(input_path: Path) -> list[str]:
         type=Path,
         help="File containing pair of paths to responses & freetext data, one pair per line (TSV without header).",
     )
+    parser.add_argument(
+        "--post-merge-file",
+        required=True,
+        type=Path,
+        help="Path to intermediary output file from the merge stage",
+    )
 
     args = parser.parse_args()
 
     pairs = validate_input_pairs(args.list_file)
 
-    merge_by_pair(pairs, "/tmp/out.parquet")
+    print(">> merge_by_pair")
+    merge_by_pair(pairs, args.post_merge_file)
+
+    print(">> check_merge_consistency")
+    print(check_merge_consistency(args.post_merge_file))

From 97c4b50c63da7838ab58397690308bd8d1102535 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Fri, 15 May 2026 07:58:18 +0000
Subject: [PATCH 03/22] implement low-memory "merge" stage

Switching to these made it memory friendly:
- `pl.concat(... how="horizontal")`
  instead of `.join`
- `.collect(engine="streaming")`
  instead of just `.collect()`

Also added another check for the merging of main <> freetext files.


NOTE: Polars is better than DuckDB for this since it assigns line
numbers in a deterministic way (DuckDB does not guarantee this).
---
 mypolars/main.py | 86 +++++++++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 33 deletions(-)

diff --git a/mypolars/main.py b/mypolars/main.py
index d8a92d6..cbaac33 100644
--- a/mypolars/main.py
+++ b/mypolars/main.py
@@ -1,7 +1,5 @@
 """
 Merges the incoming Kanta Lab data from THL into one coherent file.
-
-Note: needed ~128GB memory to run on R14 data.
 """
 
 import gzip
@@ -13,10 +11,9 @@
 
 
 # TODO
-# 4. Validate that shared column match
 # 5. Post WDL sort-dup: subset columns, join SEX, sort, output unique/duplicates/error rows
 
-EXPECTED_COLUMNS_RESPONSES = [
+EXPECTED_COLUMNS_MAIN = [
     "FINNGENID",
     "EVENT_AGE",
     "APPROX_EVENT_DAY",
@@ -58,8 +55,8 @@
     "tutkimustulosteksti",
 ]
 
-
-SUFFIX_JOIN_COL = "_right"
+COL_PREFIX_MAIN = "main."
+COL_PREFIX_FREETEXT = "freetext."
 
 
 def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]:
@@ -68,13 +65,13 @@ def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path,
         for line in fp:
             values = line.split(separator, maxsplit=2)
 
-            responses = validate_tsv_gz(values[0], list_file.parent)
+            main = validate_tsv_gz(values[0], list_file.parent)
             freetext = validate_tsv_gz(values[1], list_file.parent)
 
-            pairs.append((responses, freetext))
+            pairs.append((main, freetext))
 
-    for responses, freetext in pairs:
-        check_columns(responses, EXPECTED_COLUMNS_RESPONSES, "responses")
+    for main, freetext in pairs:
+        check_columns(main, EXPECTED_COLUMNS_MAIN, "main")
         check_columns(freetext, EXPECTED_COLUMNS_FREETEXT, "freetext")
 
     return pairs
@@ -82,48 +79,71 @@ def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path,
 
 def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> None:
     to_concat = []
-    for path_responses, path_freetext in pairs:
-        print(f"Processing {path_responses} & {path_freetext}")
-
-        df_resp = (
-            pl.scan_csv(path_responses, infer_schema=False, separator="\t")
-            .with_row_index(name="_rowid", offset=1)
-            .with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_responses.name)
+    for path_main, path_freetext in pairs:
+        print(f"Processing {path_main} & {path_freetext}")
+
+        df_main = (
+            pl.scan_csv(
+                path_main,
+                infer_schema=False,
+                separator="\t",
+                row_index_name="_rowid",
+                row_index_offset=1,
+            )
+            .with_columns(pl.lit(path_main.name).alias("_filename"))
+            .select(pl.all().name.prefix(COL_PREFIX_MAIN))
         )
 
         df_freetext = (
-            pl.scan_csv(path_freetext, infer_schema=False, separator="\t")
-            .with_row_index(name="_rowid", offset=1)
-            .with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_freetext.name)
+            pl.scan_csv(
+                path_freetext,
+                infer_schema=False,
+                separator="\t",
+                row_index_name="_rowid",
+                row_index_offset=1,
+            )
+            .with_columns(pl.lit(path_freetext.name).alias("_filename"))
+            .select(pl.all().name.prefix(COL_PREFIX_FREETEXT))
         )
 
-        df_merged = df_resp.join(
-            df_freetext, on="_rowid", how="full", suffix=SUFFIX_JOIN_COL
-        )
+        df_merged = pl.concat([df_main, df_freetext], how="horizontal")
+
         to_concat.append(df_merged)
 
     pl.concat(to_concat).sink_parquet(parquet_output)
 
 
-def check_merge_consistency(data_path: str | Path):
-    shared_cols = set(EXPECTED_COLUMNS_RESPONSES).intersection(
-        EXPECTED_COLUMNS_FREETEXT
-    )
+def check_merge_consistency(data_path: str | Path) -> bool:
+    # First check: all shared columns have the same values
+    shared_cols = set(EXPECTED_COLUMNS_MAIN).intersection(EXPECTED_COLUMNS_FREETEXT)
 
-    all_check = (
+    check_shared_columns_same_values = (
         pl.scan_parquet(data_path)
         .select(
             pl.all_horizontal(
-                pl.col(cc) == pl.col(cc + SUFFIX_JOIN_COL) for cc in shared_cols
+                pl.col(COL_PREFIX_MAIN + cc) == pl.col(COL_PREFIX_FREETEXT + cc)
+                for cc in shared_cols
             ).all()
         )
-        .collect()
+        .collect(engine="streaming")
+        .item()
+    )
+
+    assert check_shared_columns_same_values
+
+    # Second check: main and freetext have same height.
+    # This is done by checking the absence of null in _rowid, which happens iif
+    # the main and freetext data are of different height.
+    check_same_height = (
+        pl.scan_parquet(data_path)
+        .select(pl.all_horizontal(pl.selectors.ends_with("._rowid").is_not_null().all()))
+        .collect(engine="streaming")
         .item()
     )
 
-    assert all_check
+    assert check_same_height
 
-    return all_check
+    return check_shared_columns_same_values and check_same_height
 
 
 def validate_tsv_gz(filename: str, in_dir: Path) -> Path:
@@ -189,7 +209,7 @@ def get_columns(input_path: Path) -> list[str]:
         "--list-file",
         required=True,
         type=Path,
-        help="File containing pair of paths to responses & freetext data, one pair per line (TSV without header).",
+        help="File containing pair of paths to main & freetext data, one pair per line (TSV without header).",
     )
     parser.add_argument(
         "--post-merge-file",

From 52c8f8b23828ea3b96dad04c1a73f4acfcb13ed0 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 19 May 2026 11:47:05 +0000
Subject: [PATCH 04/22] add build backend to pyproject.toml

Prerequisite in order to have the polars implementation pull the config
shared with the other import packages.
---
 pyproject.toml | 7 +++++++
 uv.lock        | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2d89315..5361d69 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,3 +14,10 @@ dev = [
     "pytest>=9.0.3",
     "ruff>=0.15.10",
 ]
+
+[build-system]
+requires = ["uv_build>=0.11.14,<0.12.0"]
+build-backend = "uv_build"
+
+[tool.uv.build-backend]
+module-name = "kanta"
diff --git a/uv.lock b/uv.lock
index 14d81d0..f1334b2 100644
--- a/uv.lock
+++ b/uv.lock
@@ -103,7 +103,7 @@ wheels = [
 [[package]]
 name = "kanta-lab-preprocessing"
 version = "0.1.0"
-source = { virtual = "." }
+source = { editable = "." }
 dependencies = [
     { name = "pandas" },
 ]

From 6d20d564892e2dffe89fb0cd449bf163a5d3fe2d Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 19 May 2026 12:35:25 +0000
Subject: [PATCH 05/22] move polars rewrite under src/kanta/

---
 .gitignore                                    |   3 +
 .python-version                               |   1 -
 mypolars/.python-version                      |   1 -
 mypolars/pyproject.toml                       |  15 -
 mypolars/uv.lock                              | 263 ------------------
 pyproject.toml                                |   3 +-
 mypolars/README.md => src/kanta/config.py     |   0
 src/kanta/intake/__init__.py                  |   0
 .../main.py => src/kanta/intake/assemble.py   |   0
 uv.lock                                       |  55 +++-
 10 files changed, 58 insertions(+), 283 deletions(-)
 delete mode 100644 .python-version
 delete mode 100644 mypolars/.python-version
 delete mode 100644 mypolars/pyproject.toml
 delete mode 100644 mypolars/uv.lock
 rename mypolars/README.md => src/kanta/config.py (100%)
 create mode 100644 src/kanta/intake/__init__.py
 rename mypolars/main.py => src/kanta/intake/assemble.py (100%)

diff --git a/.gitignore b/.gitignore
index 818e20a..c000fa2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -119,6 +119,9 @@ ipython_config.py
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 
+# uv
+.python-version
+
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 
diff --git a/.python-version b/.python-version
deleted file mode 100644
index 24ee5b1..0000000
--- a/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.13
diff --git a/mypolars/.python-version b/mypolars/.python-version
deleted file mode 100644
index 6324d40..0000000
--- a/mypolars/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.14
diff --git a/mypolars/pyproject.toml b/mypolars/pyproject.toml
deleted file mode 100644
index 26939d3..0000000
--- a/mypolars/pyproject.toml
+++ /dev/null
@@ -1,15 +0,0 @@
-[project]
-name = "mypolars"
-version = "0.1.0"
-description = "Add your description here"
-readme = "README.md"
-requires-python = ">=3.14"
-dependencies = [
-    "polars>=1.40.0",
-]
-
-[dependency-groups]
-dev = [
-    "ipython>=9.12.0",
-    "ty>=0.0.32",
-]
diff --git a/mypolars/uv.lock b/mypolars/uv.lock
deleted file mode 100644
index 00fb747..0000000
--- a/mypolars/uv.lock
+++ /dev/null
@@ -1,263 +0,0 @@
-version = 1
-revision = 3
-requires-python = ">=3.14"
-
-[[package]]
-name = "asttokens"
-version = "3.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/be/a5/8e3f9b6771b0b408517c82d97aed8f2036509bc247d46114925e32fe33f0/asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7", size = 62308, upload-time = "2025-11-15T16:43:48.578Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" },
-]
-
-[[package]]
-name = "colorama"
-version = "0.4.6"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
-]
-
-[[package]]
-name = "decorator"
-version = "5.2.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" },
-]
-
-[[package]]
-name = "executing"
-version = "2.2.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" },
-]
-
-[[package]]
-name = "ipython"
-version = "9.12.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "colorama", marker = "sys_platform == 'win32'" },
-    { name = "decorator" },
-    { name = "ipython-pygments-lexers" },
-    { name = "jedi" },
-    { name = "matplotlib-inline" },
-    { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
-    { name = "prompt-toolkit" },
-    { name = "pygments" },
-    { name = "stack-data" },
-    { name = "traitlets" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/3a/73/7114f80a8f9cabdb13c27732dce24af945b2923dcab80723602f7c8bc2d8/ipython-9.12.0.tar.gz", hash = "sha256:01daa83f504b693ba523b5a407246cabde4eb4513285a3c6acaff11a66735ee4", size = 4428879, upload-time = "2026-03-27T09:42:45.312Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/59/22/906c8108974c673ebef6356c506cebb6870d48cedea3c41e949e2dd556bb/ipython-9.12.0-py3-none-any.whl", hash = "sha256:0f2701e8ee86e117e37f50563205d36feaa259d2e08d4a6bc6b6d74b18ce128d", size = 625661, upload-time = "2026-03-27T09:42:42.831Z" },
-]
-
-[[package]]
-name = "ipython-pygments-lexers"
-version = "1.1.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "pygments" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" },
-]
-
-[[package]]
-name = "jedi"
-version = "0.19.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "parso" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" },
-]
-
-[[package]]
-name = "matplotlib-inline"
-version = "0.2.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "traitlets" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" },
-]
-
-[[package]]
-name = "mypolars"
-version = "0.1.0"
-source = { virtual = "." }
-dependencies = [
-    { name = "polars" },
-]
-
-[package.dev-dependencies]
-dev = [
-    { name = "ipython" },
-    { name = "ty" },
-]
-
-[package.metadata]
-requires-dist = [{ name = "polars", specifier = ">=1.40.0" }]
-
-[package.metadata.requires-dev]
-dev = [
-    { name = "ipython", specifier = ">=9.12.0" },
-    { name = "ty", specifier = ">=0.0.32" },
-]
-
-[[package]]
-name = "parso"
-version = "0.8.6"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/81/76/a1e769043c0c0c9fe391b702539d594731a4362334cdf4dc25d0c09761e7/parso-0.8.6.tar.gz", hash = "sha256:2b9a0332696df97d454fa67b81618fd69c35a7b90327cbe6ba5c92d2c68a7bfd", size = 401621, upload-time = "2026-02-09T15:45:24.425Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl", hash = "sha256:2c549f800b70a5c4952197248825584cb00f033b29c692671d3bf08bf380baff", size = 106894, upload-time = "2026-02-09T15:45:21.391Z" },
-]
-
-[[package]]
-name = "pexpect"
-version = "4.9.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "ptyprocess" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" },
-]
-
-[[package]]
-name = "polars"
-version = "1.40.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "polars-runtime-32" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/d9/1b/eea7d6fe6daafc1d784cc0f76c729b28051837ccb2d51ae64a0a3f798142/polars-1.40.0.tar.gz", hash = "sha256:711dd50dcbc35ba42a2625fcadc2a1349e2e9abf48e35631bdabafb90d89874b", size = 732943, upload-time = "2026-04-18T05:25:26.077Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b4/ad/d5ed79269b7fe59a3dbbfbdbecbe1e59a0b56e38d36491e57d2bfb5846c1/polars-1.40.0-py3-none-any.whl", hash = "sha256:60b1d677ca363e2fc6fdea8c3d16c0653fd52cc37f0249e0f29d9536d5aa45ef", size = 828012, upload-time = "2026-04-18T05:23:39.055Z" },
-]
-
-[[package]]
-name = "polars-runtime-32"
-version = "1.40.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/fb/b2/eae6c1b3d16c7a64ff382f557985ff939cce13455e8c9d056ab8e1e0fc87/polars_runtime_32-1.40.0.tar.gz", hash = "sha256:e31bff8bd37492c714e155e2e1429ac2d9ddf2dd6ec6474cc1cc70ac0b2bd6af", size = 2935285, upload-time = "2026-04-18T05:25:28.038Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b0/e4/2325689d2af4f9e70699ff98e8a2543707bebc34af78a5fe0e654107d9ed/polars_runtime_32-1.40.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:cab3ac7ff5bc9e0f4b3b146015569e9417cf0eaff8d3fb71004d73d67b6f09c7", size = 52092528, upload-time = "2026-04-18T05:23:42.341Z" },
-    { url = "https://files.pythonhosted.org/packages/19/a6/82157b19c5c40b2c1ed0493b87b9eaf9b4863cdedca5575ee083488b45ba/polars_runtime_32-1.40.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d29624c75c4049253300786d00882fce620b3677ce495ebc4199292de8c2ba02", size = 46365073, upload-time = "2026-04-18T05:23:46.7Z" },
-    { url = "https://files.pythonhosted.org/packages/85/b5/5c4f1f2545f56c664cc57bbdd1aa66fcfcb129aa137ed72cc81d58eb480f/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a034dc0d8481fc1ca0456ab33e98e53a4c6d6cc6a2edb36246cc81c936b925dc", size = 50250561, upload-time = "2026-04-18T05:23:51.316Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/51/cb5eb75394f39c0ec14fddcc9b11adb707e1f28224a552ecbfa72d39b61b/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70e78c2f13a54a9d92ae30d2625bda759173cc4867ad6a39f85f140058d899c6", size = 56243695, upload-time = "2026-04-18T05:23:55.932Z" },
-    { url = "https://files.pythonhosted.org/packages/16/3a/be1437c0fbecbb07d81b151456089c3cf054eea5a791f849ed39b67611ca/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1843272c0ef49f4a07435888f0059eca08ec16ab9880219c457195a081df0281", size = 50427843, upload-time = "2026-04-18T05:24:00.159Z" },
-    { url = "https://files.pythonhosted.org/packages/be/c7/ea6449a2161816a13ed1d8aa02177d5a0594e011f0df5ddd2fad8e5bf20e/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:081237dba07f15d61fc151825f203165480e9503ebe72a474a8c99aa78021962", size = 54153077, upload-time = "2026-04-18T05:24:05.066Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/1a/0b239138afe8b80a1a0b4c95db3884e6afbbe82ec3318918ab03bc57f231/polars_runtime_32-1.40.0-cp310-abi3-win_amd64.whl", hash = "sha256:a916040e0b7f461ce987e4551fed9eea5914b4fbb5af907b1d9e80db71fadeb5", size = 51822748, upload-time = "2026-04-18T05:24:09.384Z" },
-    { url = "https://files.pythonhosted.org/packages/06/ce/c16ef8fd3030b7342032b040fab21a42f6fee57e47ee7f41e2f1a1e36f01/polars_runtime_32-1.40.0-cp310-abi3-win_arm64.whl", hash = "sha256:719c64eecde24a95aa3599eb9c8efc98c1499bab7ef9c01cbbe8939cd583e654", size = 45819617, upload-time = "2026-04-18T05:24:13.214Z" },
-]
-
-[[package]]
-name = "prompt-toolkit"
-version = "3.0.52"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "wcwidth" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" },
-]
-
-[[package]]
-name = "ptyprocess"
-version = "0.7.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" },
-]
-
-[[package]]
-name = "pure-eval"
-version = "0.2.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
-]
-
-[[package]]
-name = "pygments"
-version = "2.20.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
-]
-
-[[package]]
-name = "stack-data"
-version = "0.6.3"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "asttokens" },
-    { name = "executing" },
-    { name = "pure-eval" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" },
-]
-
-[[package]]
-name = "traitlets"
-version = "5.14.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" },
-]
-
-[[package]]
-name = "ty"
-version = "0.0.32"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/85/7e/2aa791c9ae7b8cd5024cd4122e92267f664ca954cea3def3211919fa3c1f/ty-0.0.32.tar.gz", hash = "sha256:8743174c5f920f6700a4a0c9de140109189192ba16226884cd50095b43b8a45c", size = 5522294, upload-time = "2026-04-20T19:29:01.626Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/62/eb/1075dc6a49d7acbe2584ae4d5b410c41b1f177a5adcc567e09eca4c69000/ty-0.0.32-py3-none-linux_armv6l.whl", hash = "sha256:dacbc2f6cd698d488ae7436838ff929570455bf94bfa4d9fe57a630c552aff83", size = 10902959, upload-time = "2026-04-20T19:28:31.907Z" },
-    { url = "https://files.pythonhosted.org/packages/33/d2/c35fc8bc66e98d1ee9b0f8ed319bf743e450e1f1e997574b178fab75670f/ty-0.0.32-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:914bbc4f605ce2a9e2a78982e28fae1d3359a169d141f9dc3b4c7749cd5eca81", size = 10726172, upload-time = "2026-04-20T19:28:44.765Z" },
-    { url = "https://files.pythonhosted.org/packages/96/32/c827da3ca480456fb02d8cea68a2609273b6c220fea0be9a4c8d8470b86e/ty-0.0.32-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4787ac9fe1f86b1f3133f5c6732adbe2df5668b50c679ac6e2d98cd284da812f", size = 10163701, upload-time = "2026-04-20T19:28:27.005Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/9e/2734478fbdb90c160cb2813a3916a16a2af5c1e231f87d635f6131d781fb/ty-0.0.32-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8ea0a728af99fe40dd744cba6441a2404f80b7f4bde17aa6da393810af5ea57", size = 10656220, upload-time = "2026-04-20T19:29:03.814Z" },
-    { url = "https://files.pythonhosted.org/packages/44/9f/0007da2d35e424debe7e9f86ffbc1ab7f60983cfbc5f0411324ab2de5292/ty-0.0.32-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2850561f9b018ae33d7e5bbfa0ac414d3c518513edcffe43877dc9801446b9c5", size = 10696086, upload-time = "2026-04-20T19:28:46.829Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/5e/ce5fd4ec803222ae3e69a76d2a2db2eed55e19f5b131702b9789ef45f93d/ty-0.0.32-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b5fa2fb3c614349ee211d36476b49d88c5ef79a687cdb91b2872ad023b94d2f8", size = 11184800, upload-time = "2026-04-20T19:28:42.57Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/46/ebcf67a5999421331214aac51a7464db42de2be15bbe929c612a3ed0b039/ty-0.0.32-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b89969307ab2417d41c9be8059dd79feea577234e1e10d35132f5495e0d42c6", size = 11718718, upload-time = "2026-04-20T19:28:36.433Z" },
-    { url = "https://files.pythonhosted.org/packages/18/2c/2141c86ed0ce0962b45cefb658a95e734f59759d47f20afdcd9c732910a1/ty-0.0.32-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b59868ede9b1d69a088f0d695df52a0061f95fa7baa1d5e0dc6fc9cf06e1334", size = 11346369, upload-time = "2026-04-20T19:28:48.967Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/da/ed6f772339cf29bd9a46def9d6db5084689eb574ee4d150ff704224c1ed8/ty-0.0.32-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8300caf35345498e9b9b03e550bba03cee8f5f5f8ab4c83c3b1ff1b7403b7d3a", size = 11280714, upload-time = "2026-04-20T19:28:51.516Z" },
-    { url = "https://files.pythonhosted.org/packages/da/9b/c6813987edf4816a40e0c8e408b555f97d3f267c7b3a1688c8bbdf65609c/ty-0.0.32-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:583c7094f4574b02f724db924f98b804d1387a0bd9405ecb5e078cc0f47fbcfb", size = 10638806, upload-time = "2026-04-20T19:28:29.651Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/d4/0cefcbd2ad0f3d51762ccf58e652ec7da146eb6ae34f87228f6254bbb8be/ty-0.0.32-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e44ebe1bb4143a5628bc4db67ac0dfebe14594af671e4ee66f6f2e983da56501", size = 10726106, upload-time = "2026-04-20T19:29:06.3Z" },
-    { url = "https://files.pythonhosted.org/packages/32/ad/2c8a97f91f06311f4367400f7d13534bbda2522c73c99a3e4c0757dff9b8/ty-0.0.32-py3-none-musllinux_1_2_i686.whl", hash = "sha256:06f17ada3e069cba6148342ef88e9929156beca8473e8d4f101b68f66c75643e", size = 10872951, upload-time = "2026-04-20T19:28:34.077Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/68/42293f9248106dd51875120971a5cc6ea315c2c4dcfb8e59aa063aa0af26/ty-0.0.32-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e96e60fa556cec04f15d7ea62d2ceee5982bd389233e961ab9fd42304e278175", size = 11363334, upload-time = "2026-04-20T19:28:54.036Z" },
-    { url = "https://files.pythonhosted.org/packages/df/92/be9abf4d3e589ad5023e2ea965b93e204ec856420d46adf73c5c36c04678/ty-0.0.32-py3-none-win32.whl", hash = "sha256:2ff2ebb4986b24aebcf1444db7db5ca41b36086040e95eea9f8fb851c11e805c", size = 10260689, upload-time = "2026-04-20T19:28:56.541Z" },
-    { url = "https://files.pythonhosted.org/packages/14/61/dc86acea899349d2579cb8419aecedd83dc504d7d6a10df65eef546c8300/ty-0.0.32-py3-none-win_amd64.whl", hash = "sha256:ba7284a4a954b598c1b31500352b3ec1f89bff533825592b5958848226fdc7ee", size = 11255371, upload-time = "2026-04-20T19:28:39.917Z" },
-    { url = "https://files.pythonhosted.org/packages/43/01/beffec56d71ca25b343ede63adb076456b5b3e211f1c066452a44cd120b3/ty-0.0.32-py3-none-win_arm64.whl", hash = "sha256:7e10aadbdbda989a7d567ee6a37f8b98d4d542e31e3b190a2879fd581f75d658", size = 10658087, upload-time = "2026-04-20T19:28:59.286Z" },
-]
-
-[[package]]
-name = "wcwidth"
-version = "0.6.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" },
-]
diff --git a/pyproject.toml b/pyproject.toml
index 5361d69..7d4654e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,9 +3,10 @@ name = "kanta-lab-preprocessing"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-requires-python = ">=3.13"
+requires-python = ">=3.12"
 dependencies = [
     "pandas>=3.0.2",
+    "polars>=1.40.0",
 ]
 
 [dependency-groups]
diff --git a/mypolars/README.md b/src/kanta/config.py
similarity index 100%
rename from mypolars/README.md
rename to src/kanta/config.py
diff --git a/src/kanta/intake/__init__.py b/src/kanta/intake/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mypolars/main.py b/src/kanta/intake/assemble.py
similarity index 100%
rename from mypolars/main.py
rename to src/kanta/intake/assemble.py
diff --git a/uv.lock b/uv.lock
index f1334b2..209cda9 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,6 +1,6 @@
 version = 1
 revision = 3
-requires-python = ">=3.13"
+requires-python = ">=3.12"
 resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'win32'",
     "python_full_version >= '3.14' and sys_platform == 'emscripten'",
@@ -106,6 +106,7 @@ version = "0.1.0"
 source = { editable = "." }
 dependencies = [
     { name = "pandas" },
+    { name = "polars" },
 ]
 
 [package.dev-dependencies]
@@ -116,7 +117,10 @@ dev = [
 ]
 
 [package.metadata]
-requires-dist = [{ name = "pandas", specifier = ">=3.0.2" }]
+requires-dist = [
+    { name = "pandas", specifier = ">=3.0.2" },
+    { name = "polars", specifier = ">=1.40.0" },
+]
 
 [package.metadata.requires-dev]
 dev = [
@@ -143,6 +147,17 @@ version = "2.4.4"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587, upload-time = "2026-03-29T13:22:01.298Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272, upload-time = "2026-03-29T13:18:49.223Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573, upload-time = "2026-03-29T13:18:52.629Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782, upload-time = "2026-03-29T13:18:55.579Z" },
+    { url = "https://files.pythonhosted.org/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8", size = 6552038, upload-time = "2026-03-29T13:18:57.769Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121", size = 15670666, upload-time = "2026-03-29T13:19:00.341Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e", size = 16645480, upload-time = "2026-03-29T13:19:03.63Z" },
+    { url = "https://files.pythonhosted.org/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44", size = 17020036, upload-time = "2026-03-29T13:19:07.428Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d", size = 18368643, upload-time = "2026-03-29T13:19:10.775Z" },
+    { url = "https://files.pythonhosted.org/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827", size = 5961117, upload-time = "2026-03-29T13:19:13.464Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a", size = 12320584, upload-time = "2026-03-29T13:19:16.155Z" },
+    { url = "https://files.pythonhosted.org/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec", size = 10221450, upload-time = "2026-03-29T13:19:18.994Z" },
     { url = "https://files.pythonhosted.org/packages/14/1d/d0a583ce4fefcc3308806a749a536c201ed6b5ad6e1322e227ee4848979d/numpy-2.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:08f2e31ed5e6f04b118e49821397f12767934cfdd12a1ce86a058f91e004ee50", size = 16684933, upload-time = "2026-03-29T13:19:22.47Z" },
     { url = "https://files.pythonhosted.org/packages/c1/62/2b7a48fbb745d344742c0277f01286dead15f3f68e4f359fbfcf7b48f70f/numpy-2.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e823b8b6edc81e747526f70f71a9c0a07ac4e7ad13020aa736bb7c9d67196115", size = 14694532, upload-time = "2026-03-29T13:19:25.581Z" },
     { url = "https://files.pythonhosted.org/packages/e5/87/499737bfba066b4a3bebff24a8f1c5b2dee410b209bc6668c9be692580f0/numpy-2.4.4-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4a19d9dba1a76618dd86b164d608566f393f8ec6ac7c44f0cc879011c45e65af", size = 5199661, upload-time = "2026-03-29T13:19:28.31Z" },
@@ -207,6 +222,14 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/da/99/b342345300f13440fe9fe385c3c481e2d9a595ee3bab4d3219247ac94e9a/pandas-3.0.2.tar.gz", hash = "sha256:f4753e73e34c8d83221ba58f232433fca2748be8b18dbca02d242ed153945043", size = 4645855, upload-time = "2026-03-31T06:48:30.816Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/f3/b0/c20bd4d6d3f736e6bd6b55794e9cd0a617b858eaad27c8f410ea05d953b7/pandas-3.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:232a70ebb568c0c4d2db4584f338c1577d81e3af63292208d615907b698a0f18", size = 10347921, upload-time = "2026-03-31T06:46:33.36Z" },
+    { url = "https://files.pythonhosted.org/packages/35/d0/4831af68ce30cc2d03c697bea8450e3225a835ef497d0d70f31b8cdde965/pandas-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:970762605cff1ca0d3f71ed4f3a769ea8f85fc8e6348f6e110b8fea7e6eb5a14", size = 9888127, upload-time = "2026-03-31T06:46:36.253Z" },
+    { url = "https://files.pythonhosted.org/packages/61/a9/16ea9346e1fc4a96e2896242d9bc674764fb9049b0044c0132502f7a771e/pandas-3.0.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aff4e6f4d722e0652707d7bcb190c445fe58428500c6d16005b02401764b1b3d", size = 10399577, upload-time = "2026-03-31T06:46:39.224Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/a8/3a61a721472959ab0ce865ef05d10b0d6bfe27ce8801c99f33d4fa996e65/pandas-3.0.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef8b27695c3d3dc78403c9a7d5e59a62d5464a7e1123b4e0042763f7104dc74f", size = 10880030, upload-time = "2026-03-31T06:46:42.412Z" },
+    { url = "https://files.pythonhosted.org/packages/da/65/7225c0ea4d6ce9cb2160a7fb7f39804871049f016e74782e5dade4d14109/pandas-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f8d68083e49e16b84734eb1a4dcae4259a75c90fb6e2251ab9a00b61120c06ab", size = 11409468, upload-time = "2026-03-31T06:46:45.2Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/5b/46e7c76032639f2132359b5cf4c785dd8cf9aea5ea64699eac752f02b9db/pandas-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:32cc41f310ebd4a296d93515fcac312216adfedb1894e879303987b8f1e2b97d", size = 11936381, upload-time = "2026-03-31T06:46:48.293Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/8b/721a9cff6fa6a91b162eb51019c6243b82b3226c71bb6c8ef4a9bd65cbc6/pandas-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:a4785e1d6547d8427c5208b748ae2efb64659a21bd82bf440d4262d02bfa02a4", size = 9744993, upload-time = "2026-03-31T06:46:51.488Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/18/7f0bd34ae27b28159aa80f2a6799f47fda34f7fb938a76e20c7b7fe3b200/pandas-3.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:08504503f7101300107ecdc8df73658e4347586db5cfdadabc1592e9d7e7a0fd", size = 9056118, upload-time = "2026-03-31T06:46:54.548Z" },
     { url = "https://files.pythonhosted.org/packages/bf/ca/3e639a1ea6fcd0617ca4e8ca45f62a74de33a56ae6cd552735470b22c8d3/pandas-3.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b5918ba197c951dec132b0c5929a00c0bf05d5942f590d3c10a807f6e15a57d3", size = 10321105, upload-time = "2026-03-31T06:46:57.327Z" },
     { url = "https://files.pythonhosted.org/packages/0b/77/dbc82ff2fb0e63c6564356682bf201edff0ba16c98630d21a1fb312a8182/pandas-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d606a041c89c0a474a4702d532ab7e73a14fe35c8d427b972a625c8e46373668", size = 9864088, upload-time = "2026-03-31T06:46:59.935Z" },
     { url = "https://files.pythonhosted.org/packages/5c/2b/341f1b04bbca2e17e13cd3f08c215b70ef2c60c5356ef1e8c6857449edc7/pandas-3.0.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:710246ba0616e86891b58ab95f2495143bb2bc83ab6b06747c74216f583a6ac9", size = 10369066, upload-time = "2026-03-31T06:47:02.792Z" },
@@ -270,6 +293,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
 ]
 
+[[package]]
+name = "polars"
+version = "1.40.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "polars-runtime-32" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b3/8c/bc9bc948058348ed43117cecc3007cd608f395915dae8a00974579a5dab1/polars-1.40.1.tar.gz", hash = "sha256:ab2694134b137596b5a59bfd7b4c54ebbc9b59f9403127f18e32d363777552e8", size = 733574, upload-time = "2026-04-22T19:15:55.507Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ea/91/74fc60d94488685a92ac9d49d7ec55f3e91fe9b77942a6235a5fa7f249c3/polars-1.40.1-py3-none-any.whl", hash = "sha256:c0f861219d1319cdea45c4ce4d30355a47176b8f98dcedf95ea8269f131b8abd", size = 828723, upload-time = "2026-04-22T19:14:25.452Z" },
+]
+
+[[package]]
+name = "polars-runtime-32"
+version = "1.40.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/ba/26d40f039be9f552b5fd7365a621bdfc0f8e912ef77094ae4693491b0bae/polars_runtime_32-1.40.1.tar.gz", hash = "sha256:37f3065615d1bf90d03b5326222df4c5c1f8a5d33e50470aa588e3465e6eb814", size = 2935843, upload-time = "2026-04-22T19:15:57.26Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7d/46/22c8af5eed68ac2eeb556e0fa3ca8a7b798e984ceff4450888f3b5ac61fd/polars_runtime_32-1.40.1-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b748ef652270cc49e9e69f99a035e0eb4d5f856d42bcd6ac4d9d80a40142aa1e", size = 52098755, upload-time = "2026-04-22T19:14:28.555Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/3e/48599a38009ca60ff82a6f38c8a621ce3c0286aa7397c7d79e741bd9060e/polars_runtime_32-1.40.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d249b3743e05986060cec0a7aaa542d020df6c6b876e556023a310efd581f9be", size = 46367542, upload-time = "2026-04-22T19:14:32.433Z" },
+    { url = "https://files.pythonhosted.org/packages/43/e9/384bc069367a1a36ee31c13782c178dbd039b2b873b772d4a0fc23a2373d/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5987b30e7aa1059d069498496e8dda35afd592b0ac3d46ed87e3ff8df1ad652c", size = 50252104, upload-time = "2026-04-22T19:14:35.945Z" },
+    { url = "https://files.pythonhosted.org/packages/15/ef/7d57ceb0651af74194e97ed6583e148d352f03d696090221b8059cdfc90b/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d7f42a8b3f16fc66002cc0f6516f7dd7653396886ae0ed362ab95c0b3408b59", size = 56250788, upload-time = "2026-04-22T19:14:39.743Z" },
+    { url = "https://files.pythonhosted.org/packages/10/0f/e4b3ffc748827a14a474ec9c42e45c066050e440fec57e914091d9adda75/polars_runtime_32-1.40.1-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e5f7becc237a7ec9d9a10878dc8e54b73bbf4e2d94a2991c37d7a0b38590d8f9", size = 50432590, upload-time = "2026-04-22T19:14:43.388Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/0b/b8d95fbed869fa4caabe9c400e4210374913b376e925e96fdcfa9be6416b/polars_runtime_32-1.40.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:992d14cf191dde043d36fbdbc98a65e43fbc7e9a5024cecd45f838ac4988c1ee", size = 54155564, upload-time = "2026-04-22T19:14:47.239Z" },
+    { url = "https://files.pythonhosted.org/packages/06/d9/d091d8fb5cbed5e9536adfed955c4c89987a4cc3b8e73ae4532402b91c74/polars_runtime_32-1.40.1-cp310-abi3-win_amd64.whl", hash = "sha256:f78bb2abd00101cbb23cc0cb068f7e36e081057a15d2ec2dde3dda280709f030", size = 51829755, upload-time = "2026-04-22T19:14:50.85Z" },
+    { url = "https://files.pythonhosted.org/packages/65/ad/b33c3022a394f3eb55c3310597cec615412a8a33880055eee191d154a628/polars_runtime_32-1.40.1-cp310-abi3-win_arm64.whl", hash = "sha256:b5cbfaf6b085b420b4bfcbe24e8f665076d1cccfdb80c0484c02a023ce205537", size = 45822104, upload-time = "2026-04-22T19:14:54.192Z" },
+]
+
 [[package]]
 name = "prompt-toolkit"
 version = "3.0.52"

From 6876bad17575c789086f9c5fb29d842a1a814b34 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 19 May 2026 09:48:17 +0000
Subject: [PATCH 06/22] initial rewrite of sort-dedup from WDL to Polars

---
 src/kanta/config.py          |  0
 src/kanta/intake/assemble.py |  3 --
 src/kanta/intake/tidy.py     | 88 ++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 3 deletions(-)
 delete mode 100644 src/kanta/config.py
 create mode 100644 src/kanta/intake/tidy.py

diff --git a/src/kanta/config.py b/src/kanta/config.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py
index cbaac33..38a096e 100644
--- a/src/kanta/intake/assemble.py
+++ b/src/kanta/intake/assemble.py
@@ -10,9 +10,6 @@
 import polars as pl
 
 
-# TODO
-# 5. Post WDL sort-dup: subset columns, join SEX, sort, output unique/duplicates/error rows
-
 EXPECTED_COLUMNS_MAIN = [
     "FINNGENID",
     "EVENT_AGE",
diff --git a/src/kanta/intake/tidy.py b/src/kanta/intake/tidy.py
new file mode 100644
index 0000000..488c0f3
--- /dev/null
+++ b/src/kanta/intake/tidy.py
@@ -0,0 +1,88 @@
+# TODO: in assemble: get rid of main. / freetext. prefixes for columns that are in common, since we check they have the same values (right?), then change it here.
+from argparse import ArgumentParser
+from pathlib import Path
+
+import polars as pl
+
+
+COLUMNS_OUTPUT = [
+    "main.FINNGENID",
+    "main.EVENT_AGE",
+    "main.tutkimuskoodistonjarjestelma",
+    "main.paikallinentutkimusnimike_selite",
+    "main.tutkimustulosarvo",
+    "main.tutkimustulosyksikko",
+    "main.tutkimusvastauksentila",
+    "main.tuloksenpoikkeavuus",
+    "main.viitearvoryhma",
+    "main.viitevalialkuarvo",
+    "main.viitevalialkuyksikko",
+    "main.viitevaliloppuarvo",
+    "main.viitevaliloppuyksikko",
+    "freetext.tutkimustulosteksti",
+    "main.paikallinentutkimusnimike_koodi",
+    "main.laboratoriotutkimusnimike",
+    "main.APPROX_EVENT_DAY",
+    "main.TIME",
+    "main._rowid",
+]
+
+COLUMNS_UNIQUENESS_SORT = [
+    "main.FINNGENID",
+    "main.APPROX_EVENT_DAY",
+    "main.TIME",
+    "main.laboratoriotutkimusnimike",
+    "main.paikallinentutkimusnimike_koodi",
+    "main.tutkimusvastauksentila",
+    "main.tutkimustulosarvo",
+    "main.tutkimustulosyksikko",
+]
+
+
+def main(args):
+    df_pheno = pl.scan_csv(
+        args.phenotype_file,
+        infer_schema=False,
+        separator="\t",
+    ).select("FINNGENID", "SEX")
+
+    (
+        pl.scan_parquet(args.assembled_file)
+        .select(COLUMNS_OUTPUT)
+        # Dedup rows
+        # NOTE(Vincent 2026-05-20) Here the deduplication is done on whole data,
+        # not just on adjacent lines as was done in the previous implementation.
+        .unique(subset=COLUMNS_UNIQUENESS_SORT)
+        # Sort
+        .sort(by=COLUMNS_UNIQUENESS_SORT)
+        # join SEX
+        .join(df_pheno, left_on="main.FINNGENID", right_on="FINNGENID", how="left")
+        .sink_parquet(args.output_file)
+    )
+
+    # TODO validation
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--assembled-file",
+        help="Path to assembled file from the intake.assemble step (Parquet)",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--phenotype-file",
+        help="Path to phenotype file with SEX column (.txt.gz)",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--output-file",
+        help="Path to write the tidied up output file (Parquet)",
+        required=True,
+        type=Path,
+    )
+    args = parser.parse_args()
+
+    main(args)

From 998f8b35cd7db5cb57e81d90811a802dbc0e9df0 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Thu, 21 May 2026 05:09:06 +0000
Subject: [PATCH 07/22] use bucket partitioning to implement sort+dedup

---
 src/kanta/intake/tidy.py   |  88 -------------------
 src/kanta/intake/tidyup.py | 173 +++++++++++++++++++++++++++++++++++++
 2 files changed, 173 insertions(+), 88 deletions(-)
 delete mode 100644 src/kanta/intake/tidy.py
 create mode 100644 src/kanta/intake/tidyup.py

diff --git a/src/kanta/intake/tidy.py b/src/kanta/intake/tidy.py
deleted file mode 100644
index 488c0f3..0000000
--- a/src/kanta/intake/tidy.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# TODO: in assemble: get rid of main. / freetext. prefixes for columns that are in common, since we check they have the same values (right?), then change it here.
-from argparse import ArgumentParser
-from pathlib import Path
-
-import polars as pl
-
-
-COLUMNS_OUTPUT = [
-    "main.FINNGENID",
-    "main.EVENT_AGE",
-    "main.tutkimuskoodistonjarjestelma",
-    "main.paikallinentutkimusnimike_selite",
-    "main.tutkimustulosarvo",
-    "main.tutkimustulosyksikko",
-    "main.tutkimusvastauksentila",
-    "main.tuloksenpoikkeavuus",
-    "main.viitearvoryhma",
-    "main.viitevalialkuarvo",
-    "main.viitevalialkuyksikko",
-    "main.viitevaliloppuarvo",
-    "main.viitevaliloppuyksikko",
-    "freetext.tutkimustulosteksti",
-    "main.paikallinentutkimusnimike_koodi",
-    "main.laboratoriotutkimusnimike",
-    "main.APPROX_EVENT_DAY",
-    "main.TIME",
-    "main._rowid",
-]
-
-COLUMNS_UNIQUENESS_SORT = [
-    "main.FINNGENID",
-    "main.APPROX_EVENT_DAY",
-    "main.TIME",
-    "main.laboratoriotutkimusnimike",
-    "main.paikallinentutkimusnimike_koodi",
-    "main.tutkimusvastauksentila",
-    "main.tutkimustulosarvo",
-    "main.tutkimustulosyksikko",
-]
-
-
-def main(args):
-    df_pheno = pl.scan_csv(
-        args.phenotype_file,
-        infer_schema=False,
-        separator="\t",
-    ).select("FINNGENID", "SEX")
-
-    (
-        pl.scan_parquet(args.assembled_file)
-        .select(COLUMNS_OUTPUT)
-        # Dedup rows
-        # NOTE(Vincent 2026-05-20) Here the deduplication is done on whole data,
-        # not just on adjacent lines as was done in the previous implementation.
-        .unique(subset=COLUMNS_UNIQUENESS_SORT)
-        # Sort
-        .sort(by=COLUMNS_UNIQUENESS_SORT)
-        # join SEX
-        .join(df_pheno, left_on="main.FINNGENID", right_on="FINNGENID", how="left")
-        .sink_parquet(args.output_file)
-    )
-
-    # TODO validation
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser()
-    parser.add_argument(
-        "--assembled-file",
-        help="Path to assembled file from the intake.assemble step (Parquet)",
-        required=True,
-        type=Path,
-    )
-    parser.add_argument(
-        "--phenotype-file",
-        help="Path to phenotype file with SEX column (.txt.gz)",
-        required=True,
-        type=Path,
-    )
-    parser.add_argument(
-        "--output-file",
-        help="Path to write the tidied up output file (Parquet)",
-        required=True,
-        type=Path,
-    )
-    args = parser.parse_args()
-
-    main(args)
diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
new file mode 100644
index 0000000..5f9ddbf
--- /dev/null
+++ b/src/kanta/intake/tidyup.py
@@ -0,0 +1,173 @@
+import tempfile
+from argparse import ArgumentParser
+from pathlib import Path
+
+import polars as pl
+
+
+COLUMNS_UNIQUENESS_SORT = [
+    "FINNGENID",
+    "APPROX_EVENT_DAY",
+    "TIME",
+    "laboratoriotutkimusnimike",
+    "paikallinentutkimusnimike_koodi",
+    "tutkimusvastauksentila",
+    "tutkimustulosarvo",
+    "tutkimustulosyksikko",
+]
+
+
+def main(args):
+    temp_dir = Path(tempfile.mkdtemp())
+    print(f">> {temp_dir=}")
+
+    temp_dir_partition = temp_dir / "partition"
+    temp_dir_partition.mkdir()
+
+    temp_dir_tidyup = temp_dir / "tidyup"
+    temp_dir_tidyup.mkdir()
+
+    print("# Consolidate")
+    consolidated_file = consolidate_columns(args.assembled_file, args.output_dir)
+
+    print("# Partition")
+    partition(consolidated_file, temp_dir_partition, args.partition_n_buckets)
+
+    print("# Tidy-up")
+    for bucket_file in temp_dir_partition.glob("bucket_id__*.parquet"):
+        (
+            pl.scan_parquet(bucket_file)
+            .pipe(tidy_up)
+            .sink_parquet(temp_dir_tidyup / bucket_file.name)
+        )
+
+    df_pheno = pl.scan_csv(
+        args.phenotype_file,
+        infer_schema=False,
+        separator="\t",
+    ).select("FINNGENID", "SEX")
+
+    print("# Concatenate + Unique + SEX join")
+    bucket_files = []
+    for bucket_id in range(args.partition_n_buckets):
+        bucket_files.append(temp_dir_tidyup / f"bucket_id__{bucket_id}.parquet")
+
+    (
+        # TODO: verify the file order of `bucket_files` is kept
+        pl.scan_parquet(bucket_files)
+        # Join SEX
+        .join(
+            df_pheno,
+            left_on="FINNGENID",
+            right_on="FINNGENID",
+            how="left",
+            maintain_order="left",
+        )
+        .sink_parquet("/tmp/out.parquet")
+    )
+
+    # TODO validation
+    #
+    print("<< end")
+
+    # TODO: keep or delete intermediate files in temp_dir based on CLI flag
+    # with shutil.rmtree
+
+
+def consolidate_columns(assembled_file: Path, output_dir: Path) -> Path:
+    """Remove unecessary columns form the assembled file and rename the ones we will keep."""
+    output_file = output_dir / "consolidated.parquet"
+
+    columns = {
+        "main.FINNGENID": "FINNGENID",
+        "main.EVENT_AGE": "EVENT_AGE",
+        "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma",
+        "main.paikallinentutkimusnimike_selite": "paikallinentutkimusnimike_selite",
+        "main.tutkimustulosarvo": "tutkimustulosarvo",
+        "main.tutkimustulosyksikko": "tutkimustulosyksikko",
+        "main.tutkimusvastauksentila": "tutkimusvastauksentila",
+        "main.tuloksenpoikkeavuus": "tuloksenpoikkeavuus",
+        "main.viitearvoryhma": "viitearvoryhma",
+        "main.viitevalialkuarvo": "viitevalialkuarvo",
+        "main.viitevalialkuyksikko": "viitevalialkuyksikko",
+        "main.viitevaliloppuarvo": "viitevaliloppuarvo",
+        "main.viitevaliloppuyksikko": "viitevaliloppuyksikko",
+        "freetext.tutkimustulosteksti": "tutkimustulosteksti",
+        "main.paikallinentutkimusnimike_koodi": "paikallinentutkimusnimike_koodi",
+        "main.laboratoriotutkimusnimike": "laboratoriotutkimusnimike",
+        "main.APPROX_EVENT_DAY": "APPROX_EVENT_DAY",
+        "main.TIME": "TIME",
+    }
+
+    (
+        pl.scan_parquet(assembled_file)
+        .with_columns(
+            (
+                pl.col("main._rowid").cast(pl.String)
+                + "@"
+                + pl.col("main._filename")
+                + "|"
+                + pl.col("freetext._rowid").cast(pl.String)
+                + "@"
+                + pl.col("freetext._filename")
+            ).alias("_rowid")
+        )
+        .select(pl.col(list(columns.keys()) + ["_rowid"]))
+        .rename(columns)
+        .sink_parquet(output_file)
+    )
+
+    return output_file
+
+
+def partition(assembled_file: Path, temp_dir: Path, n_buckets):
+    for bucket_id in range(n_buckets):
+        (
+            pl.scan_parquet(assembled_file)
+            .filter(pl.col("FINNGENID").hash() % n_buckets == bucket_id)
+            .sink_parquet(temp_dir / f"bucket_id__{bucket_id}.parquet")
+        )
+
+
+def tidy_up(frame: pl.LazyFrame | pl.DataFrame):
+    """Sort by COLUMNS_UNIQUENESS_SORT, then keep the first row per distinct key, in sorted order."""
+    return (
+        frame.sort(by=COLUMNS_UNIQUENESS_SORT)
+        # NOTE(Vincent 2026-05-20) The previous implementation (WDL/Python) deduped
+        # adjacent lines only. Polars `unique` dedups over the full data instead,
+        # though the result should be the same. `maintain_order=True` is needed
+        # because `unique` does not guarantee the preceding sort order by default.
+        .unique(subset=COLUMNS_UNIQUENESS_SORT, keep="first", maintain_order=True)
+    )
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--assembled-file",
+        help="Path to assembled file from the intake.assemble step (Parquet)",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--phenotype-file",
+        help="Path to phenotype file with SEX column (.txt.gz)",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--partition-n-buckets",
+        help="How many buckets to partition the data into to spread the sort+unique computations.",
+        required=False,
+        type=int,
+        default=32
+    )
+    parser.add_argument(
+        "--output-dir",
+        help="Path to write the output files",
+        required=True,
+        type=Path,
+    )
+    args = parser.parse_args()
+
+    main(args)

From 55a35095a37146f0220989deeb9256a9ecac976e Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Mon, 25 May 2026 11:30:58 +0000
Subject: [PATCH 08/22] add parameter to keep intermediate files

---
 src/kanta/intake/tidyup.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index 5f9ddbf..d7c38e2 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -1,4 +1,5 @@
 import tempfile
+import shutil
 from argparse import ArgumentParser
 from pathlib import Path
 
@@ -68,10 +69,11 @@ def main(args):
 
     # TODO validation
     #
-    print("<< end")
 
-    # TODO: keep or delete intermediate files in temp_dir based on CLI flag
-    # with shutil.rmtree
+    if not args.keep_intermediate_files:
+        shutil.rmtree(temp_dir)
+    
+    print("<< end")
 
 
 def consolidate_columns(assembled_file: Path, output_dir: Path) -> Path:
@@ -155,6 +157,12 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame):
         required=True,
         type=Path,
     )
+    parser.add_argument(
+        "--output-dir",
+        help="Path to write the output files",
+        required=True,
+        type=Path,
+    )
     parser.add_argument(
         "--partition-n-buckets",
         help="How many buckets to partition the data into to spread the sort+unique computations.",
@@ -163,10 +171,9 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame):
         default=32
     )
     parser.add_argument(
-        "--output-dir",
-        help="Path to write the output files",
-        required=True,
-        type=Path,
+        "--keep-intermediate-files",
+        help="Keep intermediate files, useful for debugging.",
+        action="store_true",
     )
     args = parser.parse_args()
 

From b30983121558ec055052112b282e1696286a8f65 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Mon, 25 May 2026 12:32:32 +0000
Subject: [PATCH 09/22] fix output paths

---
 src/kanta/intake/tidyup.py | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index d7c38e2..cf3bec4 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -1,6 +1,13 @@
+"""
+Differences from the WDL implementation:
+- no logging of duplicates/err lines
+- outputs to a single parquet file, no .txt.gz, as this is very slow.
+"""
+
 import tempfile
 import shutil
 from argparse import ArgumentParser
+from datetime import date
 from pathlib import Path
 
 import polars as pl
@@ -19,9 +26,17 @@
 
 
 def main(args):
+    # Set up output file and temporary directory for intermediate files
+    today = date.today()
+    output_file_stem = (
+        args.output_dir / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}"
+    )
+
     temp_dir = Path(tempfile.mkdtemp())
     print(f">> {temp_dir=}")
 
+    temp_file_consolidate = temp_dir / "consolidated.parquet"
+
     temp_dir_partition = temp_dir / "partition"
     temp_dir_partition.mkdir()
 
@@ -29,7 +44,7 @@ def main(args):
     temp_dir_tidyup.mkdir()
 
     print("# Consolidate")
-    consolidated_file = consolidate_columns(args.assembled_file, args.output_dir)
+    consolidated_file = consolidate_columns(args.assembled_file, temp_file_consolidate)
 
     print("# Partition")
     partition(consolidated_file, temp_dir_partition, args.partition_n_buckets)
@@ -54,7 +69,6 @@ def main(args):
         bucket_files.append(temp_dir_tidyup / f"bucket_id__{bucket_id}.parquet")
 
     (
-        # TODO: verify the file order of `bucket_files` is kept
         pl.scan_parquet(bucket_files)
         # Join SEX
         .join(
@@ -64,22 +78,17 @@ def main(args):
             how="left",
             maintain_order="left",
         )
-        .sink_parquet("/tmp/out.parquet")
+        .sink_parquet(output_file_stem.with_suffix(".parquet"))
     )
 
-    # TODO validation
-    #
-
     if not args.keep_intermediate_files:
         shutil.rmtree(temp_dir)
-    
+
     print("<< end")
 
 
-def consolidate_columns(assembled_file: Path, output_dir: Path) -> Path:
+def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
     """Remove unecessary columns form the assembled file and rename the ones we will keep."""
-    output_file = output_dir / "consolidated.parquet"
-
     columns = {
         "main.FINNGENID": "FINNGENID",
         "main.EVENT_AGE": "EVENT_AGE",
@@ -168,7 +177,7 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame):
         help="How many buckets to partition the data into to spread the sort+unique computations.",
         required=False,
         type=int,
-        default=32
+        default=32,
     )
     parser.add_argument(
         "--keep-intermediate-files",

From ccd59451461baa82593d80acf268b62e94a6e2a3 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Mon, 25 May 2026 13:15:53 +0000
Subject: [PATCH 10/22] use output order of tidyup as _rowid

---
 src/kanta/intake/tidyup.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index cf3bec4..cfd1185 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -2,6 +2,8 @@
 Differences from the WDL implementation:
 - no logging of duplicates/err lines
 - outputs to a single parquet file, no .txt.gz, as this is very slow.
+- uses CSV-aware parsing, robust to edge cases like new-line character inside
+  CSV values.
 """
 
 import tempfile
@@ -78,7 +80,9 @@ def main(args):
             how="left",
             maintain_order="left",
         )
-        .sink_parquet(output_file_stem.with_suffix(".parquet"))
+        .with_row_index(name="_rowid", offset=1)
+        .drop("_rowid_consolidate_debug")
+        .sink_parquet(output_file_stem.with_name(output_file_stem.name +  ".parquet"))
     )
 
     if not args.keep_intermediate_files:
@@ -121,9 +125,9 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
                 + pl.col("freetext._rowid").cast(pl.String)
                 + "@"
                 + pl.col("freetext._filename")
-            ).alias("_rowid")
+            ).alias("_rowid_consolidate_debug")
         )
-        .select(pl.col(list(columns.keys()) + ["_rowid"]))
+        .select(pl.col(list(columns.keys()) + ["_rowid_consolidate_debug"]))
         .rename(columns)
         .sink_parquet(output_file)
     )

From 6d5bb82a72178832ec77dcf047ed220e6942592f Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 26 May 2026 10:31:44 +0000
Subject: [PATCH 11/22] add notes from benchmarks & adapt default N buckets

---
 src/kanta/intake/tidyup.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index cfd1185..3e512fb 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -1,9 +1,24 @@
 """
-Differences from the WDL implementation:
-- no logging of duplicates/err lines
-- outputs to a single parquet file, no .txt.gz, as this is very slow.
-- uses CSV-aware parsing, robust to edge cases like new-line character inside
+Differences from the WDL implementation
+=======================================
+- No logging of duplicates/err lines.
+- Outputs to a single parquet file, no .txt.gz, as this is very slow.
+- Uses CSV-aware parsing, robust to edge cases like new-line character inside
   CSV values.
+  
+
+VM choice and performance
+=========================
+Best config: 32 CPUs / 32 GB RAM and use 24 buckets. Runs in 2-3 min.
+
+For lower specs, run with 16 or 8 CPUs and allocate 2 GB RAM per CPU, use 24
+buckets. Runs in 5-8 min.
+
+Lowest tested working spec: 8 CPUs / 8 GB RAM, 32 buckets. Runs in 6-12 min.
+
+If failing due to OOM in the sort+dedup stage, try increasing the bucket count.
+
+The GCP VM type appears to matter. N2D is about 2x faster than E2.
 """
 
 import tempfile
@@ -178,10 +193,10 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame):
     )
     parser.add_argument(
         "--partition-n-buckets",
-        help="How many buckets to partition the data into to spread the sort+unique computations.",
+        help="How many buckets to partition the data into to spread the sort+dedup computations.",
         required=False,
         type=int,
-        default=32,
+        default=24,
     )
     parser.add_argument(
         "--keep-intermediate-files",

From 2e18c2c54beb5d6aaad7e2a37e9e72dd2a85cc79 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 26 May 2026 10:55:48 +0000
Subject: [PATCH 12/22] rename tidy-up step to sort + dedup

---
 src/kanta/intake/tidyup.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index 3e512fb..db2e7b8 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -57,8 +57,8 @@ def main(args):
     temp_dir_partition = temp_dir / "partition"
     temp_dir_partition.mkdir()
 
-    temp_dir_tidyup = temp_dir / "tidyup"
-    temp_dir_tidyup.mkdir()
+    temp_dir_sort_dedup = temp_dir / "sort_dedup"
+    temp_dir_sort_dedup.mkdir()
 
     print("# Consolidate")
     consolidated_file = consolidate_columns(args.assembled_file, temp_file_consolidate)
@@ -66,12 +66,12 @@ def main(args):
     print("# Partition")
     partition(consolidated_file, temp_dir_partition, args.partition_n_buckets)
 
-    print("# Tidy-up")
+    print("# Sort + Dedup")
     for bucket_file in temp_dir_partition.glob("bucket_id__*.parquet"):
         (
             pl.scan_parquet(bucket_file)
-            .pipe(tidy_up)
-            .sink_parquet(temp_dir_tidyup / bucket_file.name)
+            .pipe(sort_dedup)
+            .sink_parquet(temp_dir_sort_dedup / bucket_file.name)
         )
 
     df_pheno = pl.scan_csv(
@@ -83,7 +83,7 @@ def main(args):
     print("# Concatenate + Unique + SEX join")
     bucket_files = []
     for bucket_id in range(args.partition_n_buckets):
-        bucket_files.append(temp_dir_tidyup / f"bucket_id__{bucket_id}.parquet")
+        bucket_files.append(temp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet")
 
     (
         pl.scan_parquet(bucket_files)
@@ -159,7 +159,7 @@ def partition(assembled_file: Path, temp_dir: Path, n_buckets):
         )
 
 
-def tidy_up(frame: pl.LazyFrame | pl.DataFrame):
+def sort_dedup(frame: pl.LazyFrame | pl.DataFrame):
     return (
         frame.sort(by=COLUMNS_UNIQUENESS_SORT)
         # Dedup rows

From 955f3ee6d6589b62acb80ce4fbb44842ab77ed2e Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 26 May 2026 10:56:13 +0000
Subject: [PATCH 13/22] improve info message of intake.tidyup

---
 src/kanta/intake/tidyup.py | 87 +++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 39 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index db2e7b8..b828610 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -5,7 +5,7 @@
 - Outputs to a single parquet file, no .txt.gz, as this is very slow.
 - Uses CSV-aware parsing, robust to edge cases like new-line character inside
   CSV values.
-  
+
 
 VM choice and performance
 =========================
@@ -45,12 +45,18 @@
 def main(args):
     # Set up output file and temporary directory for intermediate files
     today = date.today()
-    output_file_stem = (
-        args.output_dir / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}"
+    output_file = (
+        args.output_dir
+        / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}.parquet"
     )
 
     temp_dir = Path(tempfile.mkdtemp())
-    print(f">> {temp_dir=}")
+
+    print("# Run info")
+    print(f"- Partition into N buckets: {args.partition_n_buckets}")
+    print(f"- Directory for intermediate files: {temp_dir}")
+    print(f"- Output directory: {args.output_dir}")
+    print()
 
     temp_file_consolidate = temp_dir / "consolidated.parquet"
 
@@ -97,13 +103,48 @@ def main(args):
         )
         .with_row_index(name="_rowid", offset=1)
         .drop("_rowid_consolidate_debug")
-        .sink_parquet(output_file_stem.with_name(output_file_stem.name +  ".parquet"))
+        .sink_parquet(output_file)
     )
 
     if not args.keep_intermediate_files:
         shutil.rmtree(temp_dir)
 
-    print("<< end")
+
+def init_cli():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--assembled-file",
+        help="Path to assembled file from the intake.assemble step (Parquet)",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--phenotype-file",
+        help="Path to phenotype file with SEX column (.txt.gz)",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--output-dir",
+        help="Path to write the output files",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--partition-n-buckets",
+        help="How many buckets to partition the data into to spread the sort+dedup computations.",
+        required=False,
+        type=int,
+        default=24,
+    )
+    parser.add_argument(
+        "--keep-intermediate-files",
+        help="Keep intermediate files, useful for debugging.",
+        action="store_true",
+    )
+    args = parser.parse_args()
+
+    return args
 
 
 def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
@@ -172,37 +213,5 @@ def sort_dedup(frame: pl.LazyFrame | pl.DataFrame):
 
 
 if __name__ == "__main__":
-    parser = ArgumentParser()
-    parser.add_argument(
-        "--assembled-file",
-        help="Path to assembled file from the intake.assemble step (Parquet)",
-        required=True,
-        type=Path,
-    )
-    parser.add_argument(
-        "--phenotype-file",
-        help="Path to phenotype file with SEX column (.txt.gz)",
-        required=True,
-        type=Path,
-    )
-    parser.add_argument(
-        "--output-dir",
-        help="Path to write the output files",
-        required=True,
-        type=Path,
-    )
-    parser.add_argument(
-        "--partition-n-buckets",
-        help="How many buckets to partition the data into to spread the sort+dedup computations.",
-        required=False,
-        type=int,
-        default=24,
-    )
-    parser.add_argument(
-        "--keep-intermediate-files",
-        help="Keep intermediate files, useful for debugging.",
-        action="store_true",
-    )
-    args = parser.parse_args()
-
+    args = init_cli()
     main(args)

From 59dc347a38cb654b691c106b02c9e35fdccec70b Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 26 May 2026 12:13:17 +0000
Subject: [PATCH 14/22] add documentation for intake assemble and tidy-up

---
 src/kanta/intake/assemble.py |   6 +
 src/kanta/intake/tidyup.py   |   6 +-
 wdl/pre-merge.json           |   7 -
 wdl/pre-merge.wdl            | 114 ----------------
 wdl/sort_dup.json            |   8 --
 wdl/sort_dup.wdl             | 249 -----------------------------------
 6 files changed, 10 insertions(+), 380 deletions(-)
 delete mode 100644 wdl/pre-merge.json
 delete mode 100644 wdl/pre-merge.wdl
 delete mode 100644 wdl/sort_dup.json
 delete mode 100644 wdl/sort_dup.wdl

diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py
index 38a096e..3b1f861 100644
--- a/src/kanta/intake/assemble.py
+++ b/src/kanta/intake/assemble.py
@@ -1,5 +1,11 @@
 """
 Merges the incoming Kanta Lab data from THL into one coherent file.
+
+
+Differences from the WDL implementation
+=======================================
+- Uses CSV-aware parsing, robust to edge cases like new-line character inside
+  CSV values.
 """
 
 import gzip
diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index b828610..5c2343d 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -1,10 +1,12 @@
 """
+Tidy up the raw data into a subset of the necessary columns, and apply sorting
+and deduplication.
+
+
 Differences from the WDL implementation
 =======================================
 - No logging of duplicates/err lines.
 - Outputs to a single parquet file, no .txt.gz, as this is very slow.
-- Uses CSV-aware parsing, robust to edge cases like new-line character inside
-  CSV values.
 
 
 VM choice and performance
diff --git a/wdl/pre-merge.json b/wdl/pre-merge.json
deleted file mode 100644
index 6ea88a5..0000000
--- a/wdl/pre-merge.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-    "pre_merge.test":false,
-    "pre_merge.kanta_list": "gs://fg-3/kanta_v3/inputs/kanta_file_list.txt",
-    "pre_merge.prefix": "finngen_R14_kanta_laboratory_responses_internal_VERSION.txt.gz",
-    "pre_merge.version": "1.0"
-    
-}
diff --git a/wdl/pre-merge.wdl b/wdl/pre-merge.wdl
deleted file mode 100644
index a161165..0000000
--- a/wdl/pre-merge.wdl
+++ /dev/null
@@ -1,114 +0,0 @@
-version 1.0
-
-workflow pre_merge {
-  input {
-    Boolean test
-    File kanta_list
-    String prefix
-    String version
-    
-  }
-  String docker = "eu.gcr.io/finngen-sandbox-v3-containers/bioinformatics:1.0.1"
-
-  # Remove quotation marks (and test if needed)
-  scatter (year_files in read_tsv(kanta_list)) {
-    # THIS STEP REMOVES QUOTATION BLOCKS
-    call process_file as process_responses {input :input_file= year_files[0],test = test,docker=docker}
-    call process_file as process_ft { input :input_file= year_files[1],test = test ,docker=docker}
-    call merge_ft {input: responses_file = process_responses.cleaned_file,ft_file = process_ft.cleaned_file,docker=docker}
-  }
-
-  call merge_files {input:rr_files = merge_ft.merged_year,out_file = sub(prefix,"VERSION",if test then version +"_test" else version),docker=docker }
-  output {
-    File merged_kanta =merge_files.merged_file
-    }
-}
-
-task merge_ft {
-  input {
-    File responses_file
-    File ft_file
-    String docker
-  }
-
-  String out_file = sub(basename(responses_file),'.txt.gz','_merged.txt.gz')
-  command <<<
-  set -euo pipefail
-  F1="~{responses_file}"
-  F2="~{ft_file}"
-  OUT="~{out_file}"
-  # 1. Get headers safely. 
-  # 'head -1' often causes 'zcat' to return exit code 141 (SIGPIPE).
-  # '|| true' ensures H1/H2 assignments don't trigger 'set -e'.
-  H1=$(zcat -f "$F1" | head -1 || true)
-  H2=$(zcat -f "$F2" | head -1 || true)
-
-# Check if we actually got headers before proceeding
-  if [[ -z "$H1" || -z "$H2" ]]; then
-      echo "Error: Could not read headers from input files." >&2
-      exit 1
-  fi
-  # Get headers and find indices for columns in F2 not in F1
-  OFF=$(echo "$H1" | tr '\t' '\n' | wc -l)
-
-  # Join files and process in one AWK pass
-  paste <(zcat -f "$F1") <(zcat -f "$F2") | awk -F'\t' -v OFS='\t' -v h1="$H1" -v h2="$H2" -v off="$OFF" '
-  BEGIN {
-    split(h1, a1); split(h2, a2)
-    for(i in a1) map[a1[i]] = i
-    for(i in a2) if(a2[i] in map) pairs[map[a2[i]]] = i + off; else new[++n] = i + off
-  }
-  {
-    for(p in pairs) if($p != $pairs[p]) { print "Err line "NR": "$p" != "$pairs[p] > "/dev/stderr"; exit 1 }
-    res = $1; for(i=2; i<=off; i++) res = res OFS $i
-    for(i=1; i<=n; i++) res = res OFS $(new[i])
-    print res
-    if(NR%50000==0) printf "\rRow %d", NR > "/dev/stderr"
-  }' | gzip > "$OUT"
-
-
-  >>>
-  runtime {
-    disks: "local-disk ~{ceil(size(responses_file,'GB')*3) + 10} HDD"
-    docker : "~{docker}"
-  }
-  output {
-    File merged_year = out_file
-  }
-}
-
-task process_file {
-  input {
-    File input_file
-    Boolean test
-    String docker
-  }
-  String base = sub(basename(input_file),'.txt.gz','_cleaned.txt.gz')
-  command <<<
-  zcat -f ~{input_file} | sed 's/\(^\|\t\)"/\1/g; s/"\(\t\|$\)/\1/g' | tr -d '\r' | awk -F'\t' '/^FG/{if(NR>1)print ""; printf "%s",$0; next} {printf " %s",$0} END{print ""}' | awk -F'\t' 'BEGIN{OFS="\t"} NR==1{cols=NF} {if(NF<cols) for(i=NF+1;i<=cols;i++) $i="NA"; NF=cols; print}' | bgzip -c > ~{base}
-  >>>
-  runtime {
-    disks: "local-disk ~{ceil(size(input_file,'GB')*3) + 10} HDD"
-    docker:"~{docker}"
-  }
-  output {File cleaned_file = base}
-   
-}
-
-task merge_files {
-  input {
-    Array[File] rr_files
-    String out_file
-    String docker
-  }
-  command <<<
-  zcat ~{rr_files[0]} | head -n1 | bgzip -c > ~{out_file}
-  while read f; do  echo $f &&  zcat $f | sed -E 1d | bgzip -c >> ~{out_file}; done < ~{write_lines(rr_files)}
-  zcat ~{out_file} | wc -l
-  >>>
-  runtime {
-    disks: "local-disk ~{ceil(size(rr_files,'GB'))*3 + 10} HDD"
-    docker:"~{docker}"
-  }
-  output { File merged_file = out_file}
-}
diff --git a/wdl/sort_dup.json b/wdl/sort_dup.json
deleted file mode 100644
index 906d162..0000000
--- a/wdl/sort_dup.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-    "kanta_sort_dup.test":false,
-    "kanta_sort_dup.kanta_data":"gs://fg-3/kanta_v3/finngen_R14_kanta_laboratory_responses_internal_1.0.txt.gz",
-    "kanta_sort_dup.sex_map.min_pheno": "gs://finngen-production-library-red/finngen_R13/phenotype_1.0/data/finngen_R13_minimum_1.0.txt.gz",
-    "kanta_sort_dup.kanta_docker": "eu.gcr.io/finngen-sandbox-v3-containers/kanta:dev",
-    "kanta_sort_dup.split.n_chunks": 32,
-   
-}
diff --git a/wdl/sort_dup.wdl b/wdl/sort_dup.wdl
deleted file mode 100644
index d5b6596..0000000
--- a/wdl/sort_dup.wdl
+++ /dev/null
@@ -1,249 +0,0 @@
-version 1.0
-
-workflow kanta_sort_dup{
-  input {
-    # works with 100k lines
-    Boolean test
-    File kanta_data
-    String kanta_docker
-  }
-  # this has python 3.6, needed in the merge step.
-  String base_docker = "eu.gcr.io/finngen-sandbox-v3-containers/kanta:v3_base"
-  call get_cols {input:docker=kanta_docker}  
-  # split input in chunks
-  # s_cols (names) --> sort_cols (indices)
-  call split {
-    input:
-    test = test,
-    kanta_data = kanta_data,
-    cols = get_cols.cols,
-    s_cols = get_cols.s_cols,
-    docker=base_docker
-  }
-
-  # builds sex dictionary mapping from pheno file
-  call sex_map {input: docker=base_docker}
-
-  # extract columns sort and extract duplicates/errs
-  scatter (i in range(length(split.chunks))) {
-    call sort {
-      input :
-      index = i,
-      chunk = split.chunks[i],
-      sort_cols = split.sort_cols,
-      sex_map = sex_map.sex_map,
-      docker=base_docker
-    }
-  }
-  # merge chunks (unique/dup/err)
-  String prefix = basename(kanta_data,'.txt.gz')
-  call merge {
-    input :
-    sorted_chunks = sort.sorted_chunk,
-    sort_cols = split.sort_cols,
-    header = split.header,
-    docker= base_docker,
-    prefix = if test then prefix+ "_test"  else prefix 
-  }
-}
-
-task merge {
-  input {
-    File header
-    Array[File] sorted_chunks
-    Array[String] sort_cols
-    String prefix
-    String docker
-  }
-
-  Int chunk_size = ceil(size(sorted_chunks,"GB"))
-  command <<<
-  # CONCAT PRE-SORTED FILES
-  echo "SORT FILES"
-  for col in ~{sep=' ' sort_cols}; do  echo "${col},${col}" >> sort_keys.tmp;    done
-  SORT_ARGS=$(cat sort_keys.tmp | xargs -I {} echo "-k {}" | tr '\n' ' ')
-  /usr/bin/time -v sort -t $'\t' -m $SORT_ARGS ~{sep=" " sorted_chunks} > sorted.txt
-  #/usr/bin/time -v sort -t $'\t' -m -k ~{sep=" -k " sort_cols}  ~{sep=" " sorted_chunks}  > sorted.txt
-  # REMOVE DUPS
-  python3 <<EOF
-  from operator import itemgetter
-  from datetime import datetime
-  import gzip
-  # get col indices            
-  cols = [elem -1 for elem in [~{sep ="," sort_cols}]]
-  # initial empty values
-  values = ['' for _ in cols]
-  date = datetime.now().strftime("%Y_%m_%d")
-  prefix = '~{prefix}' +  "_{}".format(date)
-  unique = prefix + "_unique.tsv.gz"
-  dups   = prefix + "_duplicates.tsv.gz"
-  errs   = prefix + "_err.tsv.gz"
-  print(unique)
-  with open('sorted.txt') as i,gzip.open(dups,'wt') as dup,gzip.open(unique,'wt') as out,gzip.open(errs,'wt') as err:
-      # copy header to out files
-      with open('~{header}') as tmp: head = "ROW_ID\t" + tmp.read().strip() + "\tSEX\n"
-      out.write(head),dup.write(head),err.write(head)
-      row,dup_count,count,err_count = 0,0,0,0
-      for line in i:
-          row += 1
-          # read in new sort values to compare
-          try:
-              new_values = itemgetter(*cols)(line.strip().split('\t'))
-              if new_values != values: # new value found, so update values and output to unique file
-                  values = new_values
-                  f = out
-                  count += 1
-              else:
-                  f = dup
-                  dup_count +=1
-          except:
-              f = err
-              err_count +=1
-          f.write(str(row) + '\t' + line)
-          if row % 100000 == 0: print(f"{row}\r")
-  print(count)
-  print(err_count)
-  print(dup_count)
-  print(round(dup_count/(count+dup_count),4))
-  EOF
-  >>>
-  runtime {
-    disks:   "local-disk ~{chunk_size*4+10}  HDD"
-    docker : "~{docker}"
-  }
-  output {
-    Array[File] kanta_files = glob("~{prefix}*gz")
-  }
-}
-
-task sort {
-  input {
-    File chunk
-    Array[String] sort_cols
-    Int index
-    String docker
-    File sex_map
-  }
-  String out_file = "kanta_sorted_" + index
-
-  command <<<
-  for col in ~{sep=' ' sort_cols}; do  echo "${col},${col}" >> sort_keys.tmp;    done
-  SORT_ARGS=$(cat sort_keys.tmp | xargs -I {} echo "-k {}" | tr '\n' ' ')
-  zcat ~{chunk} | sort -t $'\t' $SORT_ARGS > tmp.txt
-  #zcat ~{chunk} | sort -t $'\t'  -k ~{sep=" -k " sort_cols}  > ~{out_file}
-
-  #add sex
-  awk -F'\t' 'BEGIN {OFS="\t"}  NR==FNR {sex[$1]=$2; next} NR==1 {print $0, "SEX"; next} {print $0, (sex[$1] ? sex[$1] : "NA")}' \
-      ~{sex_map} tmp.txt > ~{out_file}
-  
-  # check file size
-  count_tmp=$(wc -l < tmp.txt)
-  count_out=$(wc -l < ~{out_file})
-  
-  # Perform the assertion
-  if [[ "$count_tmp" -ne "$count_out" ]]; then
-      echo "❌ Assertion Failed: Line counts do not match!" >&2
-      echo "tmp.txt has $count_tmp lines." >&2
-      echo "~{out_file} has $count_out lines." >&2
-      exit 1 # Exit with a non-zero status to signal an error
-  else
-      echo "✅ Assertion Passed: Both files have $count_tmp lines."
-  fi
-  >>>
-
-  runtime {
-    disks:   "local-disk ~{ceil(size(chunk,'GB'))*3 + 10} HDD"
-    docker: "~{docker}"
-  }
-
-  output {
-    File sorted_chunk = out_file
-  }
-}
-
-task get_cols {
-  input {
-    String docker
-  }
-
-  command <<<
-  # get required columns to cut from git repository
-  cp /finngen_qc/magic_config.py ./config.py
-  python3 -c "import config;o= open('./columns.txt','wt') ;o.write('\n'.join(list(config.config['rename_cols'].keys())) + '\n');o.write('\n'.join(config.config['other_cols'])+ '\n')"
-  python3 -c "import config;o= open('./sort_columns.txt','wt') ;o.write('\n'.join(config.config['sort_cols'])+ '\n')"
-  >>>
-  runtime {
-    disks:   "local-disk 10 HDD"
-    docker : "~{docker}"
-  }
-  output {
-    Array[String] cols = read_lines("columns.txt")
-    Array[String] s_cols = read_lines("sort_columns.txt")
-  }
-}
-  
-task split {
-  input {
-    Boolean test
-    File kanta_data
-    Int n_chunks
-    Array[String] cols
-    Array[String] s_cols
-    String docker
-  }
- 
-  Int disk_size = ceil(size(kanta_data,"GB"))*10*n_chunks
-  
-  command <<<
-  echo "SORT KANTA"
-  cat ~{write_lines(cols)} > columns.txt
-  cat ~{write_lines(s_cols)} > sort_columns.txt
-  COLS=$(zcat ~{kanta_data} |  head -n1 | tr '\t' '\n'  | grep -wnf columns.txt | cut -f 1 -d ':' | tr '\n' ',' | rev | cut -c2- | rev)
-  echo $COLS
-  
-  # uncompress and split new header from body
-  zcat ~{kanta_data} | cut -f $COLS | head -n1  > header.txt
-  zcat ~{kanta_data} | cut -f $COLS | sed -E 1d  ~{if test then " | head -n 10000 " else ""}> tmp.tsv
-  
-  # GET SORT COLS AND KEEP ORDER
-  echo "COLS"
-  while read f;
-  do
-      cat header.txt | head -n1 | tr '\t' '\n'|  grep -wn $f |  cut -f 1 -d ':' >> sort_cols.txt
-  done <  sort_columns.txt
-  cat sort_cols.txt
-  
-  # SPLIT INTO N FILES
-  split tmp.tsv -n l/~{n_chunks} -d kanta_chunk --filter='gzip > $FILE.gz'
-  >>>
-
-  runtime {
-    disks: "local-disk ~{disk_size} HDD"
-    docker : "~{docker}"
-  }
-
-  output {
-    Array[File] chunks = glob("./kanta_chunk*gz")
-    File header = "header.txt"
-    Array[String] sort_cols = read_lines("sort_cols.txt")
-  }
-}
-
-task sex_map {
-  input {
-    File min_pheno
-    String docker
-  }
-  String sex_file = "sex_map.txt"
-  command <<<
-  # get sex col
-  sexcol=$(awk '{for(i=1;i<=NF;i++){if($i=="SEX"){print i; exit}}}' <(zcat ~{min_pheno} | head -n1))
-  # extract sex only and sort
-  zcat ~{min_pheno} | cut -f 1,$sexcol | (sed -u 1q ; sort )>> ~{sex_file}
-  >>>
-  runtime {
-    disks: "local-disk ~{ceil(size(min_pheno,'GB')) * 3} HDD"
-    docker : "~{docker}"
-  }
-  output {File sex_map = sex_file}
-}

From f77a1bbafb56e3c71375d931c8d978113ddcc006 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Wed, 27 May 2026 07:54:44 +0000
Subject: [PATCH 15/22] add ability to run assemble + tidyup stages at once

---
 src/kanta/intake/__main__.py | 62 ++++++++++++++++++++++++++++++++++++
 src/kanta/intake/assemble.py | 33 +++++++++++--------
 src/kanta/intake/tidyup.py   | 53 +++++++++++++++++-------------
 3 files changed, 111 insertions(+), 37 deletions(-)
 create mode 100644 src/kanta/intake/__main__.py

diff --git a/src/kanta/intake/__main__.py b/src/kanta/intake/__main__.py
new file mode 100644
index 0000000..f44e2d3
--- /dev/null
+++ b/src/kanta/intake/__main__.py
@@ -0,0 +1,62 @@
+if __name__ == "__main__":
+    import tempfile
+    import os
+    from argparse import ArgumentParser
+    from pathlib import Path
+
+    from kanta.intake import assemble
+    from kanta.intake import tidyup
+
+    parser = ArgumentParser()
+
+    parser.add_argument(
+        "--source-list-file",
+        required=True,
+        type=Path,
+        help="File containing pair of paths to main & freetext data, one pair per line (TSV without header).",
+    )
+    parser.add_argument(
+        "--phenotype-file",
+        help="Path to phenotype file with FINNGENID and SEX columns (.txt.gz)",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--output-dir",
+        help="Path to write the output files",
+        required=True,
+        type=Path,
+    )
+    parser.add_argument(
+        "--partition-n-buckets",
+        help="How many buckets to partition the data into to spread the sort+dedup computations.",
+        required=False,
+        type=int,
+        default=24,
+    )
+    parser.add_argument(
+        "--debug",
+        help="Increase verbosity and keep intermediate files",
+        required=False,
+        action="store_true",
+    )
+
+    args = parser.parse_args()
+
+    # Assemble stage
+    _fd, absolute_pathname = tempfile.mkstemp()
+    tmp_file_assemble = Path(absolute_pathname)
+    post_assemble_file = assemble.main(args.source_list_file, tmp_file_assemble)
+
+    # Tidy-up stage
+    tidyup.main(
+        tmp_file_assemble,
+        args.phenotype_file,
+        args.output_dir,
+        partition_n_buckets=args.partition_n_buckets,
+        keep_intermediate_files=args.debug,
+    )
+
+    # Cleaning up
+    if not args.debug:
+        os.remove(tmp_file_assemble)
diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py
index 3b1f861..5e9a252 100644
--- a/src/kanta/intake/assemble.py
+++ b/src/kanta/intake/assemble.py
@@ -62,14 +62,25 @@
 COL_PREFIX_FREETEXT = "freetext."
 
 
-def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]:
+def main(source_list_file: Path, output_file: Path) -> Path:
+    """Assemble every main/freetext pair into `output_file` and return that path."""
+    pairs = validate_input_pairs(source_list_file)
+
+    print(">> merge_by_pair")
+    merge_by_pair(pairs, output_file)
+    print(">> check_merge_consistency")
+    print(check_merge_consistency(output_file))
+    return output_file
+
+
+def validate_input_pairs(source_list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]:
     pairs = []
-    with open(list_file) as fp:
+    with open(source_list_file) as fp:
         for line in fp:
             values = line.split(separator, maxsplit=2)
 
-            main = validate_tsv_gz(values[0], list_file.parent)
-            freetext = validate_tsv_gz(values[1], list_file.parent)
+            main = validate_tsv_gz(values[0], source_list_file.parent)
+            freetext = validate_tsv_gz(values[1], source_list_file.parent)
 
             pairs.append((main, freetext))
 
@@ -209,24 +220,18 @@ def get_columns(input_path: Path) -> list[str]:
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument(
-        "--list-file",
+        "--source-list-file",
         required=True,
         type=Path,
         help="File containing pair of paths to main & freetext data, one pair per line (TSV without header).",
     )
     parser.add_argument(
-        "--post-merge-file",
+        "--output-file",
         required=True,
         type=Path,
-        help="Path to intermediary output file from the merge stage",
+        help="Path to output the intermediary file from this stage.",
     )
 
     args = parser.parse_args()
 
-    pairs = validate_input_pairs(args.list_file)
-
-    print(">> merge_by_pair")
-    merge_by_pair(pairs, args.post_merge_file)
-
-    print(">> check_merge_consistency")
-    print(check_merge_consistency(args.post_merge_file))
+    main(args.source_list_file, args.output_file)
diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index 5c2343d..fdf5e6f 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -44,54 +44,61 @@
 ]
 
 
-def main(args):
+def main(
+    assembled_file: Path,
+    phenotype_file: Path,
+    output_dir: Path,
+    *,
+    partition_n_buckets: int,
+    keep_intermediate_files: bool,
+):
     # Set up output file and temporary directory for intermediate files
     today = date.today()
     output_file = (
-        args.output_dir
+        output_dir
         / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}.parquet"
     )
 
-    temp_dir = Path(tempfile.mkdtemp())
+    tmp_dir = Path(tempfile.mkdtemp())
 
     print("# Run info")
-    print(f"- Partition into N buckets: {args.partition_n_buckets}")
-    print(f"- Directory for intermediate files: {temp_dir}")
-    print(f"- Output directory: {args.output_dir}")
+    print(f"- Partition into N buckets: {partition_n_buckets}")
+    print(f"- Directory for intermediate files: {tmp_dir}")
+    print(f"- Output directory: {output_dir}")
     print()
 
-    temp_file_consolidate = temp_dir / "consolidated.parquet"
+    tmp_file_consolidate = tmp_dir / "consolidated.parquet"
 
-    temp_dir_partition = temp_dir / "partition"
-    temp_dir_partition.mkdir()
+    tmp_dir_partition = tmp_dir / "partition"
+    tmp_dir_partition.mkdir()
 
-    temp_dir_sort_dedup = temp_dir / "sort_dedup"
-    temp_dir_sort_dedup.mkdir()
+    tmp_dir_sort_dedup = tmp_dir / "sort_dedup"
+    tmp_dir_sort_dedup.mkdir()
 
     print("# Consolidate")
-    consolidated_file = consolidate_columns(args.assembled_file, temp_file_consolidate)
+    consolidated_file = consolidate_columns(assembled_file, tmp_file_consolidate)
 
     print("# Partition")
-    partition(consolidated_file, temp_dir_partition, args.partition_n_buckets)
+    partition(consolidated_file, tmp_dir_partition, partition_n_buckets)
 
     print("# Sort + Dedup")
-    for bucket_file in temp_dir_partition.glob("bucket_id__*.parquet"):
+    for bucket_file in tmp_dir_partition.glob("bucket_id__*.parquet"):
         (
             pl.scan_parquet(bucket_file)
             .pipe(sort_dedup)
-            .sink_parquet(temp_dir_sort_dedup / bucket_file.name)
+            .sink_parquet(tmp_dir_sort_dedup / bucket_file.name)
         )
 
     df_pheno = pl.scan_csv(
-        args.phenotype_file,
+        phenotype_file,
         infer_schema=False,
         separator="\t",
     ).select("FINNGENID", "SEX")
 
     print("# Concatenate + Unique + SEX join")
     bucket_files = []
-    for bucket_id in range(args.partition_n_buckets):
-        bucket_files.append(temp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet")
+    for bucket_id in range(partition_n_buckets):
+        bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet")
 
     (
         pl.scan_parquet(bucket_files)
@@ -108,8 +115,8 @@ def main(args):
         .sink_parquet(output_file)
     )
 
-    if not args.keep_intermediate_files:
-        shutil.rmtree(temp_dir)
+    if not keep_intermediate_files:
+        shutil.rmtree(tmp_dir)
 
 
 def init_cli():
@@ -122,7 +129,7 @@ def init_cli():
     )
     parser.add_argument(
         "--phenotype-file",
-        help="Path to phenotype file with SEX column (.txt.gz)",
+        help="Path to phenotype file with FINNGENID and SEX columns (.txt.gz)",
         required=True,
         type=Path,
     )
@@ -193,12 +200,12 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
     return output_file
 
 
-def partition(assembled_file: Path, temp_dir: Path, n_buckets):
+def partition(assembled_file: Path, tmp_dir: Path, n_buckets):
     for bucket_id in range(n_buckets):
         (
             pl.scan_parquet(assembled_file)
             .filter(pl.col("FINNGENID").hash() % n_buckets == bucket_id)
-            .sink_parquet(temp_dir / f"bucket_id__{bucket_id}.parquet")
+            .sink_parquet(tmp_dir / f"bucket_id__{bucket_id}.parquet")
         )
 
 
From 7d3314d9330eab06faffe34345f4016a39b835ac Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Wed, 27 May 2026 10:53:39 +0000
Subject: [PATCH 16/22] add end-to-end row id tracking

GitHub: fixes #50
---
 src/kanta/intake/__main__.py | 25 +++++++++++---------
 src/kanta/intake/assemble.py | 24 +++++++++++++------
 src/kanta/intake/tidyup.py   | 46 ++++++++++++++----------------------
 3 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/src/kanta/intake/__main__.py b/src/kanta/intake/__main__.py
index f44e2d3..6bd162d 100644
--- a/src/kanta/intake/__main__.py
+++ b/src/kanta/intake/__main__.py
@@ -1,7 +1,6 @@
 if __name__ == "__main__":
-    import tempfile
-    import os
     from argparse import ArgumentParser
+    from datetime import date
     from pathlib import Path
 
     from kanta.intake import assemble
@@ -44,19 +43,23 @@
     args = parser.parse_args()
 
     # Assemble stage
-    _fd, absolute_pathname = tempfile.mkstemp()
-    tmp_file_assemble = Path(absolute_pathname)
-    post_assemble_file = assemble.main(args.source_list_file, tmp_file_assemble)
+    output_file_assemble_stage = (
+        args.output_dir
+        / f"finngen_R14_kanta_laboratory_responses.assemble-stage.{date.today()}.parquet"
+    )
+    post_assemble_file = assemble.main(
+        args.source_list_file, output_file_assemble_stage
+    )
 
     # Tidy-up stage
+    output_file_tidyup_stage = (
+        args.output_dir
+        / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{date.today()}.parquet"
+    )
     tidyup.main(
-        tmp_file_assemble,
+        output_file_assemble_stage,
         args.phenotype_file,
-        args.output_dir,
+        output_file_tidyup_stage,
         partition_n_buckets=args.partition_n_buckets,
         keep_intermediate_files=args.debug,
     )
-
-    # Cleaning up
-    if not args.debug:
-        os.remove(tmp_file_assemble)
diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py
index 5e9a252..fea5895 100644
--- a/src/kanta/intake/assemble.py
+++ b/src/kanta/intake/assemble.py
@@ -63,17 +63,21 @@
 
 
 def main(source_list_file: Path, output_file: Path) -> Path:
+    print()
+    print("=== ASSEMBLE STAGE ===")
     pairs = validate_input_pairs(source_list_file)
 
-    print(">> merge_by_pair")
+    print("# Merge by pair")
     merge_by_pair(pairs, output_file)
 
-    print(">> check_merge_consistency")
-    print(check_merge_consistency(output_file))
+    print("# Checking merge consistency")
+    is_consistent = check_merge_consistency(output_file)
+    print("All good." if is_consistent else "!!! Inconsitent merge !!!")
 
 
-
-def validate_input_pairs(source_list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]:
+def validate_input_pairs(
+    source_list_file: Path, *, separator="\t"
+) -> list[tuple[Path, Path]]:
     pairs = []
     with open(source_list_file) as fp:
         for line in fp:
@@ -124,7 +128,11 @@ def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) ->
 
         to_concat.append(df_merged)
 
-    pl.concat(to_concat).sink_parquet(parquet_output)
+    (
+        pl.concat(to_concat)
+        .with_row_index(name="_rowid_source", offset=1)
+        .sink_parquet(parquet_output)
+    )
 
 
 def check_merge_consistency(data_path: str | Path) -> bool:
@@ -150,7 +158,9 @@ def check_merge_consistency(data_path: str | Path) -> bool:
     # the main and freetext data are of different height.
     check_same_height = (
         pl.scan_parquet(data_path)
-        .select(pl.all_horizontal(pl.selectors.ends_with("._rowid").is_not_null().all()))
+        .select(
+            pl.all_horizontal(pl.selectors.ends_with("._rowid").is_not_null().all())
+        )
         .collect(engine="streaming")
         .item()
     )
diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index fdf5e6f..8db218a 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -26,7 +26,6 @@
 import tempfile
 import shutil
 from argparse import ArgumentParser
-from datetime import date
 from pathlib import Path
 
 import polars as pl
@@ -47,25 +46,20 @@
 def main(
     assembled_file: Path,
     phenotype_file: Path,
-    output_dir: Path,
+    output_file: Path,
     *,
     partition_n_buckets: int,
     keep_intermediate_files: bool,
 ):
     # Set up output file and temporary directory for intermediate files
-    today = date.today()
-    output_file = (
-        output_dir
-        / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}.parquet"
-    )
-
     tmp_dir = Path(tempfile.mkdtemp())
 
+    print()
+    print("=== TIDY-UP STAGE ===")
     print("# Run info")
     print(f"- Partition into N buckets: {partition_n_buckets}")
     print(f"- Directory for intermediate files: {tmp_dir}")
-    print(f"- Output directory: {output_dir}")
-    print()
+    print(f"- Output file: {output_file}")
 
     tmp_file_consolidate = tmp_dir / "consolidated.parquet"
 
@@ -111,7 +105,6 @@ def main(
             maintain_order="left",
         )
         .with_row_index(name="_rowid", offset=1)
-        .drop("_rowid_consolidate_debug")
         .sink_parquet(output_file)
     )
 
@@ -134,8 +127,8 @@ def init_cli():
         type=Path,
     )
     parser.add_argument(
-        "--output-dir",
-        help="Path to write the output files",
+        "--output-file",
+        help="Path to write the output file",
         required=True,
         type=Path,
     )
@@ -158,7 +151,7 @@ def init_cli():
 
 def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
     """Remove unecessary columns form the assembled file and rename the ones we will keep."""
-    columns = {
+    rename_columns = {
         "main.FINNGENID": "FINNGENID",
         "main.EVENT_AGE": "EVENT_AGE",
         "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma",
@@ -179,21 +172,12 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
         "main.TIME": "TIME",
     }
 
+    out_columns = list(rename_columns.keys()) + ["_rowid_source"]
+
     (
         pl.scan_parquet(assembled_file)
-        .with_columns(
-            (
-                pl.col("main._rowid").cast(pl.String)
-                + "@"
-                + pl.col("main._filename")
-                + "|"
-                + pl.col("freetext._rowid").cast(pl.String)
-                + "@"
-                + pl.col("freetext._filename")
-            ).alias("_rowid_consolidate_debug")
-        )
-        .select(pl.col(list(columns.keys()) + ["_rowid_consolidate_debug"]))
-        .rename(columns)
+        .select(pl.col(out_columns))
+        .rename(rename_columns)
         .sink_parquet(output_file)
     )
 
@@ -223,4 +207,10 @@ def sort_dedup(frame: pl.LazyFrame | pl.DataFrame):
 
 if __name__ == "__main__":
     args = init_cli()
-    main(args)
+    main(
+        args.assembled_file,
+        args.phenotype_file,
+        args.output_file,
+        partition_n_buckets=args.partition_n_buckets,
+        keep_intermediate_files=args.keep_intermediate_files
+    )

From 7f9f97ee75ea46f1281512f1c2a0f30cf14de0c9 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Fri, 29 May 2026 11:08:53 +0000
Subject: [PATCH 17/22] reword log message when merging files in assemble

---
 src/kanta/intake/assemble.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py
index fea5895..ba1f562 100644
--- a/src/kanta/intake/assemble.py
+++ b/src/kanta/intake/assemble.py
@@ -98,7 +98,7 @@ def validate_input_pairs(
 def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> None:
     to_concat = []
     for path_main, path_freetext in pairs:
-        print(f"Processing {path_main} & {path_freetext}")
+        print(f"Adding horizontal merge: {path_main} & {path_freetext}")
 
         df_main = (
             pl.scan_csv(

From 196d22311f8d7b2b709c9b783eff3bf8eb0226e1 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Fri, 29 May 2026 11:10:43 +0000
Subject: [PATCH 18/22] sanitize text fields by removing new-line character

---
 src/kanta/intake/tidyup.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index 8db218a..dada05e 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -89,12 +89,12 @@ def main(
         separator="\t",
     ).select("FINNGENID", "SEX")
 
-    print("# Concatenate + Unique + SEX join")
+    print("# Concatenate + join SEX")
     bucket_files = []
     for bucket_id in range(partition_n_buckets):
         bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet")
 
-    (
+    df_concat = (
         pl.scan_parquet(bucket_files)
         # Join SEX
         .join(
@@ -105,7 +105,32 @@ def main(
             maintain_order="left",
         )
         .with_row_index(name="_rowid", offset=1)
-        .sink_parquet(output_file)
+    )
+
+    print("# Sanitize text fields")
+
+    unicode_newline = "\u2424"  # Unicode "SYMBOL FOR NEWLINE", displayed as: ␤
+    trusted_columns = [
+        "FINNGENID",
+        "EVENT_AGE",
+        "APPROX_EVENT_DAY",
+        "TIME",
+        "asiakirjaoid_pseudo",
+        "merkintaoid_pseudo",
+        "entryoid_pseudo",
+        "load_id_pseudo",
+        "file_name_pseudo",
+        "laboratoriotutkimusoid",
+        "_rowid",
+        "_rowid_source",
+        "SEX"
+    ]
+    (
+        df_concat.with_columns(
+            pl.selectors.exclude(*trusted_columns).str.replace_all(
+                pattern="\r\n|\r|\n", value=unicode_newline
+            )
+        ).sink_parquet(output_file)
     )
 
     if not keep_intermediate_files:
@@ -212,5 +237,5 @@ def sort_dedup(frame: pl.LazyFrame | pl.DataFrame):
         args.phenotype_file,
         args.output_file,
         partition_n_buckets=args.partition_n_buckets,
-        keep_intermediate_files=args.keep_intermediate_files
+        keep_intermediate_files=args.keep_intermediate_files,
     )

From a94c0232123078763952cd391d265f367d5941f4 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:39:00 +0000
Subject: [PATCH 19/22] reorder columns to match previous implementation

---
 src/kanta/intake/tidyup.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index dada05e..7fb54b8 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -123,14 +123,39 @@ def main(
         "laboratoriotutkimusoid",
         "_rowid",
         "_rowid_source",
-        "SEX"
+        "SEX",
     ]
     (
         df_concat.with_columns(
             pl.selectors.exclude(*trusted_columns).str.replace_all(
                 pattern="\r\n|\r|\n", value=unicode_newline
             )
-        ).sink_parquet(output_file)
+        )
+        # Re-order column to be somewhat backward compatible with previous implementation
+        .select(
+            "_rowid",
+            "_rowid_source",
+            "FINNGENID",
+            "EVENT_AGE",
+            "APPROX_EVENT_DAY",
+            "TIME",
+            "laboratoriotutkimusnimike",
+            "paikallinentutkimusnimike_koodi",
+            "paikallinentutkimusnimike_selite",
+            "tutkimuskoodistonjarjestelma",
+            "tutkimusvastauksentila",
+            "tutkimustulosarvo",
+            "tutkimustulosyksikko",
+            "tuloksenpoikkeavuus",
+            "viitearvoryhma",
+            "viitevalialkuarvo",
+            "viitevalialkuyksikko",
+            "viitevaliloppuarvo",
+            "viitevaliloppuyksikko",
+            "tutkimustulosteksti",
+            "SEX",
+        )
+        .sink_parquet(output_file)
     )
 
     if not keep_intermediate_files:

From 45e7e573f5da53e730b3cf6df691027fbbbd08bb Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Thu, 4 Jun 2026 12:02:14 +0000
Subject: [PATCH 20/22] implement same-ish dedup logic as prev WDL version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Known differences:
- Behavior change: Newlines inside values are now replaced by the Unicode
  character `␤` (U+2424), instead of being replaced by a space character ` `.
- Bug fix: Values within quoted TSV fields are now correctly preserved,
  whereas the previous implementation added extra quotes.
- Bug fix: Tab characters inside quoted TSV values are now correctly
  preserved, whereas the previous implementation treated them as field
  separators, resulting in shifted values.
---
 src/kanta/intake/tidyup.py | 41 ++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index 7fb54b8..67dd59a 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -83,17 +83,17 @@ def main(
             .sink_parquet(tmp_dir_sort_dedup / bucket_file.name)
         )
 
+    print("# Concatenate + join SEX")
+    bucket_files = []
+    for bucket_id in range(partition_n_buckets):
+        bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet")
+
     df_pheno = pl.scan_csv(
         phenotype_file,
         infer_schema=False,
         separator="\t",
     ).select("FINNGENID", "SEX")
 
-    print("# Concatenate + join SEX")
-    bucket_files = []
-    for bucket_id in range(partition_n_buckets):
-        bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet")
-
     df_concat = (
         pl.scan_parquet(bucket_files)
         # Join SEX
@@ -204,11 +204,15 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
     rename_columns = {
         "main.FINNGENID": "FINNGENID",
         "main.EVENT_AGE": "EVENT_AGE",
-        "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma",
+        "main.APPROX_EVENT_DAY": "APPROX_EVENT_DAY",
+        "main.TIME": "TIME",
+        "main.laboratoriotutkimusnimike": "laboratoriotutkimusnimike",
+        "main.paikallinentutkimusnimike_koodi": "paikallinentutkimusnimike_koodi",
         "main.paikallinentutkimusnimike_selite": "paikallinentutkimusnimike_selite",
+        "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma",
+        "main.tutkimusvastauksentila": "tutkimusvastauksentila",
         "main.tutkimustulosarvo": "tutkimustulosarvo",
         "main.tutkimustulosyksikko": "tutkimustulosyksikko",
-        "main.tutkimusvastauksentila": "tutkimusvastauksentila",
         "main.tuloksenpoikkeavuus": "tuloksenpoikkeavuus",
         "main.viitearvoryhma": "viitearvoryhma",
         "main.viitevalialkuarvo": "viitevalialkuarvo",
@@ -216,10 +220,6 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path:
         "main.viitevaliloppuarvo": "viitevaliloppuarvo",
         "main.viitevaliloppuyksikko": "viitevaliloppuyksikko",
         "freetext.tutkimustulosteksti": "tutkimustulosteksti",
-        "main.paikallinentutkimusnimike_koodi": "paikallinentutkimusnimike_koodi",
-        "main.laboratoriotutkimusnimike": "laboratoriotutkimusnimike",
-        "main.APPROX_EVENT_DAY": "APPROX_EVENT_DAY",
-        "main.TIME": "TIME",
     }
 
     out_columns = list(rename_columns.keys()) + ["_rowid_source"]
@@ -244,14 +244,17 @@ def partition(assembled_file: Path, tmp_dir: Path, n_buckets):
 
 
 def sort_dedup(frame: pl.LazyFrame | pl.DataFrame):
-    return (
-        frame.sort(by=COLUMNS_UNIQUENESS_SORT)
-        # Dedup rows
-        # NOTE(Vincent 2026-05-20) The previous implementation (WDL/Python) was
-        # doing the dedup on adjacent lines. Here the deduplication is not done
-        # explicitely on adjacent lines (since polars `unique` does it on the
-        # full data), though the result should be the same.
-        .unique(subset=COLUMNS_UNIQUENESS_SORT, keep="first")
+    all_columns = frame.collect_schema().names()
+    sort_subset_columns = set(COLUMNS_UNIQUENESS_SORT)
+    other_columns = []
+    for cc in all_columns:
+        if cc not in sort_subset_columns:
+            other_columns.append(cc)
+
+    sort_full_columns = COLUMNS_UNIQUENESS_SORT + other_columns
+
+    return frame.sort(by=sort_full_columns).unique(
+        subset=COLUMNS_UNIQUENESS_SORT, keep="first", maintain_order=True
     )
 
 
From 47434e1341ea3f535fd5d20cdfc48343b134c882 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 9 Jun 2026 07:47:58 +0000
Subject: [PATCH 21/22] reorder columns in output of intake.assemble

---
 src/kanta/intake/assemble.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py
index ba1f562..65ff63e 100644
--- a/src/kanta/intake/assemble.py
+++ b/src/kanta/intake/assemble.py
@@ -131,10 +131,24 @@ def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) ->
     (
         pl.concat(to_concat)
         .with_row_index(name="_rowid_source", offset=1)
+        .pipe(reorder_columns)
         .sink_parquet(parquet_output)
     )
 
 
+def reorder_columns(frame: pl.LazyFrame | pl.DataFrame) -> pl.LazyFrame | pl.DataFrame:
+    column_order = (
+        ["_rowid_source"]
+        # Columns for main: `_rowid` and `_filename` first, then the expected columns
+        + [COL_PREFIX_MAIN + "_rowid", COL_PREFIX_MAIN + "_filename"]
+        + [COL_PREFIX_MAIN + cc for cc in EXPECTED_COLUMNS_MAIN]
+        # Columns for freetext, laid out the same way as the main columns
+        + [COL_PREFIX_FREETEXT + "_rowid", COL_PREFIX_FREETEXT + "_filename"]
+        + [COL_PREFIX_FREETEXT + cc for cc in EXPECTED_COLUMNS_FREETEXT]
+    )
+    return frame.select(column_order)  # select() keeps only the columns listed above
+
+
 def check_merge_consistency(data_path: str | Path) -> bool:
     # First check: all shared columns have the same values
     shared_cols = set(EXPECTED_COLUMNS_MAIN).intersection(EXPECTED_COLUMNS_FREETEXT)

From 7d9b7cd04ded785b72e5d06d79c96b5c3e2fb1d4 Mon Sep 17 00:00:00 2001
From: Vincent <vincent-octo@users.noreply.github.com>
Date: Tue, 9 Jun 2026 07:51:14 +0000
Subject: [PATCH 22/22] =?UTF-8?q?replace=20\t=20with=20=E2=90=89=20(U+2409?=
 =?UTF-8?q?)=20in=20raw=20data=20intake=20stage?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is to prevent any naive TSV parsing from being tripped up.
---
 src/kanta/intake/tidyup.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py
index 67dd59a..2f7ca0d 100644
--- a/src/kanta/intake/tidyup.py
+++ b/src/kanta/intake/tidyup.py
@@ -108,8 +108,10 @@ def main(
     )
 
     print("# Sanitize text fields")
-
-    unicode_newline = "\u2424"  # Unicode "SYMBOL FOR NEWLINE", displayed as: ␤
+    # Unicode "SYMBOL FOR NEWLINE", displayed as: ␤
+    unicode_newline = "\u2424"
+    # Unicode "SYMBOL FOR HORIZONTAL TABULATION", displayed as: ␉
+    unicode_tab = "\u2409"
     trusted_columns = [
         "FINNGENID",
         "EVENT_AGE",
@@ -127,9 +129,9 @@ def main(
     ]
     (
         df_concat.with_columns(
-            pl.selectors.exclude(*trusted_columns).str.replace_all(
-                pattern="\r\n|\r|\n", value=unicode_newline
-            )
+            pl.selectors.exclude(*trusted_columns)
+            .str.replace_all(pattern="\r\n|\r|\n", value=unicode_newline)
+            .str.replace_all(pattern="\t", value=unicode_tab, literal=True)
         )
         # Re-order column to be somewhat backward compatible with previous implementation
         .select(