From fbd937ed4f0ccfe064c40552daebde8485e16c2f Mon Sep 17 00:00:00 2001 From: Vincent Date: Mon, 20 Apr 2026 12:42:47 +0000 Subject: [PATCH 01/22] initial rewrite of merge step from WDL to Polars --- mypolars/.python-version | 1 + mypolars/README.md | 0 mypolars/main.py | 171 +++++++++++++++++++++++++ mypolars/pyproject.toml | 15 +++ mypolars/uv.lock | 263 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 450 insertions(+) create mode 100644 mypolars/.python-version create mode 100644 mypolars/README.md create mode 100644 mypolars/main.py create mode 100644 mypolars/pyproject.toml create mode 100644 mypolars/uv.lock diff --git a/mypolars/.python-version b/mypolars/.python-version new file mode 100644 index 0000000..6324d40 --- /dev/null +++ b/mypolars/.python-version @@ -0,0 +1 @@ +3.14 diff --git a/mypolars/README.md b/mypolars/README.md new file mode 100644 index 0000000..e69de29 diff --git a/mypolars/main.py b/mypolars/main.py new file mode 100644 index 0000000..cd47118 --- /dev/null +++ b/mypolars/main.py @@ -0,0 +1,171 @@ +""" +Merges the incoming Kanta Lab data from THL into one coherent file. + +Note: needed ~128GB memory to run on R14 data. +""" +import gzip +from argparse import ArgumentParser +from itertools import zip_longest +from pathlib import Path + +import polars as pl +pl.Config.set_verbose(True) + + +# TODO +# 4. Validate that shared column match +# 5.
Post WDL sort-dup: subset columns, join SEX, sort, output unique/duplicates/error rows + +EXPECTED_COLUMNS_RESPONSES = [ + "FINNGENID", + "EVENT_AGE", + "APPROX_EVENT_DAY", + "TIME", + "asiakirjaoid_pseudo", + "merkintaoid_pseudo", + "entryoid_pseudo", + "load_id_pseudo", + "file_name_pseudo", + "laboratoriotutkimusoid", + "laboratoriotutkimusnimike", + "paikallinentutkimusnimike_koodi", + "paikallinentutkimusnimike_selite", + "tutkimuskoodistonjarjestelma", + "tiedonlahde", + "tutkimusvastauksentila", + "tutkimustulosarvo", + "tutkimustulosyksikko", + "tutkimuksennaytelaatu", + "tutkimuksentekotapa", + "tuloksenpoikkeavuus", + "viitearvoryhma", + "viitevalialkuarvo", + "viitevalialkuyksikko", + "viitevaliloppuarvo", + "viitevaliloppuyksikko", +] + +EXPECTED_COLUMNS_FREETEXT = [ + "FINNGENID", + "EVENT_AGE", + "APPROX_EVENT_DAY", + "TIME", + "asiakirjaoid_pseudo", + "merkintaoid_pseudo", + "entryoid_pseudo", + "load_id_pseudo", + "file_name_pseudo", + "tutkimustulosteksti", +] + + +def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]: + pairs = [] + with open(list_file) as fp: + for line in fp: + values = line.split(separator, maxsplit=2) + + responses = validate_tsv_gz(values[0], list_file.parent) + freetext = validate_tsv_gz(values[1], list_file.parent) + + pairs.append((responses, freetext)) + + for responses, freetext in pairs: + check_columns(responses, EXPECTED_COLUMNS_RESPONSES, "responses") + check_columns(freetext, EXPECTED_COLUMNS_FREETEXT, "freetext") + + return pairs + + +def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> None: + to_concat = [] + for path_responses, path_freetext in pairs: + print(f"Processing {path_responses} & {path_freetext}") + + df_resp = ( + pl.scan_csv(path_responses, infer_schema=False, separator="\t") + .with_row_index(name="_rn", offset=1) + ) + + df_freetext = ( + pl.scan_csv(path_freetext, infer_schema=False, separator="\t") + .with_row_index(name="_rn", 
offset=1) + ) + + df_merged = df_resp.join(df_freetext, on="_rn", how="full") + to_concat.append(df_merged) + + pl.concat(to_concat).sink_parquet(parquet_output) + + +def validate_tsv_gz(filename: str, in_dir: Path) -> Path: + """Check if path exists and is a proper TSV & gz""" + full_path = (in_dir / filename.strip()).resolve() + + if not full_path.exists(): + raise FileNotFoundError(f"File does not exist: {full_path}") + + # Check it's readable as a gzip file + try: + with gzip.open(full_path, "rt", encoding="utf-8") as ff: + first_line = ff.readline() + except OSError as ee: + raise ValueError(f"File is not a valid gzip: {full_path}") from ee + + # Check it's actual TSV + if "\t" not in first_line: + raise ValueError( + f"File does not appear to be TSV (no \\t on first line): {full_path}" + ) + + return full_path + + +def check_columns(file_path: Path, expected_columns: list[str], label: str) -> None: + actual_columns = get_columns(file_path) + + if actual_columns != expected_columns: + if len(actual_columns) == 0: + raise Exception(f"No columns in {file_path}") + + if len(expected_columns) == 0: + raise Exception( + f"Misconfigured expected columns ({label}): no columns listed" + ) + + if set(actual_columns) != set(expected_columns): + message = f"Columns differ for {label}:\n" + message += f"Only in expected columns: {list(set(expected_columns) - set(actual_columns))}\n" + message += f"Only in actual columns: {list(set(actual_columns) - set(expected_columns))}" + raise Exception(message) + + # Else it's the same columns but in different order + message = "Column order differ:\n" + for col_expected, col_actual in zip_longest(expected_columns, actual_columns): + comp = "==" if col_expected == col_actual else "=!=/!\\=!=" + message += f"{col_expected} {comp} {col_actual}\n" + raise Exception(message) + + +def get_columns(input_path: Path) -> list[str]: + # We checked that the file is a proper TSV gz beforehand, so we now explicitely specify the separator + df = 
pl.read_csv( + input_path, has_header=True, separator="\t", infer_schema=False, n_rows=0 + ) + return df.columns + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--list-file", + required=True, + type=Path, + help="File containing pair of paths to responses & freetext data, one pair per line (TSV without header).", + ) + + args = parser.parse_args() + + pairs = validate_input_pairs(args.list_file) + + merge_by_pair(pairs, "/tmp/out.parquet") diff --git a/mypolars/pyproject.toml b/mypolars/pyproject.toml new file mode 100644 index 0000000..26939d3 --- /dev/null +++ b/mypolars/pyproject.toml @@ -0,0 +1,15 @@ +[project] +name = "mypolars" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.14" +dependencies = [ + "polars>=1.40.0", +] + +[dependency-groups] +dev = [ + "ipython>=9.12.0", + "ty>=0.0.32", +] diff --git a/mypolars/uv.lock b/mypolars/uv.lock new file mode 100644 index 0000000..00fb747 --- /dev/null +++ b/mypolars/uv.lock @@ -0,0 +1,263 @@ +version = 1 +revision = 3 +requires-python = ">=3.14" + +[[package]] +name = "asttokens" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/be/a5/8e3f9b6771b0b408517c82d97aed8f2036509bc247d46114925e32fe33f0/asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7", size = 62308, upload-time = "2025-11-15T16:43:48.578Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + +[[package]] +name = "executing" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, +] + 
+[[package]] +name = "ipython" +version = "9.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/73/7114f80a8f9cabdb13c27732dce24af945b2923dcab80723602f7c8bc2d8/ipython-9.12.0.tar.gz", hash = "sha256:01daa83f504b693ba523b5a407246cabde4eb4513285a3c6acaff11a66735ee4", size = 4428879, upload-time = "2026-03-27T09:42:45.312Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/59/22/906c8108974c673ebef6356c506cebb6870d48cedea3c41e949e2dd556bb/ipython-9.12.0-py3-none-any.whl", hash = "sha256:0f2701e8ee86e117e37f50563205d36feaa259d2e08d4a6bc6b6d74b18ce128d", size = 625661, upload-time = "2026-03-27T09:42:42.831Z" }, +] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + +[[package]] +name = "matplotlib-inline" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, +] + +[[package]] +name = "mypolars" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "polars" }, +] + +[package.dev-dependencies] +dev = [ + { name = "ipython" }, + { name = "ty" }, +] + +[package.metadata] +requires-dist = [{ name = "polars", specifier = ">=1.40.0" }] + +[package.metadata.requires-dev] +dev = [ + { name = "ipython", specifier = ">=9.12.0" }, + { name = "ty", specifier = ">=0.0.32" }, +] + +[[package]] +name = "parso" +version = "0.8.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/81/76/a1e769043c0c0c9fe391b702539d594731a4362334cdf4dc25d0c09761e7/parso-0.8.6.tar.gz", hash = "sha256:2b9a0332696df97d454fa67b81618fd69c35a7b90327cbe6ba5c92d2c68a7bfd", size = 401621, upload-time = "2026-02-09T15:45:24.425Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl", hash = "sha256:2c549f800b70a5c4952197248825584cb00f033b29c692671d3bf08bf380baff", size = 106894, upload-time = "2026-02-09T15:45:21.391Z" }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + +[[package]] +name = "polars" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "polars-runtime-32" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/d9/1b/eea7d6fe6daafc1d784cc0f76c729b28051837ccb2d51ae64a0a3f798142/polars-1.40.0.tar.gz", hash = "sha256:711dd50dcbc35ba42a2625fcadc2a1349e2e9abf48e35631bdabafb90d89874b", size = 732943, upload-time = "2026-04-18T05:25:26.077Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b4/ad/d5ed79269b7fe59a3dbbfbdbecbe1e59a0b56e38d36491e57d2bfb5846c1/polars-1.40.0-py3-none-any.whl", hash = "sha256:60b1d677ca363e2fc6fdea8c3d16c0653fd52cc37f0249e0f29d9536d5aa45ef", size = 828012, upload-time = "2026-04-18T05:23:39.055Z" }, +] + +[[package]] +name = "polars-runtime-32" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fb/b2/eae6c1b3d16c7a64ff382f557985ff939cce13455e8c9d056ab8e1e0fc87/polars_runtime_32-1.40.0.tar.gz", hash = "sha256:e31bff8bd37492c714e155e2e1429ac2d9ddf2dd6ec6474cc1cc70ac0b2bd6af", size = 2935285, upload-time = "2026-04-18T05:25:28.038Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/e4/2325689d2af4f9e70699ff98e8a2543707bebc34af78a5fe0e654107d9ed/polars_runtime_32-1.40.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:cab3ac7ff5bc9e0f4b3b146015569e9417cf0eaff8d3fb71004d73d67b6f09c7", size = 52092528, upload-time = "2026-04-18T05:23:42.341Z" }, + { url = "https://files.pythonhosted.org/packages/19/a6/82157b19c5c40b2c1ed0493b87b9eaf9b4863cdedca5575ee083488b45ba/polars_runtime_32-1.40.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d29624c75c4049253300786d00882fce620b3677ce495ebc4199292de8c2ba02", size = 46365073, upload-time = "2026-04-18T05:23:46.7Z" }, + { url = "https://files.pythonhosted.org/packages/85/b5/5c4f1f2545f56c664cc57bbdd1aa66fcfcb129aa137ed72cc81d58eb480f/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a034dc0d8481fc1ca0456ab33e98e53a4c6d6cc6a2edb36246cc81c936b925dc", size = 50250561, upload-time = "2026-04-18T05:23:51.316Z" }, + { 
url = "https://files.pythonhosted.org/packages/8e/51/cb5eb75394f39c0ec14fddcc9b11adb707e1f28224a552ecbfa72d39b61b/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70e78c2f13a54a9d92ae30d2625bda759173cc4867ad6a39f85f140058d899c6", size = 56243695, upload-time = "2026-04-18T05:23:55.932Z" }, + { url = "https://files.pythonhosted.org/packages/16/3a/be1437c0fbecbb07d81b151456089c3cf054eea5a791f849ed39b67611ca/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1843272c0ef49f4a07435888f0059eca08ec16ab9880219c457195a081df0281", size = 50427843, upload-time = "2026-04-18T05:24:00.159Z" }, + { url = "https://files.pythonhosted.org/packages/be/c7/ea6449a2161816a13ed1d8aa02177d5a0594e011f0df5ddd2fad8e5bf20e/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:081237dba07f15d61fc151825f203165480e9503ebe72a474a8c99aa78021962", size = 54153077, upload-time = "2026-04-18T05:24:05.066Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1a/0b239138afe8b80a1a0b4c95db3884e6afbbe82ec3318918ab03bc57f231/polars_runtime_32-1.40.0-cp310-abi3-win_amd64.whl", hash = "sha256:a916040e0b7f461ce987e4551fed9eea5914b4fbb5af907b1d9e80db71fadeb5", size = 51822748, upload-time = "2026-04-18T05:24:09.384Z" }, + { url = "https://files.pythonhosted.org/packages/06/ce/c16ef8fd3030b7342032b040fab21a42f6fee57e47ee7f41e2f1a1e36f01/polars_runtime_32-1.40.0-cp310-abi3-win_arm64.whl", hash = "sha256:719c64eecde24a95aa3599eb9c8efc98c1499bab7ef9c01cbbe8939cd583e654", size = 45819617, upload-time = "2026-04-18T05:24:13.214Z" }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = 
"sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + +[[package]] +name = "pygments" +version = "2.20.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, +] + +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = 
"sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + +[[package]] +name = "ty" +version = "0.0.32" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/7e/2aa791c9ae7b8cd5024cd4122e92267f664ca954cea3def3211919fa3c1f/ty-0.0.32.tar.gz", hash = "sha256:8743174c5f920f6700a4a0c9de140109189192ba16226884cd50095b43b8a45c", size = 5522294, upload-time = "2026-04-20T19:29:01.626Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/eb/1075dc6a49d7acbe2584ae4d5b410c41b1f177a5adcc567e09eca4c69000/ty-0.0.32-py3-none-linux_armv6l.whl", hash = "sha256:dacbc2f6cd698d488ae7436838ff929570455bf94bfa4d9fe57a630c552aff83", size = 10902959, upload-time = "2026-04-20T19:28:31.907Z" }, + { url = "https://files.pythonhosted.org/packages/33/d2/c35fc8bc66e98d1ee9b0f8ed319bf743e450e1f1e997574b178fab75670f/ty-0.0.32-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:914bbc4f605ce2a9e2a78982e28fae1d3359a169d141f9dc3b4c7749cd5eca81", size = 10726172, upload-time = "2026-04-20T19:28:44.765Z" }, + { url = "https://files.pythonhosted.org/packages/96/32/c827da3ca480456fb02d8cea68a2609273b6c220fea0be9a4c8d8470b86e/ty-0.0.32-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4787ac9fe1f86b1f3133f5c6732adbe2df5668b50c679ac6e2d98cd284da812f", size = 10163701, upload-time = "2026-04-20T19:28:27.005Z" }, + { url = "https://files.pythonhosted.org/packages/ba/9e/2734478fbdb90c160cb2813a3916a16a2af5c1e231f87d635f6131d781fb/ty-0.0.32-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8ea0a728af99fe40dd744cba6441a2404f80b7f4bde17aa6da393810af5ea57", size = 10656220, upload-time = "2026-04-20T19:29:03.814Z" }, + { url = "https://files.pythonhosted.org/packages/44/9f/0007da2d35e424debe7e9f86ffbc1ab7f60983cfbc5f0411324ab2de5292/ty-0.0.32-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:2850561f9b018ae33d7e5bbfa0ac414d3c518513edcffe43877dc9801446b9c5", size = 10696086, upload-time = "2026-04-20T19:28:46.829Z" }, + { url = "https://files.pythonhosted.org/packages/3b/5e/ce5fd4ec803222ae3e69a76d2a2db2eed55e19f5b131702b9789ef45f93d/ty-0.0.32-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b5fa2fb3c614349ee211d36476b49d88c5ef79a687cdb91b2872ad023b94d2f8", size = 11184800, upload-time = "2026-04-20T19:28:42.57Z" }, + { url = "https://files.pythonhosted.org/packages/6c/46/ebcf67a5999421331214aac51a7464db42de2be15bbe929c612a3ed0b039/ty-0.0.32-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b89969307ab2417d41c9be8059dd79feea577234e1e10d35132f5495e0d42c6", size = 11718718, upload-time = "2026-04-20T19:28:36.433Z" }, + { url = "https://files.pythonhosted.org/packages/18/2c/2141c86ed0ce0962b45cefb658a95e734f59759d47f20afdcd9c732910a1/ty-0.0.32-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b59868ede9b1d69a088f0d695df52a0061f95fa7baa1d5e0dc6fc9cf06e1334", size = 11346369, upload-time = "2026-04-20T19:28:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/7a/da/ed6f772339cf29bd9a46def9d6db5084689eb574ee4d150ff704224c1ed8/ty-0.0.32-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8300caf35345498e9b9b03e550bba03cee8f5f5f8ab4c83c3b1ff1b7403b7d3a", size = 11280714, upload-time = "2026-04-20T19:28:51.516Z" }, + { url = "https://files.pythonhosted.org/packages/da/9b/c6813987edf4816a40e0c8e408b555f97d3f267c7b3a1688c8bbdf65609c/ty-0.0.32-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:583c7094f4574b02f724db924f98b804d1387a0bd9405ecb5e078cc0f47fbcfb", size = 10638806, upload-time = "2026-04-20T19:28:29.651Z" }, + { url = "https://files.pythonhosted.org/packages/4e/d4/0cefcbd2ad0f3d51762ccf58e652ec7da146eb6ae34f87228f6254bbb8be/ty-0.0.32-py3-none-musllinux_1_2_armv7l.whl", hash = 
"sha256:e44ebe1bb4143a5628bc4db67ac0dfebe14594af671e4ee66f6f2e983da56501", size = 10726106, upload-time = "2026-04-20T19:29:06.3Z" }, + { url = "https://files.pythonhosted.org/packages/32/ad/2c8a97f91f06311f4367400f7d13534bbda2522c73c99a3e4c0757dff9b8/ty-0.0.32-py3-none-musllinux_1_2_i686.whl", hash = "sha256:06f17ada3e069cba6148342ef88e9929156beca8473e8d4f101b68f66c75643e", size = 10872951, upload-time = "2026-04-20T19:28:34.077Z" }, + { url = "https://files.pythonhosted.org/packages/ba/68/42293f9248106dd51875120971a5cc6ea315c2c4dcfb8e59aa063aa0af26/ty-0.0.32-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e96e60fa556cec04f15d7ea62d2ceee5982bd389233e961ab9fd42304e278175", size = 11363334, upload-time = "2026-04-20T19:28:54.036Z" }, + { url = "https://files.pythonhosted.org/packages/df/92/be9abf4d3e589ad5023e2ea965b93e204ec856420d46adf73c5c36c04678/ty-0.0.32-py3-none-win32.whl", hash = "sha256:2ff2ebb4986b24aebcf1444db7db5ca41b36086040e95eea9f8fb851c11e805c", size = 10260689, upload-time = "2026-04-20T19:28:56.541Z" }, + { url = "https://files.pythonhosted.org/packages/14/61/dc86acea899349d2579cb8419aecedd83dc504d7d6a10df65eef546c8300/ty-0.0.32-py3-none-win_amd64.whl", hash = "sha256:ba7284a4a954b598c1b31500352b3ec1f89bff533825592b5958848226fdc7ee", size = 11255371, upload-time = "2026-04-20T19:28:39.917Z" }, + { url = "https://files.pythonhosted.org/packages/43/01/beffec56d71ca25b343ede63adb076456b5b3e211f1c066452a44cd120b3/ty-0.0.32-py3-none-win_arm64.whl", hash = "sha256:7e10aadbdbda989a7d567ee6a37f8b98d4d542e31e3b190a2879fd581f75d658", size = 10658087, upload-time = "2026-04-20T19:28:59.286Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = 
"2026-02-06T19:19:40.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, +] From a3fc44a356f2c42d81289b09bc10c0c9df11566e Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 22 Apr 2026 07:48:45 +0000 Subject: [PATCH 02/22] add check about horizontal merge consistency --- mypolars/main.py | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/mypolars/main.py b/mypolars/main.py index cd47118..d8a92d6 100644 --- a/mypolars/main.py +++ b/mypolars/main.py @@ -3,13 +3,13 @@ Note: needed ~128GB memory to run on R14 data. """ + import gzip from argparse import ArgumentParser from itertools import zip_longest from pathlib import Path import polars as pl -pl.Config.set_verbose(True) # TODO @@ -59,6 +59,9 @@ ] +SUFFIX_JOIN_COL = "_right" + + def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]: pairs = [] with open(list_file) as fp: @@ -84,20 +87,45 @@ def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> df_resp = ( pl.scan_csv(path_responses, infer_schema=False, separator="\t") - .with_row_index(name="_rn", offset=1) + .with_row_index(name="_rowid", offset=1) + .with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_responses.name) ) df_freetext = ( pl.scan_csv(path_freetext, infer_schema=False, separator="\t") - .with_row_index(name="_rn", offset=1) + .with_row_index(name="_rowid", offset=1) + .with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_freetext.name) ) - df_merged = df_resp.join(df_freetext, on="_rn", how="full") + df_merged = df_resp.join( + df_freetext, on="_rowid", how="full", suffix=SUFFIX_JOIN_COL + ) to_concat.append(df_merged) 
pl.concat(to_concat).sink_parquet(parquet_output) + + +def check_merge_consistency(data_path: str | Path): + shared_cols = set(EXPECTED_COLUMNS_RESPONSES).intersection( + EXPECTED_COLUMNS_FREETEXT + ) + + all_check = ( + pl.scan_parquet(data_path) + .select( + pl.all_horizontal( + pl.col(cc) == pl.col(cc + SUFFIX_JOIN_COL) for cc in shared_cols + ).all() + ) + .collect() + .item() + ) + + assert all_check + + return all_check + + def validate_tsv_gz(filename: str, in_dir: Path) -> Path: """Check if path exists and is a proper TSV & gz""" full_path = (in_dir / filename.strip()).resolve() @@ -163,9 +191,19 @@ def get_columns(input_path: Path) -> list[str]: type=Path, help="File containing pair of paths to responses & freetext data, one pair per line (TSV without header).", ) + parser.add_argument( + "--post-merge-file", + required=True, + type=Path, + help="Path to intermediary output file from the merge stage", + ) args = parser.parse_args() pairs = validate_input_pairs(args.list_file) - merge_by_pair(pairs, "/tmp/out.parquet") + print(">> merge_by_pair") + merge_by_pair(pairs, args.post_merge_file) + + print(">> check_merge_consistency") + print(check_merge_consistency(args.post_merge_file)) From 97c4b50c63da7838ab58397690308bd8d1102535 Mon Sep 17 00:00:00 2001 From: Vincent Date: Fri, 15 May 2026 07:58:18 +0000 Subject: [PATCH 03/22] implement low-memory "merge" stage Switching to these made it memory friendly: - `pl.concat(... how="horizontal")` instead of `.join` - `.collect(engine="streaming")` instead of just `.collect()` Also added another check for the merging of main <> freetext files. NOTE: Polars is better than DuckDB for this since it assigns line numbers in a deterministic way (only polars guarantees this, not the case with DuckDB).
--- mypolars/main.py | 86 +++++++++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/mypolars/main.py b/mypolars/main.py index d8a92d6..cbaac33 100644 --- a/mypolars/main.py +++ b/mypolars/main.py @@ -1,7 +1,5 @@ """ Merges the incoming Kanta Lab data from THL into one coherent file. - -Note: needed ~128GB memory to run on R14 data. """ import gzip @@ -13,10 +11,9 @@ # TODO -# 4. Validate that shared column match # 5. Post WDL sort-dup: subset columns, join SEX, sort, output unique/duplicates/error rows -EXPECTED_COLUMNS_RESPONSES = [ +EXPECTED_COLUMNS_MAIN = [ "FINNGENID", "EVENT_AGE", "APPROX_EVENT_DAY", @@ -58,8 +55,8 @@ "tutkimustulosteksti", ] - -SUFFIX_JOIN_COL = "_right" +COL_PREFIX_MAIN = "main." +COL_PREFIX_FREETEXT = "freetext." def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]: @@ -68,13 +65,13 @@ def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, for line in fp: values = line.split(separator, maxsplit=2) - responses = validate_tsv_gz(values[0], list_file.parent) + main = validate_tsv_gz(values[0], list_file.parent) freetext = validate_tsv_gz(values[1], list_file.parent) - pairs.append((responses, freetext)) + pairs.append((main, freetext)) - for responses, freetext in pairs: - check_columns(responses, EXPECTED_COLUMNS_RESPONSES, "responses") + for main, freetext in pairs: + check_columns(main, EXPECTED_COLUMNS_MAIN, "main") check_columns(freetext, EXPECTED_COLUMNS_FREETEXT, "freetext") return pairs @@ -82,48 +79,71 @@ def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> None: to_concat = [] - for path_responses, path_freetext in pairs: - print(f"Processing {path_responses} & {path_freetext}") - - df_resp = ( - pl.scan_csv(path_responses, infer_schema=False, separator="\t") - .with_row_index(name="_rowid", offset=1) - 
.with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_responses.name) + for path_main, path_freetext in pairs: + print(f"Processing {path_main} & {path_freetext}") + + df_main = ( + pl.scan_csv( + path_main, + infer_schema=False, + separator="\t", + row_index_name="_rowid", + row_index_offset=1, + ) + .with_columns(pl.lit(path_main.name).alias("_filename")) + .select(pl.all().name.prefix(COL_PREFIX_MAIN)) ) df_freetext = ( - pl.scan_csv(path_freetext, infer_schema=False, separator="\t") - .with_row_index(name="_rowid", offset=1) - .with_columns(pl.col("_rowid").cast(pl.String) + "|" + path_freetext.name) + pl.scan_csv( + path_freetext, + infer_schema=False, + separator="\t", + row_index_name="_rowid", + row_index_offset=1, + ) + .with_columns(pl.lit(path_freetext.name).alias("_filename")) + .select(pl.all().name.prefix(COL_PREFIX_FREETEXT)) ) - df_merged = df_resp.join( - df_freetext, on="_rowid", how="full", suffix=SUFFIX_JOIN_COL - ) + df_merged = pl.concat([df_main, df_freetext], how="horizontal") + to_concat.append(df_merged) pl.concat(to_concat).sink_parquet(parquet_output) -def check_merge_consistency(data_path: str | Path): - shared_cols = set(EXPECTED_COLUMNS_RESPONSES).intersection( - EXPECTED_COLUMNS_FREETEXT - ) +def check_merge_consistency(data_path: str | Path) -> bool: + # First check: all shared columns have the same values + shared_cols = set(EXPECTED_COLUMNS_MAIN).intersection(EXPECTED_COLUMNS_FREETEXT) - all_check = ( + check_shared_columns_same_values = ( pl.scan_parquet(data_path) .select( pl.all_horizontal( - pl.col(cc) == pl.col(cc + SUFFIX_JOIN_COL) for cc in shared_cols + pl.col(COL_PREFIX_MAIN + cc) == pl.col(COL_PREFIX_FREETEXT + cc) + for cc in shared_cols ).all() ) - .collect() + .collect(engine="streaming") + .item() + ) + + assert check_shared_columns_same_values + + # Second check: main and freetext have same height. 
+ # This is done by checking the absence of null in _rowid, which happens iif + # the main and freetext data are of different height. + check_same_height = ( + pl.scan_parquet(data_path) + .select(pl.all_horizontal(pl.selectors.ends_with("._rowid").is_not_null().all())) + .collect(engine="streaming") .item() ) - assert all_check + assert check_same_height - return all_check + return check_shared_columns_same_values and check_same_height def validate_tsv_gz(filename: str, in_dir: Path) -> Path: @@ -189,7 +209,7 @@ def get_columns(input_path: Path) -> list[str]: "--list-file", required=True, type=Path, - help="File containing pair of paths to responses & freetext data, one pair per line (TSV without header).", + help="File containing pair of paths to main & freetext data, one pair per line (TSV without header).", ) parser.add_argument( "--post-merge-file", From 52c8f8b23828ea3b96dad04c1a73f4acfcb13ed0 Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 19 May 2026 11:47:05 +0000 Subject: [PATCH 04/22] add build backend to pyproject.toml Prerequisite in order to have the polars implementation pull the config shared with the other import packages. --- pyproject.toml | 7 +++++++ uv.lock | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2d89315..5361d69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,3 +14,10 @@ dev = [ "pytest>=9.0.3", "ruff>=0.15.10", ] + +[build-system] +requires = ["uv_build>=0.11.14,<0.12.0"] +build-backend = "uv_build" + +[tool.uv.build-backend] +module-name = "kanta" diff --git a/uv.lock b/uv.lock index 14d81d0..f1334b2 100644 --- a/uv.lock +++ b/uv.lock @@ -103,7 +103,7 @@ wheels = [ [[package]] name = "kanta-lab-preprocessing" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." 
} dependencies = [ { name = "pandas" }, ] From 6d20d564892e2dffe89fb0cd449bf163a5d3fe2d Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 19 May 2026 12:35:25 +0000 Subject: [PATCH 05/22] move polars rewrite under src/kanta/ --- .gitignore | 3 + .python-version | 1 - mypolars/.python-version | 1 - mypolars/pyproject.toml | 15 - mypolars/uv.lock | 263 ------------------ pyproject.toml | 3 +- mypolars/README.md => src/kanta/config.py | 0 src/kanta/intake/__init__.py | 0 .../main.py => src/kanta/intake/assemble.py | 0 uv.lock | 55 +++- 10 files changed, 58 insertions(+), 283 deletions(-) delete mode 100644 .python-version delete mode 100644 mypolars/.python-version delete mode 100644 mypolars/pyproject.toml delete mode 100644 mypolars/uv.lock rename mypolars/README.md => src/kanta/config.py (100%) create mode 100644 src/kanta/intake/__init__.py rename mypolars/main.py => src/kanta/intake/assemble.py (100%) diff --git a/.gitignore b/.gitignore index 818e20a..c000fa2 100644 --- a/.gitignore +++ b/.gitignore @@ -119,6 +119,9 @@ ipython_config.py # https://pdm.fming.dev/#use-with-ide .pdm.toml +# uv +.python-version + # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ diff --git a/.python-version b/.python-version deleted file mode 100644 index 24ee5b1..0000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.13 diff --git a/mypolars/.python-version b/mypolars/.python-version deleted file mode 100644 index 6324d40..0000000 --- a/mypolars/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.14 diff --git a/mypolars/pyproject.toml b/mypolars/pyproject.toml deleted file mode 100644 index 26939d3..0000000 --- a/mypolars/pyproject.toml +++ /dev/null @@ -1,15 +0,0 @@ -[project] -name = "mypolars" -version = "0.1.0" -description = "Add your description here" -readme = "README.md" -requires-python = ">=3.14" -dependencies = [ - "polars>=1.40.0", -] - -[dependency-groups] -dev = [ - "ipython>=9.12.0", - "ty>=0.0.32", -] diff --git a/mypolars/uv.lock b/mypolars/uv.lock deleted file mode 100644 index 00fb747..0000000 --- a/mypolars/uv.lock +++ /dev/null @@ -1,263 +0,0 @@ -version = 1 -revision = 3 -requires-python = ">=3.14" - -[[package]] -name = "asttokens" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/be/a5/8e3f9b6771b0b408517c82d97aed8f2036509bc247d46114925e32fe33f0/asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7", size = 62308, upload-time = "2025-11-15T16:43:48.578Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", 
hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "decorator" -version = "5.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, -] - -[[package]] -name = "executing" -version = "2.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, -] - -[[package]] -name = "ipython" -version = "9.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = 
"colorama", marker = "sys_platform == 'win32'" }, - { name = "decorator" }, - { name = "ipython-pygments-lexers" }, - { name = "jedi" }, - { name = "matplotlib-inline" }, - { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, - { name = "prompt-toolkit" }, - { name = "pygments" }, - { name = "stack-data" }, - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3a/73/7114f80a8f9cabdb13c27732dce24af945b2923dcab80723602f7c8bc2d8/ipython-9.12.0.tar.gz", hash = "sha256:01daa83f504b693ba523b5a407246cabde4eb4513285a3c6acaff11a66735ee4", size = 4428879, upload-time = "2026-03-27T09:42:45.312Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/59/22/906c8108974c673ebef6356c506cebb6870d48cedea3c41e949e2dd556bb/ipython-9.12.0-py3-none-any.whl", hash = "sha256:0f2701e8ee86e117e37f50563205d36feaa259d2e08d4a6bc6b6d74b18ce128d", size = 625661, upload-time = "2026-03-27T09:42:42.831Z" }, -] - -[[package]] -name = "ipython-pygments-lexers" -version = "1.1.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, -] - -[[package]] -name = "jedi" -version = "0.19.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "parso" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, -] - -[[package]] -name = "matplotlib-inline" -version = "0.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, -] - -[[package]] -name = "mypolars" -version = "0.1.0" -source = { virtual = "." 
} -dependencies = [ - { name = "polars" }, -] - -[package.dev-dependencies] -dev = [ - { name = "ipython" }, - { name = "ty" }, -] - -[package.metadata] -requires-dist = [{ name = "polars", specifier = ">=1.40.0" }] - -[package.metadata.requires-dev] -dev = [ - { name = "ipython", specifier = ">=9.12.0" }, - { name = "ty", specifier = ">=0.0.32" }, -] - -[[package]] -name = "parso" -version = "0.8.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/81/76/a1e769043c0c0c9fe391b702539d594731a4362334cdf4dc25d0c09761e7/parso-0.8.6.tar.gz", hash = "sha256:2b9a0332696df97d454fa67b81618fd69c35a7b90327cbe6ba5c92d2c68a7bfd", size = 401621, upload-time = "2026-02-09T15:45:24.425Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/61/fae042894f4296ec49e3f193aff5d7c18440da9e48102c3315e1bc4519a7/parso-0.8.6-py2.py3-none-any.whl", hash = "sha256:2c549f800b70a5c4952197248825584cb00f033b29c692671d3bf08bf380baff", size = 106894, upload-time = "2026-02-09T15:45:21.391Z" }, -] - -[[package]] -name = "pexpect" -version = "4.9.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ptyprocess" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, -] - -[[package]] -name = "polars" -version = "1.40.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "polars-runtime-32" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/d9/1b/eea7d6fe6daafc1d784cc0f76c729b28051837ccb2d51ae64a0a3f798142/polars-1.40.0.tar.gz", hash = "sha256:711dd50dcbc35ba42a2625fcadc2a1349e2e9abf48e35631bdabafb90d89874b", size = 732943, upload-time = "2026-04-18T05:25:26.077Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b4/ad/d5ed79269b7fe59a3dbbfbdbecbe1e59a0b56e38d36491e57d2bfb5846c1/polars-1.40.0-py3-none-any.whl", hash = "sha256:60b1d677ca363e2fc6fdea8c3d16c0653fd52cc37f0249e0f29d9536d5aa45ef", size = 828012, upload-time = "2026-04-18T05:23:39.055Z" }, -] - -[[package]] -name = "polars-runtime-32" -version = "1.40.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fb/b2/eae6c1b3d16c7a64ff382f557985ff939cce13455e8c9d056ab8e1e0fc87/polars_runtime_32-1.40.0.tar.gz", hash = "sha256:e31bff8bd37492c714e155e2e1429ac2d9ddf2dd6ec6474cc1cc70ac0b2bd6af", size = 2935285, upload-time = "2026-04-18T05:25:28.038Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/e4/2325689d2af4f9e70699ff98e8a2543707bebc34af78a5fe0e654107d9ed/polars_runtime_32-1.40.0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:cab3ac7ff5bc9e0f4b3b146015569e9417cf0eaff8d3fb71004d73d67b6f09c7", size = 52092528, upload-time = "2026-04-18T05:23:42.341Z" }, - { url = "https://files.pythonhosted.org/packages/19/a6/82157b19c5c40b2c1ed0493b87b9eaf9b4863cdedca5575ee083488b45ba/polars_runtime_32-1.40.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d29624c75c4049253300786d00882fce620b3677ce495ebc4199292de8c2ba02", size = 46365073, upload-time = "2026-04-18T05:23:46.7Z" }, - { url = "https://files.pythonhosted.org/packages/85/b5/5c4f1f2545f56c664cc57bbdd1aa66fcfcb129aa137ed72cc81d58eb480f/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a034dc0d8481fc1ca0456ab33e98e53a4c6d6cc6a2edb36246cc81c936b925dc", size = 50250561, upload-time = "2026-04-18T05:23:51.316Z" }, - { 
url = "https://files.pythonhosted.org/packages/8e/51/cb5eb75394f39c0ec14fddcc9b11adb707e1f28224a552ecbfa72d39b61b/polars_runtime_32-1.40.0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70e78c2f13a54a9d92ae30d2625bda759173cc4867ad6a39f85f140058d899c6", size = 56243695, upload-time = "2026-04-18T05:23:55.932Z" }, - { url = "https://files.pythonhosted.org/packages/16/3a/be1437c0fbecbb07d81b151456089c3cf054eea5a791f849ed39b67611ca/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1843272c0ef49f4a07435888f0059eca08ec16ab9880219c457195a081df0281", size = 50427843, upload-time = "2026-04-18T05:24:00.159Z" }, - { url = "https://files.pythonhosted.org/packages/be/c7/ea6449a2161816a13ed1d8aa02177d5a0594e011f0df5ddd2fad8e5bf20e/polars_runtime_32-1.40.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:081237dba07f15d61fc151825f203165480e9503ebe72a474a8c99aa78021962", size = 54153077, upload-time = "2026-04-18T05:24:05.066Z" }, - { url = "https://files.pythonhosted.org/packages/aa/1a/0b239138afe8b80a1a0b4c95db3884e6afbbe82ec3318918ab03bc57f231/polars_runtime_32-1.40.0-cp310-abi3-win_amd64.whl", hash = "sha256:a916040e0b7f461ce987e4551fed9eea5914b4fbb5af907b1d9e80db71fadeb5", size = 51822748, upload-time = "2026-04-18T05:24:09.384Z" }, - { url = "https://files.pythonhosted.org/packages/06/ce/c16ef8fd3030b7342032b040fab21a42f6fee57e47ee7f41e2f1a1e36f01/polars_runtime_32-1.40.0-cp310-abi3-win_arm64.whl", hash = "sha256:719c64eecde24a95aa3599eb9c8efc98c1499bab7ef9c01cbbe8939cd583e654", size = 45819617, upload-time = "2026-04-18T05:24:13.214Z" }, -] - -[[package]] -name = "prompt-toolkit" -version = "3.0.52" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "wcwidth" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = 
"sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, -] - -[[package]] -name = "ptyprocess" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, -] - -[[package]] -name = "pure-eval" -version = "0.2.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, -] - -[[package]] -name = "pygments" -version = "2.20.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, -] - -[[package]] -name = "stack-data" -version = "0.6.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "asttokens" }, - { name = "executing" }, - { name = "pure-eval" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, -] - -[[package]] -name = "traitlets" -version = "5.14.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = 
"sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, -] - -[[package]] -name = "ty" -version = "0.0.32" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/85/7e/2aa791c9ae7b8cd5024cd4122e92267f664ca954cea3def3211919fa3c1f/ty-0.0.32.tar.gz", hash = "sha256:8743174c5f920f6700a4a0c9de140109189192ba16226884cd50095b43b8a45c", size = 5522294, upload-time = "2026-04-20T19:29:01.626Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/eb/1075dc6a49d7acbe2584ae4d5b410c41b1f177a5adcc567e09eca4c69000/ty-0.0.32-py3-none-linux_armv6l.whl", hash = "sha256:dacbc2f6cd698d488ae7436838ff929570455bf94bfa4d9fe57a630c552aff83", size = 10902959, upload-time = "2026-04-20T19:28:31.907Z" }, - { url = "https://files.pythonhosted.org/packages/33/d2/c35fc8bc66e98d1ee9b0f8ed319bf743e450e1f1e997574b178fab75670f/ty-0.0.32-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:914bbc4f605ce2a9e2a78982e28fae1d3359a169d141f9dc3b4c7749cd5eca81", size = 10726172, upload-time = "2026-04-20T19:28:44.765Z" }, - { url = "https://files.pythonhosted.org/packages/96/32/c827da3ca480456fb02d8cea68a2609273b6c220fea0be9a4c8d8470b86e/ty-0.0.32-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4787ac9fe1f86b1f3133f5c6732adbe2df5668b50c679ac6e2d98cd284da812f", size = 10163701, upload-time = "2026-04-20T19:28:27.005Z" }, - { url = "https://files.pythonhosted.org/packages/ba/9e/2734478fbdb90c160cb2813a3916a16a2af5c1e231f87d635f6131d781fb/ty-0.0.32-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8ea0a728af99fe40dd744cba6441a2404f80b7f4bde17aa6da393810af5ea57", size = 10656220, upload-time = "2026-04-20T19:29:03.814Z" }, - { url = "https://files.pythonhosted.org/packages/44/9f/0007da2d35e424debe7e9f86ffbc1ab7f60983cfbc5f0411324ab2de5292/ty-0.0.32-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:2850561f9b018ae33d7e5bbfa0ac414d3c518513edcffe43877dc9801446b9c5", size = 10696086, upload-time = "2026-04-20T19:28:46.829Z" }, - { url = "https://files.pythonhosted.org/packages/3b/5e/ce5fd4ec803222ae3e69a76d2a2db2eed55e19f5b131702b9789ef45f93d/ty-0.0.32-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b5fa2fb3c614349ee211d36476b49d88c5ef79a687cdb91b2872ad023b94d2f8", size = 11184800, upload-time = "2026-04-20T19:28:42.57Z" }, - { url = "https://files.pythonhosted.org/packages/6c/46/ebcf67a5999421331214aac51a7464db42de2be15bbe929c612a3ed0b039/ty-0.0.32-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b89969307ab2417d41c9be8059dd79feea577234e1e10d35132f5495e0d42c6", size = 11718718, upload-time = "2026-04-20T19:28:36.433Z" }, - { url = "https://files.pythonhosted.org/packages/18/2c/2141c86ed0ce0962b45cefb658a95e734f59759d47f20afdcd9c732910a1/ty-0.0.32-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b59868ede9b1d69a088f0d695df52a0061f95fa7baa1d5e0dc6fc9cf06e1334", size = 11346369, upload-time = "2026-04-20T19:28:48.967Z" }, - { url = "https://files.pythonhosted.org/packages/7a/da/ed6f772339cf29bd9a46def9d6db5084689eb574ee4d150ff704224c1ed8/ty-0.0.32-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8300caf35345498e9b9b03e550bba03cee8f5f5f8ab4c83c3b1ff1b7403b7d3a", size = 11280714, upload-time = "2026-04-20T19:28:51.516Z" }, - { url = "https://files.pythonhosted.org/packages/da/9b/c6813987edf4816a40e0c8e408b555f97d3f267c7b3a1688c8bbdf65609c/ty-0.0.32-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:583c7094f4574b02f724db924f98b804d1387a0bd9405ecb5e078cc0f47fbcfb", size = 10638806, upload-time = "2026-04-20T19:28:29.651Z" }, - { url = "https://files.pythonhosted.org/packages/4e/d4/0cefcbd2ad0f3d51762ccf58e652ec7da146eb6ae34f87228f6254bbb8be/ty-0.0.32-py3-none-musllinux_1_2_armv7l.whl", hash = 
"sha256:e44ebe1bb4143a5628bc4db67ac0dfebe14594af671e4ee66f6f2e983da56501", size = 10726106, upload-time = "2026-04-20T19:29:06.3Z" }, - { url = "https://files.pythonhosted.org/packages/32/ad/2c8a97f91f06311f4367400f7d13534bbda2522c73c99a3e4c0757dff9b8/ty-0.0.32-py3-none-musllinux_1_2_i686.whl", hash = "sha256:06f17ada3e069cba6148342ef88e9929156beca8473e8d4f101b68f66c75643e", size = 10872951, upload-time = "2026-04-20T19:28:34.077Z" }, - { url = "https://files.pythonhosted.org/packages/ba/68/42293f9248106dd51875120971a5cc6ea315c2c4dcfb8e59aa063aa0af26/ty-0.0.32-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e96e60fa556cec04f15d7ea62d2ceee5982bd389233e961ab9fd42304e278175", size = 11363334, upload-time = "2026-04-20T19:28:54.036Z" }, - { url = "https://files.pythonhosted.org/packages/df/92/be9abf4d3e589ad5023e2ea965b93e204ec856420d46adf73c5c36c04678/ty-0.0.32-py3-none-win32.whl", hash = "sha256:2ff2ebb4986b24aebcf1444db7db5ca41b36086040e95eea9f8fb851c11e805c", size = 10260689, upload-time = "2026-04-20T19:28:56.541Z" }, - { url = "https://files.pythonhosted.org/packages/14/61/dc86acea899349d2579cb8419aecedd83dc504d7d6a10df65eef546c8300/ty-0.0.32-py3-none-win_amd64.whl", hash = "sha256:ba7284a4a954b598c1b31500352b3ec1f89bff533825592b5958848226fdc7ee", size = 11255371, upload-time = "2026-04-20T19:28:39.917Z" }, - { url = "https://files.pythonhosted.org/packages/43/01/beffec56d71ca25b343ede63adb076456b5b3e211f1c066452a44cd120b3/ty-0.0.32-py3-none-win_arm64.whl", hash = "sha256:7e10aadbdbda989a7d567ee6a37f8b98d4d542e31e3b190a2879fd581f75d658", size = 10658087, upload-time = "2026-04-20T19:28:59.286Z" }, -] - -[[package]] -name = "wcwidth" -version = "0.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = 
"2026-02-06T19:19:40.919Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, -] diff --git a/pyproject.toml b/pyproject.toml index 5361d69..7d4654e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,9 +3,10 @@ name = "kanta-lab-preprocessing" version = "0.1.0" description = "Add your description here" readme = "README.md" -requires-python = ">=3.13" +requires-python = ">=3.12" dependencies = [ "pandas>=3.0.2", + "polars>=1.40.0", ] [dependency-groups] diff --git a/mypolars/README.md b/src/kanta/config.py similarity index 100% rename from mypolars/README.md rename to src/kanta/config.py diff --git a/src/kanta/intake/__init__.py b/src/kanta/intake/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mypolars/main.py b/src/kanta/intake/assemble.py similarity index 100% rename from mypolars/main.py rename to src/kanta/intake/assemble.py diff --git a/uv.lock b/uv.lock index f1334b2..209cda9 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,6 @@ version = 1 revision = 3 -requires-python = ">=3.13" +requires-python = ">=3.12" resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'win32'", "python_full_version >= '3.14' and sys_platform == 'emscripten'", @@ -106,6 +106,7 @@ version = "0.1.0" source = { editable = "." 
} dependencies = [ { name = "pandas" }, + { name = "polars" }, ] [package.dev-dependencies] @@ -116,7 +117,10 @@ dev = [ ] [package.metadata] -requires-dist = [{ name = "pandas", specifier = ">=3.0.2" }] +requires-dist = [ + { name = "pandas", specifier = ">=3.0.2" }, + { name = "polars", specifier = ">=1.40.0" }, +] [package.metadata.requires-dev] dev = [ @@ -143,6 +147,17 @@ version = "2.4.4" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587, upload-time = "2026-03-29T13:22:01.298Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272, upload-time = "2026-03-29T13:18:49.223Z" }, + { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573, upload-time = "2026-03-29T13:18:52.629Z" }, + { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782, upload-time = "2026-03-29T13:18:55.579Z" }, + { url = "https://files.pythonhosted.org/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8", size = 6552038, upload-time = "2026-03-29T13:18:57.769Z" }, + { url = 
"https://files.pythonhosted.org/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121", size = 15670666, upload-time = "2026-03-29T13:19:00.341Z" }, + { url = "https://files.pythonhosted.org/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e", size = 16645480, upload-time = "2026-03-29T13:19:03.63Z" }, + { url = "https://files.pythonhosted.org/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44", size = 17020036, upload-time = "2026-03-29T13:19:07.428Z" }, + { url = "https://files.pythonhosted.org/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d", size = 18368643, upload-time = "2026-03-29T13:19:10.775Z" }, + { url = "https://files.pythonhosted.org/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827", size = 5961117, upload-time = "2026-03-29T13:19:13.464Z" }, + { url = "https://files.pythonhosted.org/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a", size = 12320584, upload-time = "2026-03-29T13:19:16.155Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec", size = 10221450, upload-time = "2026-03-29T13:19:18.994Z" }, { url = "https://files.pythonhosted.org/packages/14/1d/d0a583ce4fefcc3308806a749a536c201ed6b5ad6e1322e227ee4848979d/numpy-2.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:08f2e31ed5e6f04b118e49821397f12767934cfdd12a1ce86a058f91e004ee50", size = 16684933, upload-time = "2026-03-29T13:19:22.47Z" }, { url = "https://files.pythonhosted.org/packages/c1/62/2b7a48fbb745d344742c0277f01286dead15f3f68e4f359fbfcf7b48f70f/numpy-2.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e823b8b6edc81e747526f70f71a9c0a07ac4e7ad13020aa736bb7c9d67196115", size = 14694532, upload-time = "2026-03-29T13:19:25.581Z" }, { url = "https://files.pythonhosted.org/packages/e5/87/499737bfba066b4a3bebff24a8f1c5b2dee410b209bc6668c9be692580f0/numpy-2.4.4-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4a19d9dba1a76618dd86b164d608566f393f8ec6ac7c44f0cc879011c45e65af", size = 5199661, upload-time = "2026-03-29T13:19:28.31Z" }, @@ -207,6 +222,14 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/da/99/b342345300f13440fe9fe385c3c481e2d9a595ee3bab4d3219247ac94e9a/pandas-3.0.2.tar.gz", hash = "sha256:f4753e73e34c8d83221ba58f232433fca2748be8b18dbca02d242ed153945043", size = 4645855, upload-time = "2026-03-31T06:48:30.816Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/f3/b0/c20bd4d6d3f736e6bd6b55794e9cd0a617b858eaad27c8f410ea05d953b7/pandas-3.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:232a70ebb568c0c4d2db4584f338c1577d81e3af63292208d615907b698a0f18", size = 10347921, upload-time = "2026-03-31T06:46:33.36Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/d0/4831af68ce30cc2d03c697bea8450e3225a835ef497d0d70f31b8cdde965/pandas-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:970762605cff1ca0d3f71ed4f3a769ea8f85fc8e6348f6e110b8fea7e6eb5a14", size = 9888127, upload-time = "2026-03-31T06:46:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/61/a9/16ea9346e1fc4a96e2896242d9bc674764fb9049b0044c0132502f7a771e/pandas-3.0.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aff4e6f4d722e0652707d7bcb190c445fe58428500c6d16005b02401764b1b3d", size = 10399577, upload-time = "2026-03-31T06:46:39.224Z" }, + { url = "https://files.pythonhosted.org/packages/c4/a8/3a61a721472959ab0ce865ef05d10b0d6bfe27ce8801c99f33d4fa996e65/pandas-3.0.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef8b27695c3d3dc78403c9a7d5e59a62d5464a7e1123b4e0042763f7104dc74f", size = 10880030, upload-time = "2026-03-31T06:46:42.412Z" }, + { url = "https://files.pythonhosted.org/packages/da/65/7225c0ea4d6ce9cb2160a7fb7f39804871049f016e74782e5dade4d14109/pandas-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f8d68083e49e16b84734eb1a4dcae4259a75c90fb6e2251ab9a00b61120c06ab", size = 11409468, upload-time = "2026-03-31T06:46:45.2Z" }, + { url = "https://files.pythonhosted.org/packages/fa/5b/46e7c76032639f2132359b5cf4c785dd8cf9aea5ea64699eac752f02b9db/pandas-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:32cc41f310ebd4a296d93515fcac312216adfedb1894e879303987b8f1e2b97d", size = 11936381, upload-time = "2026-03-31T06:46:48.293Z" }, + { url = "https://files.pythonhosted.org/packages/7b/8b/721a9cff6fa6a91b162eb51019c6243b82b3226c71bb6c8ef4a9bd65cbc6/pandas-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:a4785e1d6547d8427c5208b748ae2efb64659a21bd82bf440d4262d02bfa02a4", size = 9744993, upload-time = "2026-03-31T06:46:51.488Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/18/7f0bd34ae27b28159aa80f2a6799f47fda34f7fb938a76e20c7b7fe3b200/pandas-3.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:08504503f7101300107ecdc8df73658e4347586db5cfdadabc1592e9d7e7a0fd", size = 9056118, upload-time = "2026-03-31T06:46:54.548Z" }, { url = "https://files.pythonhosted.org/packages/bf/ca/3e639a1ea6fcd0617ca4e8ca45f62a74de33a56ae6cd552735470b22c8d3/pandas-3.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b5918ba197c951dec132b0c5929a00c0bf05d5942f590d3c10a807f6e15a57d3", size = 10321105, upload-time = "2026-03-31T06:46:57.327Z" }, { url = "https://files.pythonhosted.org/packages/0b/77/dbc82ff2fb0e63c6564356682bf201edff0ba16c98630d21a1fb312a8182/pandas-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d606a041c89c0a474a4702d532ab7e73a14fe35c8d427b972a625c8e46373668", size = 9864088, upload-time = "2026-03-31T06:46:59.935Z" }, { url = "https://files.pythonhosted.org/packages/5c/2b/341f1b04bbca2e17e13cd3f08c215b70ef2c60c5356ef1e8c6857449edc7/pandas-3.0.2-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:710246ba0616e86891b58ab95f2495143bb2bc83ab6b06747c74216f583a6ac9", size = 10369066, upload-time = "2026-03-31T06:47:02.792Z" }, @@ -270,6 +293,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "polars" +version = "1.40.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "polars-runtime-32" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/8c/bc9bc948058348ed43117cecc3007cd608f395915dae8a00974579a5dab1/polars-1.40.1.tar.gz", hash = "sha256:ab2694134b137596b5a59bfd7b4c54ebbc9b59f9403127f18e32d363777552e8", size = 733574, upload-time = 
"2026-04-22T19:15:55.507Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/91/74fc60d94488685a92ac9d49d7ec55f3e91fe9b77942a6235a5fa7f249c3/polars-1.40.1-py3-none-any.whl", hash = "sha256:c0f861219d1319cdea45c4ce4d30355a47176b8f98dcedf95ea8269f131b8abd", size = 828723, upload-time = "2026-04-22T19:14:25.452Z" }, +] + +[[package]] +name = "polars-runtime-32" +version = "1.40.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ba/26d40f039be9f552b5fd7365a621bdfc0f8e912ef77094ae4693491b0bae/polars_runtime_32-1.40.1.tar.gz", hash = "sha256:37f3065615d1bf90d03b5326222df4c5c1f8a5d33e50470aa588e3465e6eb814", size = 2935843, upload-time = "2026-04-22T19:15:57.26Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7d/46/22c8af5eed68ac2eeb556e0fa3ca8a7b798e984ceff4450888f3b5ac61fd/polars_runtime_32-1.40.1-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b748ef652270cc49e9e69f99a035e0eb4d5f856d42bcd6ac4d9d80a40142aa1e", size = 52098755, upload-time = "2026-04-22T19:14:28.555Z" }, + { url = "https://files.pythonhosted.org/packages/c6/3e/48599a38009ca60ff82a6f38c8a621ce3c0286aa7397c7d79e741bd9060e/polars_runtime_32-1.40.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:d249b3743e05986060cec0a7aaa542d020df6c6b876e556023a310efd581f9be", size = 46367542, upload-time = "2026-04-22T19:14:32.433Z" }, + { url = "https://files.pythonhosted.org/packages/43/e9/384bc069367a1a36ee31c13782c178dbd039b2b873b772d4a0fc23a2373d/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5987b30e7aa1059d069498496e8dda35afd592b0ac3d46ed87e3ff8df1ad652c", size = 50252104, upload-time = "2026-04-22T19:14:35.945Z" }, + { url = "https://files.pythonhosted.org/packages/15/ef/7d57ceb0651af74194e97ed6583e148d352f03d696090221b8059cdfc90b/polars_runtime_32-1.40.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8d7f42a8b3f16fc66002cc0f6516f7dd7653396886ae0ed362ab95c0b3408b59", size = 56250788, upload-time = "2026-04-22T19:14:39.743Z" }, + { url = "https://files.pythonhosted.org/packages/10/0f/e4b3ffc748827a14a474ec9c42e45c066050e440fec57e914091d9adda75/polars_runtime_32-1.40.1-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e5f7becc237a7ec9d9a10878dc8e54b73bbf4e2d94a2991c37d7a0b38590d8f9", size = 50432590, upload-time = "2026-04-22T19:14:43.388Z" }, + { url = "https://files.pythonhosted.org/packages/d9/0b/b8d95fbed869fa4caabe9c400e4210374913b376e925e96fdcfa9be6416b/polars_runtime_32-1.40.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:992d14cf191dde043d36fbdbc98a65e43fbc7e9a5024cecd45f838ac4988c1ee", size = 54155564, upload-time = "2026-04-22T19:14:47.239Z" }, + { url = "https://files.pythonhosted.org/packages/06/d9/d091d8fb5cbed5e9536adfed955c4c89987a4cc3b8e73ae4532402b91c74/polars_runtime_32-1.40.1-cp310-abi3-win_amd64.whl", hash = "sha256:f78bb2abd00101cbb23cc0cb068f7e36e081057a15d2ec2dde3dda280709f030", size = 51829755, upload-time = "2026-04-22T19:14:50.85Z" }, + { url = "https://files.pythonhosted.org/packages/65/ad/b33c3022a394f3eb55c3310597cec615412a8a33880055eee191d154a628/polars_runtime_32-1.40.1-cp310-abi3-win_arm64.whl", hash = "sha256:b5cbfaf6b085b420b4bfcbe24e8f665076d1cccfdb80c0484c02a023ce205537", size = 45822104, upload-time = "2026-04-22T19:14:54.192Z" }, +] + [[package]] name = "prompt-toolkit" version = "3.0.52" From 6876bad17575c789086f9c5fb29d842a1a814b34 Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 19 May 2026 09:48:17 +0000 Subject: [PATCH 06/22] intial rewrite of sort-dedup from WDL to Polars --- src/kanta/config.py | 0 src/kanta/intake/assemble.py | 3 -- src/kanta/intake/tidy.py | 88 ++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) delete mode 100644 src/kanta/config.py create mode 100644 src/kanta/intake/tidy.py diff --git a/src/kanta/config.py b/src/kanta/config.py deleted file 
mode 100644 index e69de29..0000000 diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py index cbaac33..38a096e 100644 --- a/src/kanta/intake/assemble.py +++ b/src/kanta/intake/assemble.py @@ -10,9 +10,6 @@ import polars as pl -# TODO -# 5. Post WDL sort-dup: subset columns, join SEX, sort, output unique/duplicates/error rows - EXPECTED_COLUMNS_MAIN = [ "FINNGENID", "EVENT_AGE", diff --git a/src/kanta/intake/tidy.py b/src/kanta/intake/tidy.py new file mode 100644 index 0000000..488c0f3 --- /dev/null +++ b/src/kanta/intake/tidy.py @@ -0,0 +1,88 @@ +# TODO: in assemble: get rid of main. / freetext. prefixes for columns that are in common, since we check they have the same values (right?), then change it here. +from argparse import ArgumentParser +from pathlib import Path + +import polars as pl + + +COLUMNS_OUTPUT = [ + "main.FINNGENID", + "main.EVENT_AGE", + "main.tutkimuskoodistonjarjestelma", + "main.paikallinentutkimusnimike_selite", + "main.tutkimustulosarvo", + "main.tutkimustulosyksikko", + "main.tutkimusvastauksentila", + "main.tuloksenpoikkeavuus", + "main.viitearvoryhma", + "main.viitevalialkuarvo", + "main.viitevalialkuyksikko", + "main.viitevaliloppuarvo", + "main.viitevaliloppuyksikko", + "freetext.tutkimustulosteksti", + "main.paikallinentutkimusnimike_koodi", + "main.laboratoriotutkimusnimike", + "main.APPROX_EVENT_DAY", + "main.TIME", + "main._rowid", +] + +COLUMNS_UNIQUENESS_SORT = [ + "main.FINNGENID", + "main.APPROX_EVENT_DAY", + "main.TIME", + "main.laboratoriotutkimusnimike", + "main.paikallinentutkimusnimike_koodi", + "main.tutkimusvastauksentila", + "main.tutkimustulosarvo", + "main.tutkimustulosyksikko", +] + + +def main(args): + df_pheno = pl.scan_csv( + args.phenotype_file, + infer_schema=False, + separator="\t", + ).select("FINNGENID", "SEX") + + ( + pl.scan_parquet(args.assembled_file) + .select(COLUMNS_OUTPUT) + # Dedup rows + # NOTE(Vincent 2026-05-20) Here the deduplication is done on whole data, + # not just on 
adjacent lines as was done in the previous implementation. + .unique(subset=COLUMNS_UNIQUENESS_SORT) + # Sort + .sort(by=COLUMNS_UNIQUENESS_SORT) + # join SEX + .join(df_pheno, left_on="main.FINNGENID", right_on="FINNGENID", how="left") + .sink_parquet(args.output_file) + ) + + # TODO validation + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--assembled-file", + help="Path to assembled file from the intake.assemble step (Parquet)", + required=True, + type=Path, + ) + parser.add_argument( + "--phenotype-file", + help="Path to phenotype file with SEX column (.txt.gz)", + required=True, + type=Path, + ) + parser.add_argument( + "--output-file", + help="Path to write the tidied up output file (Parquet)", + required=True, + type=Path, + ) + args = parser.parse_args() + + main(args) From 998f8b35cd7db5cb57e81d90811a802dbc0e9df0 Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 21 May 2026 05:09:06 +0000 Subject: [PATCH 07/22] use bucket partitioning to implement sort+dedup --- src/kanta/intake/tidy.py | 88 ------------------- src/kanta/intake/tidyup.py | 173 +++++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+), 88 deletions(-) delete mode 100644 src/kanta/intake/tidy.py create mode 100644 src/kanta/intake/tidyup.py diff --git a/src/kanta/intake/tidy.py b/src/kanta/intake/tidy.py deleted file mode 100644 index 488c0f3..0000000 --- a/src/kanta/intake/tidy.py +++ /dev/null @@ -1,88 +0,0 @@ -# TODO: in assemble: get rid of main. / freetext. prefixes for columns that are in common, since we check they have the same values (right?), then change it here. 
-from argparse import ArgumentParser -from pathlib import Path - -import polars as pl - - -COLUMNS_OUTPUT = [ - "main.FINNGENID", - "main.EVENT_AGE", - "main.tutkimuskoodistonjarjestelma", - "main.paikallinentutkimusnimike_selite", - "main.tutkimustulosarvo", - "main.tutkimustulosyksikko", - "main.tutkimusvastauksentila", - "main.tuloksenpoikkeavuus", - "main.viitearvoryhma", - "main.viitevalialkuarvo", - "main.viitevalialkuyksikko", - "main.viitevaliloppuarvo", - "main.viitevaliloppuyksikko", - "freetext.tutkimustulosteksti", - "main.paikallinentutkimusnimike_koodi", - "main.laboratoriotutkimusnimike", - "main.APPROX_EVENT_DAY", - "main.TIME", - "main._rowid", -] - -COLUMNS_UNIQUENESS_SORT = [ - "main.FINNGENID", - "main.APPROX_EVENT_DAY", - "main.TIME", - "main.laboratoriotutkimusnimike", - "main.paikallinentutkimusnimike_koodi", - "main.tutkimusvastauksentila", - "main.tutkimustulosarvo", - "main.tutkimustulosyksikko", -] - - -def main(args): - df_pheno = pl.scan_csv( - args.phenotype_file, - infer_schema=False, - separator="\t", - ).select("FINNGENID", "SEX") - - ( - pl.scan_parquet(args.assembled_file) - .select(COLUMNS_OUTPUT) - # Dedup rows - # NOTE(Vincent 2026-05-20) Here the deduplication is done on whole data, - # not just on adjacent lines as was done in the previous implementation. 
- .unique(subset=COLUMNS_UNIQUENESS_SORT) - # Sort - .sort(by=COLUMNS_UNIQUENESS_SORT) - # join SEX - .join(df_pheno, left_on="main.FINNGENID", right_on="FINNGENID", how="left") - .sink_parquet(args.output_file) - ) - - # TODO validation - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument( - "--assembled-file", - help="Path to assembled file from the intake.assemble step (Parquet)", - required=True, - type=Path, - ) - parser.add_argument( - "--phenotype-file", - help="Path to phenotype file with SEX column (.txt.gz)", - required=True, - type=Path, - ) - parser.add_argument( - "--output-file", - help="Path to write the tidied up output file (Parquet)", - required=True, - type=Path, - ) - args = parser.parse_args() - - main(args) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py new file mode 100644 index 0000000..5f9ddbf --- /dev/null +++ b/src/kanta/intake/tidyup.py @@ -0,0 +1,173 @@ +import tempfile +from argparse import ArgumentParser +from pathlib import Path + +import polars as pl + + +COLUMNS_UNIQUENESS_SORT = [ + "FINNGENID", + "APPROX_EVENT_DAY", + "TIME", + "laboratoriotutkimusnimike", + "paikallinentutkimusnimike_koodi", + "tutkimusvastauksentila", + "tutkimustulosarvo", + "tutkimustulosyksikko", +] + + +def main(args): + temp_dir = Path(tempfile.mkdtemp()) + print(f">> {temp_dir=}") + + temp_dir_partition = temp_dir / "partition" + temp_dir_partition.mkdir() + + temp_dir_tidyup = temp_dir / "tidyup" + temp_dir_tidyup.mkdir() + + print("# Consolidate") + consolidated_file = consolidate_columns(args.assembled_file, args.output_dir) + + print("# Partition") + partition(consolidated_file, temp_dir_partition, args.partition_n_buckets) + + print("# Tidy-up") + for bucket_file in temp_dir_partition.glob("bucket_id__*.parquet"): + ( + pl.scan_parquet(bucket_file) + .pipe(tidy_up) + .sink_parquet(temp_dir_tidyup / bucket_file.name) + ) + + df_pheno = pl.scan_csv( + args.phenotype_file, + infer_schema=False, + 
separator="\t", + ).select("FINNGENID", "SEX") + + print("# Concatenate + Unique + SEX join") + bucket_files = [] + for bucket_id in range(args.partition_n_buckets): + bucket_files.append(temp_dir_tidyup / f"bucket_id__{bucket_id}.parquet") + + ( + # TODO: verify the file order of `bucket_files` is kept + pl.scan_parquet(bucket_files) + # Join SEX + .join( + df_pheno, + left_on="FINNGENID", + right_on="FINNGENID", + how="left", + maintain_order="left", + ) + .sink_parquet("/tmp/out.parquet") + ) + + # TODO validation + # + print("<< end") + + # TODO: keep or delete intermediate files in temp_dir based on CLI flag + # with shutil.rmtree + + +def consolidate_columns(assembled_file: Path, output_dir: Path) -> Path: + """Remove unecessary columns form the assembled file and rename the ones we will keep.""" + output_file = output_dir / "consolidated.parquet" + + columns = { + "main.FINNGENID": "FINNGENID", + "main.EVENT_AGE": "EVENT_AGE", + "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma", + "main.paikallinentutkimusnimike_selite": "paikallinentutkimusnimike_selite", + "main.tutkimustulosarvo": "tutkimustulosarvo", + "main.tutkimustulosyksikko": "tutkimustulosyksikko", + "main.tutkimusvastauksentila": "tutkimusvastauksentila", + "main.tuloksenpoikkeavuus": "tuloksenpoikkeavuus", + "main.viitearvoryhma": "viitearvoryhma", + "main.viitevalialkuarvo": "viitevalialkuarvo", + "main.viitevalialkuyksikko": "viitevalialkuyksikko", + "main.viitevaliloppuarvo": "viitevaliloppuarvo", + "main.viitevaliloppuyksikko": "viitevaliloppuyksikko", + "freetext.tutkimustulosteksti": "tutkimustulosteksti", + "main.paikallinentutkimusnimike_koodi": "paikallinentutkimusnimike_koodi", + "main.laboratoriotutkimusnimike": "laboratoriotutkimusnimike", + "main.APPROX_EVENT_DAY": "APPROX_EVENT_DAY", + "main.TIME": "TIME", + } + + ( + pl.scan_parquet(assembled_file) + .with_columns( + ( + pl.col("main._rowid").cast(pl.String) + + "@" + + pl.col("main._filename") + + "|" + + 
pl.col("freetext._rowid").cast(pl.String) + + "@" + + pl.col("freetext._filename") + ).alias("_rowid") + ) + .select(pl.col(list(columns.keys()) + ["_rowid"])) + .rename(columns) + .sink_parquet(output_file) + ) + + return output_file + + +def partition(assembled_file: Path, temp_dir: Path, n_buckets): + for bucket_id in range(n_buckets): + ( + pl.scan_parquet(assembled_file) + .filter(pl.col("FINNGENID").hash() % n_buckets == bucket_id) + .sink_parquet(temp_dir / f"bucket_id__{bucket_id}.parquet") + ) + + +def tidy_up(frame: pl.LazyFrame | pl.DataFrame): + return ( + frame.sort(by=COLUMNS_UNIQUENESS_SORT) + # Dedup rows + # NOTE(Vincent 2026-05-20) The previous implementation (WDL/Python) was + # doing the dedup on adjacent lines. Here the deduplication is not done + # explicitely on adjacent lines (since polars `unique` does it on the + # full data), though the result should be the same. + .unique(subset=COLUMNS_UNIQUENESS_SORT, keep="first") + ) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--assembled-file", + help="Path to assembled file from the intake.assemble step (Parquet)", + required=True, + type=Path, + ) + parser.add_argument( + "--phenotype-file", + help="Path to phenotype file with SEX column (.txt.gz)", + required=True, + type=Path, + ) + parser.add_argument( + "--partition-n-buckets", + help="How many buckets to partition the data into to spread the sort+unique computations.", + required=False, + type=int, + default=32 + ) + parser.add_argument( + "--output-dir", + help="Path to write the output files", + required=True, + type=Path, + ) + args = parser.parse_args() + + main(args) From 55a35095a37146f0220989deeb9256a9ecac976e Mon Sep 17 00:00:00 2001 From: Vincent Date: Mon, 25 May 2026 11:30:58 +0000 Subject: [PATCH 08/22] add parameter to keep intermediate files --- src/kanta/intake/tidyup.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/kanta/intake/tidyup.py 
b/src/kanta/intake/tidyup.py index 5f9ddbf..d7c38e2 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -1,4 +1,5 @@ import tempfile +import shutil from argparse import ArgumentParser from pathlib import Path @@ -68,10 +69,11 @@ def main(args): # TODO validation # - print("<< end") - # TODO: keep or delete intermediate files in temp_dir based on CLI flag - # with shutil.rmtree + if not args.keep_intermediate_files: + shutil.rmtree(temp_dir) + + print("<< end") def consolidate_columns(assembled_file: Path, output_dir: Path) -> Path: @@ -155,6 +157,12 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame): required=True, type=Path, ) + parser.add_argument( + "--output-dir", + help="Path to write the output files", + required=True, + type=Path, + ) parser.add_argument( "--partition-n-buckets", help="How many buckets to partition the data into to spread the sort+unique computations.", @@ -163,10 +171,9 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame): default=32 ) parser.add_argument( - "--output-dir", - help="Path to write the output files", - required=True, - type=Path, + "--keep-intermediate-files", + help="Keep intermediate files, useful for debugging.", + action="store_true", ) args = parser.parse_args() From b30983121558ec055052112b282e1696286a8f65 Mon Sep 17 00:00:00 2001 From: Vincent Date: Mon, 25 May 2026 12:32:32 +0000 Subject: [PATCH 09/22] fix output paths --- src/kanta/intake/tidyup.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index d7c38e2..cf3bec4 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -1,6 +1,13 @@ +""" +Differences from the WDL implementation: +- no logging of duplicates/err lines +- outputs to a single parquet file, no .txt.gz, as this is very slow. 
+""" + import tempfile import shutil from argparse import ArgumentParser +from datetime import date from pathlib import Path import polars as pl @@ -19,9 +26,17 @@ def main(args): + # Set up output file and temporary directory for intermediate files + today = date.today() + output_file_stem = ( + args.output_dir / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}" + ) + temp_dir = Path(tempfile.mkdtemp()) print(f">> {temp_dir=}") + temp_file_consolidate = temp_dir / "consolidated.parquet" + temp_dir_partition = temp_dir / "partition" temp_dir_partition.mkdir() @@ -29,7 +44,7 @@ def main(args): temp_dir_tidyup.mkdir() print("# Consolidate") - consolidated_file = consolidate_columns(args.assembled_file, args.output_dir) + consolidated_file = consolidate_columns(args.assembled_file, temp_file_consolidate) print("# Partition") partition(consolidated_file, temp_dir_partition, args.partition_n_buckets) @@ -54,7 +69,6 @@ def main(args): bucket_files.append(temp_dir_tidyup / f"bucket_id__{bucket_id}.parquet") ( - # TODO: verify the file order of `bucket_files` is kept pl.scan_parquet(bucket_files) # Join SEX .join( @@ -64,22 +78,17 @@ def main(args): how="left", maintain_order="left", ) - .sink_parquet("/tmp/out.parquet") + .sink_parquet(output_file_stem.with_suffix(".parquet")) ) - # TODO validation - # - if not args.keep_intermediate_files: shutil.rmtree(temp_dir) - + print("<< end") -def consolidate_columns(assembled_file: Path, output_dir: Path) -> Path: +def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: """Remove unecessary columns form the assembled file and rename the ones we will keep.""" - output_file = output_dir / "consolidated.parquet" - columns = { "main.FINNGENID": "FINNGENID", "main.EVENT_AGE": "EVENT_AGE", @@ -168,7 +177,7 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame): help="How many buckets to partition the data into to spread the sort+unique computations.", required=False, type=int, - default=32 + default=32, ) 
parser.add_argument( "--keep-intermediate-files", From ccd59451461baa82593d80acf268b62e94a6e2a3 Mon Sep 17 00:00:00 2001 From: Vincent Date: Mon, 25 May 2026 13:15:53 +0000 Subject: [PATCH 10/22] use output order of tidyup as _rowid --- src/kanta/intake/tidyup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index cf3bec4..cfd1185 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -2,6 +2,8 @@ Differences from the WDL implementation: - no logging of duplicates/err lines - outputs to a single parquet file, no .txt.gz, as this is very slow. +- uses CSV-aware parsing, robust to edge cases like new-line character inside + CSV values. """ import tempfile @@ -78,7 +80,9 @@ def main(args): how="left", maintain_order="left", ) - .sink_parquet(output_file_stem.with_suffix(".parquet")) + .with_row_index(name="_rowid", offset=1) + .drop("_rowid_consolidate_debug") + .sink_parquet(output_file_stem.with_name(output_file_stem.name + ".parquet")) ) if not args.keep_intermediate_files: @@ -121,9 +125,9 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: + pl.col("freetext._rowid").cast(pl.String) + "@" + pl.col("freetext._filename") - ).alias("_rowid") + ).alias("_rowid_consolidate_debug") ) - .select(pl.col(list(columns.keys()) + ["_rowid"])) + .select(pl.col(list(columns.keys()) + ["_rowid_consolidate_debug"])) .rename(columns) .sink_parquet(output_file) ) From 6d5bb82a72178832ec77dcf047ed220e6942592f Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 26 May 2026 10:31:44 +0000 Subject: [PATCH 11/22] add notes from benchmarks & adapt default N buckets --- src/kanta/intake/tidyup.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index cfd1185..3e512fb 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -1,9 +1,24 @@ """ 
-Differences from the WDL implementation: -- no logging of duplicates/err lines -- outputs to a single parquet file, no .txt.gz, as this is very slow. -- uses CSV-aware parsing, robust to edge cases like new-line character inside +Differences from the WDL implementation +======================================= +- No logging of duplicates/err lines. +- Outputs to a single parquet file, no .txt.gz, as this is very slow. +- Uses CSV-aware parsing, robust to edge cases like new-line character inside CSV values. + + +VM choice and performance +========================= +Best config: 32 CPUs / 32 GB RAM and use 24 buckets. Runs in 2-3 min. + +For lower specs, run with 16 or 8 CPUs and allocate 2 GB RAM per CPU, use 24 +buckets. Runs in 5-8 min. + +Lowest tested working spec: 8 CPUs / 8 GB RAM, 32 buckets. Runs in 6-12 min. + +If failing due to OOM in the sort+dedup stage, try increasing the bucket count. + +The GCP VM type appears to matter. N2D is about 2x faster than E2. """ import tempfile @@ -178,10 +193,10 @@ def tidy_up(frame: pl.LazyFrame | pl.DataFrame): ) parser.add_argument( "--partition-n-buckets", - help="How many buckets to partition the data into to spread the sort+unique computations.", + help="How many buckets to partition the data into to spread the sort+dedup computations.", required=False, type=int, - default=32, + default=24, ) parser.add_argument( "--keep-intermediate-files", From 2e18c2c54beb5d6aaad7e2a37e9e72dd2a85cc79 Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 26 May 2026 10:55:48 +0000 Subject: [PATCH 12/22] rename tidy-up step to sort + dedup --- src/kanta/intake/tidyup.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index 3e512fb..db2e7b8 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -57,8 +57,8 @@ def main(args): temp_dir_partition = temp_dir / "partition" temp_dir_partition.mkdir() - temp_dir_tidyup = temp_dir / 
"tidyup" - temp_dir_tidyup.mkdir() + temp_dir_sort_dedup = temp_dir / "sort_dedup" + temp_dir_sort_dedup.mkdir() print("# Consolidate") consolidated_file = consolidate_columns(args.assembled_file, temp_file_consolidate) @@ -66,12 +66,12 @@ def main(args): print("# Partition") partition(consolidated_file, temp_dir_partition, args.partition_n_buckets) - print("# Tidy-up") + print("# Sort + Dedup") for bucket_file in temp_dir_partition.glob("bucket_id__*.parquet"): ( pl.scan_parquet(bucket_file) - .pipe(tidy_up) - .sink_parquet(temp_dir_tidyup / bucket_file.name) + .pipe(sort_dedup) + .sink_parquet(temp_dir_sort_dedup / bucket_file.name) ) df_pheno = pl.scan_csv( @@ -83,7 +83,7 @@ def main(args): print("# Concatenate + Unique + SEX join") bucket_files = [] for bucket_id in range(args.partition_n_buckets): - bucket_files.append(temp_dir_tidyup / f"bucket_id__{bucket_id}.parquet") + bucket_files.append(temp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet") ( pl.scan_parquet(bucket_files) @@ -159,7 +159,7 @@ def partition(assembled_file: Path, temp_dir: Path, n_buckets): ) -def tidy_up(frame: pl.LazyFrame | pl.DataFrame): +def sort_dedup(frame: pl.LazyFrame | pl.DataFrame): return ( frame.sort(by=COLUMNS_UNIQUENESS_SORT) # Dedup rows From 955f3ee6d6589b62acb80ce4fbb44842ab77ed2e Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 26 May 2026 10:56:13 +0000 Subject: [PATCH 13/22] improve info message of intake.tidyup --- src/kanta/intake/tidyup.py | 87 +++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index db2e7b8..b828610 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -5,7 +5,7 @@ - Outputs to a single parquet file, no .txt.gz, as this is very slow. - Uses CSV-aware parsing, robust to edge cases like new-line character inside CSV values. 
- + VM choice and performance ========================= @@ -45,12 +45,18 @@ def main(args): # Set up output file and temporary directory for intermediate files today = date.today() - output_file_stem = ( - args.output_dir / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}" + output_file = ( + args.output_dir + / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}.parquet" ) temp_dir = Path(tempfile.mkdtemp()) - print(f">> {temp_dir=}") + + print("# Run info") + print(f"- Partition into N buckets: {args.partition_n_buckets}") + print(f"- Directory for intermediate files: {temp_dir}") + print(f"- Output directory: {args.output_dir}") + print() temp_file_consolidate = temp_dir / "consolidated.parquet" @@ -97,13 +103,48 @@ def main(args): ) .with_row_index(name="_rowid", offset=1) .drop("_rowid_consolidate_debug") - .sink_parquet(output_file_stem.with_name(output_file_stem.name + ".parquet")) + .sink_parquet(output_file) ) if not args.keep_intermediate_files: shutil.rmtree(temp_dir) - print("<< end") + +def init_cli(): + parser = ArgumentParser() + parser.add_argument( + "--assembled-file", + help="Path to assembled file from the intake.assemble step (Parquet)", + required=True, + type=Path, + ) + parser.add_argument( + "--phenotype-file", + help="Path to phenotype file with SEX column (.txt.gz)", + required=True, + type=Path, + ) + parser.add_argument( + "--output-dir", + help="Path to write the output files", + required=True, + type=Path, + ) + parser.add_argument( + "--partition-n-buckets", + help="How many buckets to partition the data into to spread the sort+dedup computations.", + required=False, + type=int, + default=24, + ) + parser.add_argument( + "--keep-intermediate-files", + help="Keep intermediate files, useful for debugging.", + action="store_true", + ) + args = parser.parse_args() + + return args def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: @@ -172,37 +213,5 @@ def sort_dedup(frame: pl.LazyFrame | 
pl.DataFrame): if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument( - "--assembled-file", - help="Path to assembled file from the intake.assemble step (Parquet)", - required=True, - type=Path, - ) - parser.add_argument( - "--phenotype-file", - help="Path to phenotype file with SEX column (.txt.gz)", - required=True, - type=Path, - ) - parser.add_argument( - "--output-dir", - help="Path to write the output files", - required=True, - type=Path, - ) - parser.add_argument( - "--partition-n-buckets", - help="How many buckets to partition the data into to spread the sort+dedup computations.", - required=False, - type=int, - default=24, - ) - parser.add_argument( - "--keep-intermediate-files", - help="Keep intermediate files, useful for debugging.", - action="store_true", - ) - args = parser.parse_args() - + args = init_cli() main(args) From 59dc347a38cb654b691c106b02c9e35fdccec70b Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 26 May 2026 12:13:17 +0000 Subject: [PATCH 14/22] add documentation for intake assemble and tidy-up --- src/kanta/intake/assemble.py | 6 + src/kanta/intake/tidyup.py | 6 +- wdl/pre-merge.json | 7 - wdl/pre-merge.wdl | 114 ---------------- wdl/sort_dup.json | 8 -- wdl/sort_dup.wdl | 249 ----------------------------------- 6 files changed, 10 insertions(+), 380 deletions(-) delete mode 100644 wdl/pre-merge.json delete mode 100644 wdl/pre-merge.wdl delete mode 100644 wdl/sort_dup.json delete mode 100644 wdl/sort_dup.wdl diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py index 38a096e..3b1f861 100644 --- a/src/kanta/intake/assemble.py +++ b/src/kanta/intake/assemble.py @@ -1,5 +1,11 @@ """ Merges the incoming Kanta Lab data from THL into one coherent file. + + +Differences from the WDL implementation +======================================= +- Uses CSV-aware parsing, robust to edge cases like new-line character inside + CSV values. 
""" import gzip diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index b828610..5c2343d 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -1,10 +1,12 @@ """ +Tidy-up the raw data into a subset of necessary column, and apply sorting +and deduplication. + + Differences from the WDL implementation ======================================= - No logging of duplicates/err lines. - Outputs to a single parquet file, no .txt.gz, as this is very slow. -- Uses CSV-aware parsing, robust to edge cases like new-line character inside - CSV values. VM choice and performance diff --git a/wdl/pre-merge.json b/wdl/pre-merge.json deleted file mode 100644 index 6ea88a5..0000000 --- a/wdl/pre-merge.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "pre_merge.test":false, - "pre_merge.kanta_list": "gs://fg-3/kanta_v3/inputs/kanta_file_list.txt", - "pre_merge.prefix": "finngen_R14_kanta_laboratory_responses_internal_VERSION.txt.gz", - "pre_merge.version": "1.0" - -} diff --git a/wdl/pre-merge.wdl b/wdl/pre-merge.wdl deleted file mode 100644 index a161165..0000000 --- a/wdl/pre-merge.wdl +++ /dev/null @@ -1,114 +0,0 @@ -version 1.0 - -workflow pre_merge { - input { - Boolean test - File kanta_list - String prefix - String version - - } - String docker = "eu.gcr.io/finngen-sandbox-v3-containers/bioinformatics:1.0.1" - - # Remove quotation marks (and test if needed) - scatter (year_files in read_tsv(kanta_list)) { - # THIS STEP REMOVES QUOTATION BLOCKS - call process_file as process_responses {input :input_file= year_files[0],test = test,docker=docker} - call process_file as process_ft { input :input_file= year_files[1],test = test ,docker=docker} - call merge_ft {input: responses_file = process_responses.cleaned_file,ft_file = process_ft.cleaned_file,docker=docker} - } - - call merge_files {input:rr_files = merge_ft.merged_year,out_file = sub(prefix,"VERSION",if test then version +"_test" else version),docker=docker } - output { - File merged_kanta 
=merge_files.merged_file - } -} - -task merge_ft { - input { - File responses_file - File ft_file - String docker - } - - String out_file = sub(basename(responses_file),'.txt.gz','_merged.txt.gz') - command <<< - set -euo pipefail - F1="~{responses_file}" - F2="~{ft_file}" - OUT="~{out_file}" - # 1. Get headers safely. - # 'head -1' often causes 'zcat' to return exit code 141 (SIGPIPE). - # '|| true' ensures H1/H2 assignments don't trigger 'set -e'. - H1=$(zcat -f "$F1" | head -1 || true) - H2=$(zcat -f "$F2" | head -1 || true) - -# Check if we actually got headers before proceeding - if [[ -z "$H1" || -z "$H2" ]]; then - echo "Error: Could not read headers from input files." >&2 - exit 1 - fi - # Get headers and find indices for columns in F2 not in F1 - OFF=$(echo "$H1" | tr '\t' '\n' | wc -l) - - # Join files and process in one AWK pass - paste <(zcat -f "$F1") <(zcat -f "$F2") | awk -F'\t' -v OFS='\t' -v h1="$H1" -v h2="$H2" -v off="$OFF" ' - BEGIN { - split(h1, a1); split(h2, a2) - for(i in a1) map[a1[i]] = i - for(i in a2) if(a2[i] in map) pairs[map[a2[i]]] = i + off; else new[++n] = i + off - } - { - for(p in pairs) if($p != $pairs[p]) { print "Err line "NR": "$p" != "$pairs[p] > "/dev/stderr"; exit 1 } - res = $1; for(i=2; i<=off; i++) res = res OFS $i - for(i=1; i<=n; i++) res = res OFS $(new[i]) - print res - if(NR%50000==0) printf "\rRow %d", NR > "/dev/stderr" - }' | gzip > "$OUT" - - - >>> - runtime { - disks: "local-disk ~{ceil(size(responses_file,'GB')*3) + 10} HDD" - docker : "~{docker}" - } - output { - File merged_year = out_file - } -} - -task process_file { - input { - File input_file - Boolean test - String docker - } - String base = sub(basename(input_file),'.txt.gz','_cleaned.txt.gz') - command <<< - zcat -f ~{input_file} | sed 's/\(^\|\t\)"/\1/g; s/"\(\t\|$\)/\1/g' | tr -d '\r' | awk -F'\t' '/^FG/{if(NR>1)print ""; printf "%s",$0; next} {printf " %s",$0} END{print ""}' | awk -F'\t' 'BEGIN{OFS="\t"} NR==1{cols=NF} {if(NF ~{base} - >>> - 
runtime { - disks: "local-disk ~{ceil(size(input_file,'GB')*3) + 10} HDD" - docker:"~{docker}" - } - output {File cleaned_file = base} - -} - -task merge_files { - input { - Array[File] rr_files - String out_file - String docker - } - command <<< - zcat ~{rr_files[0]} | head -n1 | bgzip -c > ~{out_file} - while read f; do echo $f && zcat $f | sed -E 1d | bgzip -c >> ~{out_file}; done < ~{write_lines(rr_files)} - zcat ~{out_file} | wc -l - >>> - runtime { - disks: "local-disk ~{ceil(size(rr_files,'GB'))*3 + 10} HDD" - docker:"~{docker}" - } - output { File merged_file = out_file} -} diff --git a/wdl/sort_dup.json b/wdl/sort_dup.json deleted file mode 100644 index 906d162..0000000 --- a/wdl/sort_dup.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "kanta_sort_dup.test":false, - "kanta_sort_dup.kanta_data":"gs://fg-3/kanta_v3/finngen_R14_kanta_laboratory_responses_internal_1.0.txt.gz", - "kanta_sort_dup.sex_map.min_pheno": "gs://finngen-production-library-red/finngen_R13/phenotype_1.0/data/finngen_R13_minimum_1.0.txt.gz", - "kanta_sort_dup.kanta_docker": "eu.gcr.io/finngen-sandbox-v3-containers/kanta:dev", - "kanta_sort_dup.split.n_chunks": 32, - -} diff --git a/wdl/sort_dup.wdl b/wdl/sort_dup.wdl deleted file mode 100644 index d5b6596..0000000 --- a/wdl/sort_dup.wdl +++ /dev/null @@ -1,249 +0,0 @@ -version 1.0 - -workflow kanta_sort_dup{ - input { - # works with 100k lines - Boolean test - File kanta_data - String kanta_docker - } - # this has python 3.6, needed in the merge step. 
- String base_docker = "eu.gcr.io/finngen-sandbox-v3-containers/kanta:v3_base" - call get_cols {input:docker=kanta_docker} - # split input in chunks - # s_cols (names) --> sort_cols (indices) - call split { - input: - test = test, - kanta_data = kanta_data, - cols = get_cols.cols, - s_cols = get_cols.s_cols, - docker=base_docker - } - - # builds sex dictionary mapping from pheno file - call sex_map {input: docker=base_docker} - - # extract columns sort and extract duplicates/errs - scatter (i in range(length(split.chunks))) { - call sort { - input : - index = i, - chunk = split.chunks[i], - sort_cols = split.sort_cols, - sex_map = sex_map.sex_map, - docker=base_docker - } - } - # merge chunks (unique/dup/err) - String prefix = basename(kanta_data,'.txt.gz') - call merge { - input : - sorted_chunks = sort.sorted_chunk, - sort_cols = split.sort_cols, - header = split.header, - docker= base_docker, - prefix = if test then prefix+ "_test" else prefix - } -} - -task merge { - input { - File header - Array[File] sorted_chunks - Array[String] sort_cols - String prefix - String docker - } - - Int chunk_size = ceil(size(sorted_chunks,"GB")) - command <<< - # CONCAT PRE-SORTED FILES - echo "SORT FILES" - for col in ~{sep=' ' sort_cols}; do echo "${col},${col}" >> sort_keys.tmp; done - SORT_ARGS=$(cat sort_keys.tmp | xargs -I {} echo "-k {}" | tr '\n' ' ') - /usr/bin/time -v sort -t $'\t' -m $SORT_ARGS ~{sep=" " sorted_chunks} > sorted.txt - #/usr/bin/time -v sort -t $'\t' -m -k ~{sep=" -k " sort_cols} ~{sep=" " sorted_chunks} > sorted.txt - # REMOVE DUPS - python3 <>> - runtime { - disks: "local-disk ~{chunk_size*4+10} HDD" - docker : "~{docker}" - } - output { - Array[File] kanta_files = glob("~{prefix}*gz") - } -} - -task sort { - input { - File chunk - Array[String] sort_cols - Int index - String docker - File sex_map - } - String out_file = "kanta_sorted_" + index - - command <<< - for col in ~{sep=' ' sort_cols}; do echo "${col},${col}" >> sort_keys.tmp; done - 
SORT_ARGS=$(cat sort_keys.tmp | xargs -I {} echo "-k {}" | tr '\n' ' ') - zcat ~{chunk} | sort -t $'\t' $SORT_ARGS > tmp.txt - #zcat ~{chunk} | sort -t $'\t' -k ~{sep=" -k " sort_cols} > ~{out_file} - - #add sex - awk -F'\t' 'BEGIN {OFS="\t"} NR==FNR {sex[$1]=$2; next} NR==1 {print $0, "SEX"; next} {print $0, (sex[$1] ? sex[$1] : "NA")}' \ - ~{sex_map} tmp.txt > ~{out_file} - - # check file size - count_tmp=$(wc -l < tmp.txt) - count_out=$(wc -l < ~{out_file}) - - # Perform the assertion - if [[ "$count_tmp" -ne "$count_out" ]]; then - echo "❌ Assertion Failed: Line counts do not match!" >&2 - echo "tmp.txt has $count_tmp lines." >&2 - echo "~{out_file} has $count_out lines." >&2 - exit 1 # Exit with a non-zero status to signal an error - else - echo "✅ Assertion Passed: Both files have $count_tmp lines." - fi - >>> - - runtime { - disks: "local-disk ~{ceil(size(chunk,'GB'))*3 + 10} HDD" - docker: "~{docker}" - } - - output { - File sorted_chunk = out_file - } -} - -task get_cols { - input { - String docker - } - - command <<< - # get required columns to cut from git repository - cp /finngen_qc/magic_config.py ./config.py - python3 -c "import config;o= open('./columns.txt','wt') ;o.write('\n'.join(list(config.config['rename_cols'].keys())) + '\n');o.write('\n'.join(config.config['other_cols'])+ '\n')" - python3 -c "import config;o= open('./sort_columns.txt','wt') ;o.write('\n'.join(config.config['sort_cols'])+ '\n')" - >>> - runtime { - disks: "local-disk 10 HDD" - docker : "~{docker}" - } - output { - Array[String] cols = read_lines("columns.txt") - Array[String] s_cols = read_lines("sort_columns.txt") - } -} - -task split { - input { - Boolean test - File kanta_data - Int n_chunks - Array[String] cols - Array[String] s_cols - String docker - } - - Int disk_size = ceil(size(kanta_data,"GB"))*10*n_chunks - - command <<< - echo "SORT KANTA" - cat ~{write_lines(cols)} > columns.txt - cat ~{write_lines(s_cols)} > sort_columns.txt - COLS=$(zcat ~{kanta_data} | head -n1 
| tr '\t' '\n' | grep -wnf columns.txt | cut -f 1 -d ':' | tr '\n' ',' | rev | cut -c2- | rev) - echo $COLS - - # uncompress and split new header from body - zcat ~{kanta_data} | cut -f $COLS | head -n1 > header.txt - zcat ~{kanta_data} | cut -f $COLS | sed -E 1d ~{if test then " | head -n 10000 " else ""}> tmp.tsv - - # GET SORT COLS AND KEEP ORDER - echo "COLS" - while read f; - do - cat header.txt | head -n1 | tr '\t' '\n'| grep -wn $f | cut -f 1 -d ':' >> sort_cols.txt - done < sort_columns.txt - cat sort_cols.txt - - # SPLIT INTO N FILES - split tmp.tsv -n l/~{n_chunks} -d kanta_chunk --filter='gzip > $FILE.gz' - >>> - - runtime { - disks: "local-disk ~{disk_size} HDD" - docker : "~{docker}" - } - - output { - Array[File] chunks = glob("./kanta_chunk*gz") - File header = "header.txt" - Array[String] sort_cols = read_lines("sort_cols.txt") - } -} - -task sex_map { - input { - File min_pheno - String docker - } - String sex_file = "sex_map.txt" - command <<< - # get sex col - sexcol=$(awk '{for(i=1;i<=NF;i++){if($i=="SEX"){print i; exit}}}' <(zcat ~{min_pheno} | head -n1)) - # extract sex only and sort - zcat ~{min_pheno} | cut -f 1,$sexcol | (sed -u 1q ; sort )>> ~{sex_file} - >>> - runtime { - disks: "local-disk ~{ceil(size(min_pheno,'GB')) * 3} HDD" - docker : "~{docker}" - } - output {File sex_map = sex_file} -} From f77a1bbafb56e3c71375d931c8d978113ddcc006 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 27 May 2026 07:54:44 +0000 Subject: [PATCH 15/22] add ability to run assemble + tidyup stages at once --- src/kanta/intake/__main__.py | 62 ++++++++++++++++++++++++++++++++++++ src/kanta/intake/assemble.py | 33 +++++++++++-------- src/kanta/intake/tidyup.py | 53 +++++++++++++++++------------- 3 files changed, 111 insertions(+), 37 deletions(-) create mode 100644 src/kanta/intake/__main__.py diff --git a/src/kanta/intake/__main__.py b/src/kanta/intake/__main__.py new file mode 100644 index 0000000..f44e2d3 --- /dev/null +++ b/src/kanta/intake/__main__.py @@ 
-0,0 +1,62 @@ +if __name__ == "__main__": + import tempfile + import os + from argparse import ArgumentParser + from pathlib import Path + + from kanta.intake import assemble + from kanta.intake import tidyup + + parser = ArgumentParser() + + parser.add_argument( + "--source-list-file", + required=True, + type=Path, + help="File containing pair of paths to main & freetext data, one pair per line (TSV without header).", + ) + parser.add_argument( + "--phenotype-file", + help="Path to phenotype file with FINNGENID and SEX columns (.txt.gz)", + required=True, + type=Path, + ) + parser.add_argument( + "--output-dir", + help="Path to write the output files", + required=True, + type=Path, + ) + parser.add_argument( + "--partition-n-buckets", + help="How many buckets to partition the data into to spread the sort+dedup computations.", + required=False, + type=int, + default=24, + ) + parser.add_argument( + "--debug", + help="Increase verbosity and keep intermediate files", + required=False, + action="store_true", + ) + + args = parser.parse_args() + + # Assemble stage + _fd, absolute_pathname = tempfile.mkstemp() + tmp_file_assemble = Path(absolute_pathname) + post_assemble_file = assemble.main(args.source_list_file, tmp_file_assemble) + + # Tidy-up stage + tidyup.main( + tmp_file_assemble, + args.phenotype_file, + args.output_dir, + partition_n_buckets=args.partition_n_buckets, + keep_intermediate_files=args.debug, + ) + + # Cleaning up + if not args.debug: + os.remove(tmp_file_assemble) diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py index 3b1f861..5e9a252 100644 --- a/src/kanta/intake/assemble.py +++ b/src/kanta/intake/assemble.py @@ -62,14 +62,25 @@ COL_PREFIX_FREETEXT = "freetext." 
-def validate_input_pairs(list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]: +def main(source_list_file: Path, output_file: Path) -> Path: + pairs = validate_input_pairs(source_list_file) + + print(">> merge_by_pair") + merge_by_pair(pairs, output_file) + + print(">> check_merge_consistency") + print(check_merge_consistency(output_file)) + + + +def validate_input_pairs(source_list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]: pairs = [] - with open(list_file) as fp: + with open(source_list_file) as fp: for line in fp: values = line.split(separator, maxsplit=2) - main = validate_tsv_gz(values[0], list_file.parent) - freetext = validate_tsv_gz(values[1], list_file.parent) + main = validate_tsv_gz(values[0], source_list_file.parent) + freetext = validate_tsv_gz(values[1], source_list_file.parent) pairs.append((main, freetext)) @@ -209,24 +220,18 @@ def get_columns(input_path: Path) -> list[str]: if __name__ == "__main__": parser = ArgumentParser() parser.add_argument( - "--list-file", + "--source-list-file", required=True, type=Path, help="File containing pair of paths to main & freetext data, one pair per line (TSV without header).", ) parser.add_argument( - "--post-merge-file", + "--output-file", required=True, type=Path, - help="Path to intermediary output file from the merge stage", + help="Path to output the intermediary file from this stage.", ) args = parser.parse_args() - pairs = validate_input_pairs(args.list_file) - - print(">> merge_by_pair") - merge_by_pair(pairs, args.post_merge_file) - - print(">> check_merge_consistency") - print(check_merge_consistency(args.post_merge_file)) + main(args.source_list_file, args.output_file) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index 5c2343d..fdf5e6f 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -44,54 +44,61 @@ ] -def main(args): +def main( + assembled_file: Path, + phenotype_file: Path, + output_dir: Path, + *, + partition_n_buckets: 
int, + keep_intermediate_files: bool, +): # Set up output file and temporary directory for intermediate files today = date.today() output_file = ( - args.output_dir + output_dir / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}.parquet" ) - temp_dir = Path(tempfile.mkdtemp()) + tmp_dir = Path(tempfile.mkdtemp()) print("# Run info") - print(f"- Partition into N buckets: {args.partition_n_buckets}") - print(f"- Directory for intermediate files: {temp_dir}") - print(f"- Output directory: {args.output_dir}") + print(f"- Partition into N buckets: {partition_n_buckets}") + print(f"- Directory for intermediate files: {tmp_dir}") + print(f"- Output directory: {output_dir}") print() - temp_file_consolidate = temp_dir / "consolidated.parquet" + tmp_file_consolidate = tmp_dir / "consolidated.parquet" - temp_dir_partition = temp_dir / "partition" - temp_dir_partition.mkdir() + tmp_dir_partition = tmp_dir / "partition" + tmp_dir_partition.mkdir() - temp_dir_sort_dedup = temp_dir / "sort_dedup" - temp_dir_sort_dedup.mkdir() + tmp_dir_sort_dedup = tmp_dir / "sort_dedup" + tmp_dir_sort_dedup.mkdir() print("# Consolidate") - consolidated_file = consolidate_columns(args.assembled_file, temp_file_consolidate) + consolidated_file = consolidate_columns(assembled_file, tmp_file_consolidate) print("# Partition") - partition(consolidated_file, temp_dir_partition, args.partition_n_buckets) + partition(consolidated_file, tmp_dir_partition, partition_n_buckets) print("# Sort + Dedup") - for bucket_file in temp_dir_partition.glob("bucket_id__*.parquet"): + for bucket_file in tmp_dir_partition.glob("bucket_id__*.parquet"): ( pl.scan_parquet(bucket_file) .pipe(sort_dedup) - .sink_parquet(temp_dir_sort_dedup / bucket_file.name) + .sink_parquet(tmp_dir_sort_dedup / bucket_file.name) ) df_pheno = pl.scan_csv( - args.phenotype_file, + phenotype_file, infer_schema=False, separator="\t", ).select("FINNGENID", "SEX") print("# Concatenate + Unique + SEX join") bucket_files = [] - for 
bucket_id in range(args.partition_n_buckets): - bucket_files.append(temp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet") + for bucket_id in range(partition_n_buckets): + bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet") ( pl.scan_parquet(bucket_files) @@ -108,8 +115,8 @@ def main(args): .sink_parquet(output_file) ) - if not args.keep_intermediate_files: - shutil.rmtree(temp_dir) + if not keep_intermediate_files: + shutil.rmtree(tmp_dir) def init_cli(): @@ -122,7 +129,7 @@ def init_cli(): ) parser.add_argument( "--phenotype-file", - help="Path to phenotype file with SEX column (.txt.gz)", + help="Path to phenotype file with FINNGENID and SEX columns (.txt.gz)", required=True, type=Path, ) @@ -193,12 +200,12 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: return output_file -def partition(assembled_file: Path, temp_dir: Path, n_buckets): +def partition(assembled_file: Path, tmp_dir: Path, n_buckets): for bucket_id in range(n_buckets): ( pl.scan_parquet(assembled_file) .filter(pl.col("FINNGENID").hash() % n_buckets == bucket_id) - .sink_parquet(temp_dir / f"bucket_id__{bucket_id}.parquet") + .sink_parquet(tmp_dir / f"bucket_id__{bucket_id}.parquet") ) From 7d3314d9330eab06faffe34345f4016a39b835ac Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 27 May 2026 10:53:39 +0000 Subject: [PATCH 16/22] add end-to-end row id tracking GitHub: fixes #50 --- src/kanta/intake/__main__.py | 25 +++++++++++--------- src/kanta/intake/assemble.py | 24 +++++++++++++------ src/kanta/intake/tidyup.py | 46 ++++++++++++++---------------------- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/src/kanta/intake/__main__.py b/src/kanta/intake/__main__.py index f44e2d3..6bd162d 100644 --- a/src/kanta/intake/__main__.py +++ b/src/kanta/intake/__main__.py @@ -1,7 +1,6 @@ if __name__ == "__main__": - import tempfile - import os from argparse import ArgumentParser + from datetime import date from pathlib import Path from 
kanta.intake import assemble @@ -44,19 +43,23 @@ args = parser.parse_args() # Assemble stage - _fd, absolute_pathname = tempfile.mkstemp() - tmp_file_assemble = Path(absolute_pathname) - post_assemble_file = assemble.main(args.source_list_file, tmp_file_assemble) + output_file_assemble_stage = ( + args.output_dir + / f"finngen_R14_kanta_laboratory_responses.assemble-stage.{date.today()}.parquet" + ) + post_assemble_file = assemble.main( + args.source_list_file, output_file_assemble_stage + ) # Tidy-up stage + output_file_tidyup_stage = ( + args.output_dir + / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{date.today()}.parquet" + ) tidyup.main( - tmp_file_assemble, + output_file_assemble_stage, args.phenotype_file, - args.output_dir, + output_file_tidyup_stage, partition_n_buckets=args.partition_n_buckets, keep_intermediate_files=args.debug, ) - - # Cleaning up - if not args.debug: - os.remove(tmp_file_assemble) diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py index 5e9a252..fea5895 100644 --- a/src/kanta/intake/assemble.py +++ b/src/kanta/intake/assemble.py @@ -63,17 +63,21 @@ def main(source_list_file: Path, output_file: Path) -> Path: + print() + print("=== ASSEMBLE STAGE ===") pairs = validate_input_pairs(source_list_file) - print(">> merge_by_pair") + print("# Merge by pair") merge_by_pair(pairs, output_file) - print(">> check_merge_consistency") - print(check_merge_consistency(output_file)) + print("# Checking merge consistency") + is_consistent = check_merge_consistency(output_file) + print("All good." if is_consistent else "!!! 
Inconsitent merge !!!") - -def validate_input_pairs(source_list_file: Path, *, separator="\t") -> list[tuple[Path, Path]]: +def validate_input_pairs( + source_list_file: Path, *, separator="\t" +) -> list[tuple[Path, Path]]: pairs = [] with open(source_list_file) as fp: for line in fp: @@ -124,7 +128,11 @@ def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> to_concat.append(df_merged) - pl.concat(to_concat).sink_parquet(parquet_output) + ( + pl.concat(to_concat) + .with_row_index(name="_rowid_source", offset=1) + .sink_parquet(parquet_output) + ) def check_merge_consistency(data_path: str | Path) -> bool: @@ -150,7 +158,9 @@ def check_merge_consistency(data_path: str | Path) -> bool: # the main and freetext data are of different height. check_same_height = ( pl.scan_parquet(data_path) - .select(pl.all_horizontal(pl.selectors.ends_with("._rowid").is_not_null().all())) + .select( + pl.all_horizontal(pl.selectors.ends_with("._rowid").is_not_null().all()) + ) .collect(engine="streaming") .item() ) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index fdf5e6f..8db218a 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -26,7 +26,6 @@ import tempfile import shutil from argparse import ArgumentParser -from datetime import date from pathlib import Path import polars as pl @@ -47,25 +46,20 @@ def main( assembled_file: Path, phenotype_file: Path, - output_dir: Path, + output_file: Path, *, partition_n_buckets: int, keep_intermediate_files: bool, ): # Set up output file and temporary directory for intermediate files - today = date.today() - output_file = ( - output_dir - / f"finngen_R14_kanta_laboratory_responses_internal_1.0_{today}.parquet" - ) - tmp_dir = Path(tempfile.mkdtemp()) + print() + print("=== TIDY-UP STAGE ===") print("# Run info") print(f"- Partition into N buckets: {partition_n_buckets}") print(f"- Directory for intermediate files: {tmp_dir}") - print(f"- Output directory: {output_dir}") 
- print() + print(f"- Output file: {output_file}") tmp_file_consolidate = tmp_dir / "consolidated.parquet" @@ -111,7 +105,6 @@ def main( maintain_order="left", ) .with_row_index(name="_rowid", offset=1) - .drop("_rowid_consolidate_debug") .sink_parquet(output_file) ) @@ -134,8 +127,8 @@ def init_cli(): type=Path, ) parser.add_argument( - "--output-dir", - help="Path to write the output files", + "--output-file", + help="Path to write the output file", required=True, type=Path, ) @@ -158,7 +151,7 @@ def init_cli(): def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: """Remove unecessary columns form the assembled file and rename the ones we will keep.""" - columns = { + rename_columns = { "main.FINNGENID": "FINNGENID", "main.EVENT_AGE": "EVENT_AGE", "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma", @@ -179,21 +172,12 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: "main.TIME": "TIME", } + out_columns = list(rename_columns.keys()) + ["_rowid_source"] + ( pl.scan_parquet(assembled_file) - .with_columns( - ( - pl.col("main._rowid").cast(pl.String) - + "@" - + pl.col("main._filename") - + "|" - + pl.col("freetext._rowid").cast(pl.String) - + "@" - + pl.col("freetext._filename") - ).alias("_rowid_consolidate_debug") - ) - .select(pl.col(list(columns.keys()) + ["_rowid_consolidate_debug"])) - .rename(columns) + .select(pl.col(out_columns)) + .rename(rename_columns) .sink_parquet(output_file) ) @@ -223,4 +207,10 @@ def sort_dedup(frame: pl.LazyFrame | pl.DataFrame): if __name__ == "__main__": args = init_cli() - main(args) + main( + args.assembled_file, + args.phenotype_file, + args.output_file, + partition_n_buckets=args.partition_n_buckets, + keep_intermediate_files=args.keep_intermediate_files + ) From 7f9f97ee75ea46f1281512f1c2a0f30cf14de0c9 Mon Sep 17 00:00:00 2001 From: Vincent Date: Fri, 29 May 2026 11:08:53 +0000 Subject: [PATCH 17/22] reword log message when merging files in assemble --- 
src/kanta/intake/assemble.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py index fea5895..ba1f562 100644 --- a/src/kanta/intake/assemble.py +++ b/src/kanta/intake/assemble.py @@ -98,7 +98,7 @@ def validate_input_pairs( def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> None: to_concat = [] for path_main, path_freetext in pairs: - print(f"Processing {path_main} & {path_freetext}") + print(f"Adding horizontal merge: {path_main} & {path_freetext}") df_main = ( pl.scan_csv( From 196d22311f8d7b2b709c9b783eff3bf8eb0226e1 Mon Sep 17 00:00:00 2001 From: Vincent Date: Fri, 29 May 2026 11:10:43 +0000 Subject: [PATCH 18/22] sanitize text fields by removing new-line character --- src/kanta/intake/tidyup.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index 8db218a..dada05e 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -89,12 +89,12 @@ def main( separator="\t", ).select("FINNGENID", "SEX") - print("# Concatenate + Unique + SEX join") + print("# Concatenate + join SEX") bucket_files = [] for bucket_id in range(partition_n_buckets): bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet") - ( + df_concat = ( pl.scan_parquet(bucket_files) # Join SEX .join( @@ -105,7 +105,32 @@ def main( maintain_order="left", ) .with_row_index(name="_rowid", offset=1) - .sink_parquet(output_file) + ) + + print("# Sanitize text fields") + + unicode_newline = "\u2424" # Unicode "SYMBOL FOR NEWLINE", displayed as: ␤ + trusted_columns = [ + "FINNGENID", + "EVENT_AGE", + "APPROX_EVENT_DAY", + "TIME", + "asiakirjaoid_pseudo", + "merkintaoid_pseudo", + "entryoid_pseudo", + "load_id_pseudo", + "file_name_pseudo", + "laboratoriotutkimusoid", + "_rowid", + "_rowid_source", + "SEX" + ] + ( + df_concat.with_columns( + 
pl.selectors.exclude(*trusted_columns).str.replace_all( + pattern="\r\n|\r|\n", value=unicode_newline + ) + ).sink_parquet(output_file) ) if not keep_intermediate_files: @@ -212,5 +237,5 @@ def sort_dedup(frame: pl.LazyFrame | pl.DataFrame): args.phenotype_file, args.output_file, partition_n_buckets=args.partition_n_buckets, - keep_intermediate_files=args.keep_intermediate_files + keep_intermediate_files=args.keep_intermediate_files, ) From a94c0232123078763952cd391d265f367d5941f4 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 3 Jun 2026 13:39:00 +0000 Subject: [PATCH 19/22] reorder columns to match previous implementation --- src/kanta/intake/tidyup.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index dada05e..7fb54b8 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -123,14 +123,39 @@ def main( "laboratoriotutkimusoid", "_rowid", "_rowid_source", - "SEX" + "SEX", ] ( df_concat.with_columns( pl.selectors.exclude(*trusted_columns).str.replace_all( pattern="\r\n|\r|\n", value=unicode_newline ) - ).sink_parquet(output_file) + ) + # Re-order column to be somewhat backward compatible with previous implementation + .select( + "_rowid", + "_rowid_source", + "FINNGENID", + "EVENT_AGE", + "APPROX_EVENT_DAY", + "TIME", + "laboratoriotutkimusnimike", + "paikallinentutkimusnimike_koodi", + "paikallinentutkimusnimike_selite", + "tutkimuskoodistonjarjestelma", + "tutkimusvastauksentila", + "tutkimustulosarvo", + "tutkimustulosyksikko", + "tuloksenpoikkeavuus", + "viitearvoryhma", + "viitevalialkuarvo", + "viitevalialkuyksikko", + "viitevaliloppuarvo", + "viitevaliloppuyksikko", + "tutkimustulosteksti", + "SEX", + ) + .sink_parquet(output_file) ) if not keep_intermediate_files: From 45e7e573f5da53e730b3cf6df691027fbbbd08bb Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 4 Jun 2026 12:02:14 +0000 Subject: [PATCH 20/22] implement same-ish 
dedup logic as prev WDL version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Known differences: - Behavior change: Newlines inside values are now replaced by the Unicode character `␤` (U+2424) instead of by a space character ` `. - Bug fix: Values within quoted TSV fields are now correctly preserved, whereas the previous implementation added extra quotes. - Bug fix: Tab characters inside quoted TSV values are now correctly preserved, whereas the previous implementation treated them as field separators, resulting in shifted values. --- src/kanta/intake/tidyup.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index 7fb54b8..67dd59a 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -83,17 +83,17 @@ def main( .sink_parquet(tmp_dir_sort_dedup / bucket_file.name) ) + print("# Concatenate + join SEX") + bucket_files = [] + for bucket_id in range(partition_n_buckets): + bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet") + df_pheno = pl.scan_csv( phenotype_file, infer_schema=False, separator="\t", ).select("FINNGENID", "SEX") - print("# Concatenate + join SEX") - bucket_files = [] - for bucket_id in range(partition_n_buckets): - bucket_files.append(tmp_dir_sort_dedup / f"bucket_id__{bucket_id}.parquet") - df_concat = ( pl.scan_parquet(bucket_files) # Join SEX @@ -204,11 +204,15 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: rename_columns = { "main.FINNGENID": "FINNGENID", "main.EVENT_AGE": "EVENT_AGE", - "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma", + "main.APPROX_EVENT_DAY": "APPROX_EVENT_DAY", + "main.TIME": "TIME", + "main.laboratoriotutkimusnimike": "laboratoriotutkimusnimike", + "main.paikallinentutkimusnimike_koodi": "paikallinentutkimusnimike_koodi", "main.paikallinentutkimusnimike_selite": 
"paikallinentutkimusnimike_selite", + "main.tutkimuskoodistonjarjestelma": "tutkimuskoodistonjarjestelma", + "main.tutkimusvastauksentila": "tutkimusvastauksentila", "main.tutkimustulosarvo": "tutkimustulosarvo", "main.tutkimustulosyksikko": "tutkimustulosyksikko", - "main.tutkimusvastauksentila": "tutkimusvastauksentila", "main.tuloksenpoikkeavuus": "tuloksenpoikkeavuus", "main.viitearvoryhma": "viitearvoryhma", "main.viitevalialkuarvo": "viitevalialkuarvo", @@ -216,10 +220,6 @@ def consolidate_columns(assembled_file: Path, output_file: Path) -> Path: "main.viitevaliloppuarvo": "viitevaliloppuarvo", "main.viitevaliloppuyksikko": "viitevaliloppuyksikko", "freetext.tutkimustulosteksti": "tutkimustulosteksti", - "main.paikallinentutkimusnimike_koodi": "paikallinentutkimusnimike_koodi", - "main.laboratoriotutkimusnimike": "laboratoriotutkimusnimike", - "main.APPROX_EVENT_DAY": "APPROX_EVENT_DAY", - "main.TIME": "TIME", } out_columns = list(rename_columns.keys()) + ["_rowid_source"] @@ -244,14 +244,17 @@ def partition(assembled_file: Path, tmp_dir: Path, n_buckets): def sort_dedup(frame: pl.LazyFrame | pl.DataFrame): - return ( - frame.sort(by=COLUMNS_UNIQUENESS_SORT) - # Dedup rows - # NOTE(Vincent 2026-05-20) The previous implementation (WDL/Python) was - # doing the dedup on adjacent lines. Here the deduplication is not done - # explicitely on adjacent lines (since polars `unique` does it on the - # full data), though the result should be the same. 
- .unique(subset=COLUMNS_UNIQUENESS_SORT, keep="first") + all_columns = frame.collect_schema().names() + sort_subset_columns = set(COLUMNS_UNIQUENESS_SORT) + other_columns = [] + for cc in all_columns: + if cc not in sort_subset_columns: + other_columns.append(cc) + + sort_full_columns = COLUMNS_UNIQUENESS_SORT + other_columns + + return frame.sort(by=sort_full_columns).unique( + subset=COLUMNS_UNIQUENESS_SORT, keep="first", maintain_order=True ) From 47434e1341ea3f535fd5d20cdfc48343b134c882 Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 9 Jun 2026 07:47:58 +0000 Subject: [PATCH 21/22] reorder columns in output of intake.assemble --- src/kanta/intake/assemble.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/kanta/intake/assemble.py b/src/kanta/intake/assemble.py index ba1f562..65ff63e 100644 --- a/src/kanta/intake/assemble.py +++ b/src/kanta/intake/assemble.py @@ -131,10 +131,24 @@ def merge_by_pair(pairs: list[tuple[Path, Path]], parquet_output: str | Path) -> ( pl.concat(to_concat) .with_row_index(name="_rowid_source", offset=1) + .pipe(reorder_columns) .sink_parquet(parquet_output) ) +def reorder_columns(frame: pl.LazyFrame | pl.DataFrame) -> pl.LazyFrame | pl.DataFrame: + column_order = ( + ["_rowid_source"] + # Columns for main + + [COL_PREFIX_MAIN + "_rowid", COL_PREFIX_MAIN + "_filename"] + + [COL_PREFIX_MAIN + cc for cc in EXPECTED_COLUMNS_MAIN] + # Columns for freetext + + [COL_PREFIX_FREETEXT + "_rowid", COL_PREFIX_FREETEXT + "_filename"] + + [COL_PREFIX_FREETEXT + cc for cc in EXPECTED_COLUMNS_FREETEXT] + ) + return frame.select(column_order) + + def check_merge_consistency(data_path: str | Path) -> bool: # First check: all shared columns have the same values shared_cols = set(EXPECTED_COLUMNS_MAIN).intersection(EXPECTED_COLUMNS_FREETEXT) From 7d9b7cd04ded785b72e5d06d79c96b5c3e2fb1d4 Mon Sep 17 00:00:00 2001 From: Vincent Date: Tue, 9 Jun 2026 07:51:14 +0000 Subject: [PATCH 22/22] 
=?UTF-8?q?replace=20\t=20with=20=E2=90=89=20(U+2409?= =?UTF-8?q?)=20in=20raw=20data=20intake=20stage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is to prevent any naive TSV parsing from being tripped up. --- src/kanta/intake/tidyup.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/kanta/intake/tidyup.py b/src/kanta/intake/tidyup.py index 67dd59a..2f7ca0d 100644 --- a/src/kanta/intake/tidyup.py +++ b/src/kanta/intake/tidyup.py @@ -108,8 +108,10 @@ def main( ) print("# Sanitize text fields") - - unicode_newline = "\u2424" # Unicode "SYMBOL FOR NEWLINE", displayed as: ␤ + # Unicode "SYMBOL FOR NEWLINE", displayed as: ␤ + unicode_newline = "\u2424" + # Unicode "SYMBOL FOR HORIZONTAL TABULATION", displayed as: ␉ + unicode_tab = "\u2409" trusted_columns = [ "FINNGENID", "EVENT_AGE", @@ -127,9 +129,9 @@ def main( ] ( df_concat.with_columns( - pl.selectors.exclude(*trusted_columns).str.replace_all( - pattern="\r\n|\r|\n", value=unicode_newline - ) + pl.selectors.exclude(*trusted_columns) + .str.replace_all(pattern="\r\n|\r|\n", value=unicode_newline) + .str.replace_all(pattern="\t", value=unicode_tab, literal=True) ) # Re-order column to be somewhat backward compatible with previous implementation .select(