diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 92e47e5b39..67fd8e1042 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -2,6 +2,7 @@ name: Lint on: pull_request: + merge_group: jobs: Lint: diff --git a/c/tests/test_core.c b/c/tests/test_core.c index dad4a81bf2..9d8d9f5a7d 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -101,9 +101,9 @@ test_json_struct_metadata_get_blob(void) { int ret; char metadata[128]; - const char *json; + char *json; tsk_size_t json_buffer_length; - const char *blob; + char *blob; tsk_size_t blob_length; uint8_t *bytes; tsk_size_t metadata_length; @@ -111,9 +111,9 @@ test_json_struct_metadata_get_blob(void) size_t json_length; size_t payload_length; size_t total_length; - const char json_payload[] = "{\"a\":1}"; - const uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 }; - const uint8_t empty_payload[] = { 0 }; + char json_payload[] = "{\"a\":1}"; + uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 }; + uint8_t empty_payload[] = { 0 }; bytes = (uint8_t *) metadata; header_length = 4 + 1 + 8 + 8; @@ -135,7 +135,7 @@ test_json_struct_metadata_get_blob(void) ret = tsk_json_struct_metadata_get_blob( metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (const char *) bytes + header_length); + CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); if (json_length > 0) { CU_ASSERT_EQUAL(memcmp(json, json_payload, json_length), 0); @@ -152,7 +152,7 @@ test_json_struct_metadata_get_blob(void) ret = tsk_json_struct_metadata_get_blob( metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (const char *) bytes + header_length); + CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); @@ -168,7 +168,7 @@ test_json_struct_metadata_get_blob(void) ret = tsk_json_struct_metadata_get_blob( metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (const char *) bytes + header_length); + CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index 50a1c84417..249409dbd1 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -3680,9 +3680,9 @@ test_two_locus_stat_input_errors(void) row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_treeseq_two_locus_count_stat(&ts, num_sample_sets, sample_set_sizes, - sample_sets, 0, NULL, NULL, NULL, num_sites, row_sites, NULL, num_sites, - col_sites, NULL, 0, result); + ret = tsk_treeseq_two_locus_general_count_stat(&ts, num_sample_sets, + sample_set_sizes, sample_sets, 0, NULL, NULL, NULL, num_sites, row_sites, NULL, + num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_RESULT_DIMS); ret = tsk_treeseq_r2(&ts, 1, sample_set_sizes, sample_sets, num_sites, row_sites, diff --git a/c/tskit/core.c b/c/tskit/core.c index 574424c144..66f8cbd0ef 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -142,9 +142,8 @@ tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) } int -tsk_json_struct_metadata_get_blob(const char *metadata, tsk_size_t metadata_length, - const char **json, tsk_size_t *json_length, const char **blob, - tsk_size_t *blob_length) +tsk_json_struct_metadata_get_blob(char *metadata, tsk_size_t metadata_length, + char **json, tsk_size_t *json_length, char **blob, tsk_size_t *blob_length) { int ret; uint8_t version; @@ -152,16 +151,16 @@ tsk_json_struct_metadata_get_blob(const char *metadata, tsk_size_t metadata_leng uint64_t binary_length_u64; uint64_t header_and_json_length; uint64_t total_length; - const uint8_t *bytes; - const char *blob_start; - const char *json_start; + uint8_t *bytes; + char *blob_start; + char *json_start; if (metadata == NULL || json == NULL || json_length == NULL || blob == NULL || blob_length == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } - bytes = (const uint8_t *) metadata; + bytes = (uint8_t *) metadata; if (metadata_length < TSK_JSON_BINARY_HEADER_SIZE) { ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); goto out; @@ -191,8 +190,8 @@ tsk_json_struct_metadata_get_blob(const char *metadata, tsk_size_t metadata_leng ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); goto out; } - json_start = (const char *) bytes + TSK_JSON_BINARY_HEADER_SIZE; - blob_start = (const char *) bytes + TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; + json_start = (char *) bytes + TSK_JSON_BINARY_HEADER_SIZE; + blob_start = (char *) bytes + TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; *json = json_start; *json_length = (tsk_size_t) json_length_u64; *blob = blob_start; diff --git a/c/tskit/core.h b/c/tskit/core.h index 9fe0643975..2964e3d8f1 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -1153,9 +1153,8 @@ the original metadata buffer is alive. @param[out] blob_length On success, set to the payload length in bytes. @return Return 0 on success or a negative value on failure. */ -int tsk_json_struct_metadata_get_blob(const char *metadata, tsk_size_t metadata_length, - const char **json, tsk_size_t *json_length, const char **blob, - tsk_size_t *blob_length); +int tsk_json_struct_metadata_get_blob(char *metadata, tsk_size_t metadata_length, + char **json, tsk_size_t *json_length, char **blob, tsk_size_t *blob_length); /* TODO most of these can probably be macros so they compile out as no-ops. * Lets do the 64 bit tsk_size_t switch first though. */ diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 1aa06e5b03..7773e97f97 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2298,8 +2298,9 @@ get_allele_samples(const tsk_site_t *site, tsk_size_t site_offset, } static int -norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights, - tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) +norm_hap_weighted(tsk_size_t TSK_UNUSED(state_dim), const double *hap_weights, + tsk_size_t result_dim, tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), + double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *weight_row; @@ -2315,8 +2316,9 @@ norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights, } static int -norm_hap_weighted_ij(tsk_size_t result_dim, const double *hap_weights, - tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) +norm_hap_weighted_ij(tsk_size_t TSK_UNUSED(state_dim), const double *hap_weights, + tsk_size_t result_dim, tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), + double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *weight_row; @@ -2341,8 +2343,9 @@ norm_hap_weighted_ij(tsk_size_t result_dim, const double *hap_weights, } static int -norm_total_weighted(tsk_size_t result_dim, const double *TSK_UNUSED(hap_weights), - tsk_size_t n_a, tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) +norm_total_weighted(tsk_size_t TSK_UNUSED(state_dim), + const double *TSK_UNUSED(hap_weights), tsk_size_t result_dim, tsk_size_t n_a, + tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) { tsk_size_t k; double norm = 1 / (double) (n_a * n_b); @@ -2411,8 +2414,8 @@ static int compute_general_normed_two_site_stat_result(const tsk_bitset_t *state, const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, tsk_size_t num_a_alleles, tsk_size_t num_b_alleles, tsk_size_t state_dim, - tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, - norm_func_t *norm_f, bool polarised, two_locus_work_t *restrict work, double *result) + tsk_size_t result_dim, general_stat_func_t *f, void *f_params, norm_func_t *norm_f, + bool polarised, two_locus_work_t *restrict work, double *result) { int ret = 0; // Sample sets and b sites are rows, a sites are columns @@ -2445,7 +2448,7 @@ compute_general_normed_two_site_stat_result(const tsk_bitset_t *state, if (ret != 0) { goto out; } - ret = norm_f(result_dim, weights, num_a_alleles - is_polarised, + ret = norm_f(state_dim, weights, result_dim, num_a_alleles - is_polarised, num_b_alleles - is_polarised, norm, f_params); if (ret != 0) { goto out; @@ -2463,9 +2466,8 @@ compute_general_normed_two_site_stat_result(const tsk_bitset_t *state, static int compute_general_two_site_stat_result(const tsk_bitset_t *state, const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, - tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, - sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, - double *result) + tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, void *f_params, + two_locus_work_t *restrict work, double *result) { int ret = 0; tsk_size_t k; @@ -2653,9 +2655,8 @@ static int tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, - sample_count_stat_params_t *f_params, norm_func_t *norm_f, tsk_size_t n_rows, - const tsk_id_t *row_sites, tsk_size_t n_cols, const tsk_id_t *col_sites, - tsk_flags_t options, double *result) + void *f_params, norm_func_t *norm_f, tsk_size_t n_rows, const tsk_id_t *row_sites, + tsk_size_t n_cols, const tsk_id_t *col_sites, tsk_flags_t options, double *result) { int ret = 0; tsk_bitset_t allele_samples, allele_sample_sets; @@ -3089,9 +3090,8 @@ advance_collect_edges(iter_state *s, tsk_id_t index) static int compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, const iter_state *A_state, const iter_state *B_state, tsk_size_t state_dim, - tsk_size_t result_dim, int sign, general_stat_func_t *f, - sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, - double *result) + tsk_size_t result_dim, int sign, general_stat_func_t *f, void *f_params, + two_locus_work_t *restrict work, double *result) { int ret = 0; double a_len, b_len; @@ -3141,8 +3141,8 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, static int compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, - iter_state *r_state, general_stat_func_t *f, sample_count_stat_params_t *f_params, - tsk_size_t result_dim, tsk_size_t state_dim, double *result) + iter_state *r_state, general_stat_func_t *f, void *f_params, tsk_size_t result_dim, + tsk_size_t state_dim, double *result) { int ret = 0; tsk_id_t e, c, ec, p, *updated_nodes = NULL; @@ -3188,8 +3188,11 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; - compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, + ret = compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, result_dim, -1, f, f_params, &work, result); + if (ret != 0) { + goto out; + } } // Remove samples under nodes from removed edges to parent nodes for (j = 0; j < r_state->n_edges_out; j++) { @@ -3229,8 +3232,11 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; - compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, + ret = compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, result_dim, +1, f, f_params, &work, result); + if (ret != 0) { + goto out; + } } out: tsk_safe_free(updated_nodes); @@ -3243,9 +3249,9 @@ static int tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, - sample_count_stat_params_t *f_params, norm_func_t *TSK_UNUSED(norm_f), - tsk_size_t n_rows, const double *row_positions, tsk_size_t n_cols, - const double *col_positions, tsk_flags_t TSK_UNUSED(options), double *result) + void *f_params, norm_func_t *TSK_UNUSED(norm_f), tsk_size_t n_rows, + const double *row_positions, tsk_size_t n_cols, const double *col_positions, + tsk_flags_t TSK_UNUSED(options), double *result) { int ret = 0; int r, c; @@ -3384,11 +3390,12 @@ check_sample_set_dups(tsk_size_t num_sample_sets, const tsk_size_t *sample_set_s return ret; } +/* Called directly by C python interface `two_locus_count_stat` */ int -tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, - norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites, +tsk_treeseq_two_locus_general_count_stat(const tsk_treeseq_t *self, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + void *f_params, norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { @@ -3398,10 +3405,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); tsk_size_t state_dim = num_sample_sets; - sample_count_stat_params_t f_params = { .sample_sets = sample_sets, - .num_sample_sets = num_sample_sets, - .sample_set_sizes = sample_set_sizes, - .set_indexes = set_indexes }; // We do not support two-locus node stats if (!!(options & TSK_STAT_NODE)) { @@ -3441,9 +3444,10 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl goto out; } ret = tsk_treeseq_two_site_count_stat(self, state_dim, num_sample_sets, - sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, + sample_set_sizes, sample_sets, result_dim, f, f_params, norm_f, out_rows, row_sites, out_cols, col_sites, options, result); - } else if (stat_branch) { + } else { + tsk_bug_assert(stat_branch); ret = check_positions( row_positions, out_rows, tsk_treeseq_get_sequence_length(self)); if (ret != 0) { @@ -3455,13 +3459,31 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl goto out; } ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets, - sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, + sample_set_sizes, sample_sets, result_dim, f, f_params, norm_f, out_rows, row_positions, out_cols, col_positions, options, result); } out: return ret; } +/* Wrapper of `tsk_treeseq_two_locus_general_count_stat` for C summary Functions */ +static int +tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, + norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites, + const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites, + const double *col_positions, tsk_flags_t options, double *result) +{ + sample_count_stat_params_t f_params = { .sample_sets = sample_sets, + .num_sample_sets = num_sample_sets, + .sample_set_sizes = sample_set_sizes, + .set_indexes = set_indexes }; + return tsk_treeseq_two_locus_general_count_stat(self, num_sample_sets, + sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, + row_sites, row_positions, out_cols, col_sites, col_positions, options, result); +} + /*********************************** * Allele frequency spectrum ***********************************/ diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 84480ed96e..c056b1478e 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -1036,16 +1036,8 @@ int tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t K, const doub tsk_size_t M, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); -typedef int norm_func_t(tsk_size_t result_dim, const double *hap_weights, tsk_size_t n_a, - tsk_size_t n_b, double *result, void *params); - -int tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, - const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, - general_stat_func_t *f, norm_func_t *norm_f, tsk_size_t out_rows, - const tsk_id_t *row_sites, const double *row_positions, tsk_size_t out_cols, - const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, - double *result); +typedef int norm_func_t(tsk_size_t state_dim, const double *hap_weights, + tsk_size_t result_dim, tsk_size_t n_a, tsk_size_t n_b, double *result, void *params); /* One way weighted stats */ @@ -1120,6 +1112,13 @@ typedef int general_sample_stat_method(const tsk_treeseq_t *self, const tsk_id_t *sample_sets, tsk_size_t num_indexes, const tsk_id_t *indexes, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); +int tsk_treeseq_two_locus_general_count_stat(const tsk_treeseq_t *self, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + void *f_params, norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites, + const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites, + const double *col_positions, tsk_flags_t options, double *result); + typedef int two_locus_count_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, diff --git a/docs/development.md b/docs/development.md index b38cc292ce..87c92024f5 100644 --- a/docs/development.md +++ b/docs/development.md @@ -89,28 +89,40 @@ is required for building the C API documentation. On Debian/Ubuntu we can install these with: ```bash -$ sudo apt install build-essential doxygen +sudo apt install build-essential doxygen ``` -All Python development is managed using [uv](https://docs.astral.sh/uv/). +On macOS, either `brew install doxygen` or +`sudo port install doxygen` should get doxygen. +You'll also need a "essential build" tools: +a compiler (`gcc`) and a few other things (e.g., `make`). + +All Python development is managed using [uv](https://docs.astral.sh/uv/), +which takes the place of virtual/conda environments. It is not strictly necessary to use uv in order to make small changes, but -the development workflows of all tskit-dev packages are organised around +if you don't use it, you'll need to figure out how to install python +dependencies on your own, +and the development workflows of all tskit-dev packages are organised around using uv, and therefore we strongly recommend using it. Uv is straightforward to install, and not invasive (existing Python installations can be completely isolated if you don't use features like ``uv tool`` etc which update your -$HOME/.local/bin). Uv manages an isolated local environment per project +``$HOME/.local/bin``). Uv manages an isolated local environment per project and allows us to deterministically pin package versions and easily switch between Python versions, so that CI environments can be replicated exactly locally. The packages needed for development are specified as dependency groups in ``python/pyproject.toml`` and managed with [uv](https://docs.astral.sh/uv/). -Install all development dependencies using: +Install all development dependencies by running: ```bash -$ uv sync +cd python +uv sync ``` +Since `uv` operates from the `python/` subdirectory, +**all `uv` commands below must be run from within that subdirectory**; +otherwise errors like "No such file or directory" will occur. The lock file lives at `python/uv.lock` and must be kept up to date. Run `uv lock` after any change to the dependencies in `python/pyproject.toml`. @@ -127,11 +139,12 @@ To get a local git development environment, please follow these steps: - Make a fork of the tskit repo on [GitHub](http://github.com/tskit-dev/tskit) - Clone your fork into a local directory: ```bash - $ git clone git@github.com:YOUR_GITHUB_USERNAME/tskit.git + git clone git@github.com:YOUR_GITHUB_USERNAME/tskit.git ``` -- Install the {ref}`sec_development_workflow_prek` pre-commit hook: +- Install the {ref}`sec_development_workflow_prek` pre-commit hook + (again from the ``python/`` subdirectory): ```bash - $ uv run prek install + uv run prek install ``` See the {ref}`sec_development_workflow_git` section for detailed information @@ -167,15 +180,14 @@ skip to {ref}`sec_development_workflow_anothers_commit`. [upstream remote]( https://help.github.com/articles/configuring-a-remote-for-a-fork/): ```bash - $ git remote add upstream https://github.com/tskit-dev/tskit.git + git remote add upstream https://github.com/tskit-dev/tskit.git ``` 3. Create a "topic branch" to work on. One reliable way to do it is to follow this recipe: ```bash - $ git fetch upstream - $ git checkout upstream/main - $ git checkout -b topic_branch_name + git fetch upstream + git checkout -b topic_branch_name upstream/main ``` 4. Write your code following the outline in {ref}`sec_development_best_practices`. @@ -201,7 +213,7 @@ skip to {ref}`sec_development_workflow_anothers_commit`. to document any breaking changes separately in a "breaking changes" section. 8. Push your changes to your topic branch and either open the PR or, if you - opened a draft PR above change it to a non-draft PR by clicking "Ready to + already opened a draft PR change it to a non-draft PR by clicking "Ready to Review". 9. The tskit community will review the code, asking you to make changes where appropriate. @@ -235,7 +247,7 @@ Then, continuing from above: 3. Fetch the pull request, and store it as a local branch. For instance, to name the local branch `my_pr_copy`: ```bash - $ git fetch upstream pull/854/head:my_pr_copy + git fetch upstream pull/854/head:my_pr_copy ``` You should probably call the branch something more descriptive, though. (Also note that you might need to put `origin` instead @@ -244,7 +256,7 @@ Then, continuing from above: 4. Check out the pull request's local branch: ```bash - $ git checkout my_pr_copy + git checkout my_pr_copy ``` Now, your repository will be in exactly the same state as @@ -258,21 +270,22 @@ subdirectory. To test out changes to the *code*, you can change to the `python/` subdirectory, and run `make` to compile the C code. -If you then execute `python` from this subdirectory (and only this one!), +If you then execute python commands from this subdirectory (and only this one!), it will use the modified version of the package. -(For instance, you might want to -open an interactive `python` shell from the `python/` subdirectory, +(For instance, you might want to open an interactive python shell by running +`uv run python` in the `python/` subdirectory, or running `uv run pytest` from this subdirectory.) After you're done, you should do: ```bash -$ git checkout main +git checkout main ``` to get your repository back to the "main" branch of development. If the pull request is changed and you want to do the same thing again, -then first *delete* your local copy (by doing `git branch -d my_pr_copy`) +then to avoid conflicts with any changes you might have made, +first *delete* your local copy (by doing `git branch -d my_pr_copy`) and repeat the steps again. @@ -285,16 +298,10 @@ On each commit a [prek](https://prek.j178.dev) hook will run checks for code style (see the {ref}`sec_development_python_style` section for details) and other common problems. -To install the hook: +To run checks manually without committing, from the `python/` subdirectory: ```bash -$ uv run prek install -``` - -To run checks manually without committing: - -```bash -$ uv run prek --all-files +uv run prek --all-files ``` If local results differ from CI, run `uv run prek cache clean` to clear the cache. @@ -467,6 +474,9 @@ See :ref:`sec_development_documentation_cross_referencing` for details. The :meth:`.TreeSequence.trees` method returns an iterator. ```` +Some errors may occur because of out-of-date cached results, +which can be cleared by running `make clean`. + (sec_development_python)= @@ -528,27 +538,31 @@ The tests are defined in the `tests` directory, and run using If you want to run the tests in a particular module (say, `test_tables.py`), use: ```bash -$ uv run pytest tests/test_tables.py +uv run pytest tests/test_tables.py ``` To run all the tests in a particular class in this module (say, `TestNodeTable`) use: ```bash -$ uv run pytest tests/test_tables.py::TestNodeTable +uv run pytest tests/test_tables.py::TestNodeTable ``` To run a specific test case in this class (say, `test_copy`) use: ```bash -$ uv run pytest tests/test_tables.py::TestNodeTable::test_copy +uv run pytest tests/test_tables.py::TestNodeTable::test_copy ``` +In general, you can copy-paste the string describing a failed test from the +output of pytest to re-run just that test (including specific parametrized +arguments present as `[args]`). + You can also run tests with a keyword expression search. For example this will run all tests that have `TestNodeTable` but not `copy` in their name: ```bash -$ uv run pytest -k "TestNodeTable and not copy" +uv run pytest -k "TestNodeTable and not copy" ``` When developing your own tests, it is much quicker to run the specific tests @@ -558,41 +572,41 @@ suite each time. To run all of the tests, we can use: ```bash -$ uv run pytest +uv run pytest ``` By default the tests are run on 4 cores, if you have more you can specify: ```bash -$ uv run pytest -n8 +uv run pytest -n8 ``` A few of the tests take most of the time, we can skip the slow tests to get the test run under 20 seconds on an modern workstation: ```bash -$ uv run pytest --skip-slow +uv run pytest --skip-slow ``` If you have an agent running the tests in a sandboxed environment, you may need to skip tests thsat require network access or FIFOs: ```bash -$ uv run pytest --skip-network +uv run pytest --skip-network ``` If you have a lot of failing tests it can be useful to have a shorter summary of the failing lines: ```bash -$ uv run pytest --tb=line +uv run pytest --tb=line ``` If you need to see the output of tests (e.g. `print` statements) then you need to use these flags to run a single thread and capture output: ```bash -$ uv run pytest -n0 -vs +uv run pytest -n0 -vs ``` All new code must have high test coverage, which will be checked as part of the @@ -642,7 +656,7 @@ However, if you really need to be on the bleeding edge, you can use the following command to install: ```bash -$ python3 -m pip install git+https://github.com/tskit-dev/tskit.git#subdirectory=python +python3 -m pip install git+https://github.com/tskit-dev/tskit.git#subdirectory=python ``` (Because the Python package is not defined in the project root directory, using pip to @@ -687,13 +701,16 @@ to automatically format code. On Debian/Ubuntu, install the system dependencies with: ```bash -$ sudo apt install libcunit1-dev ninja-build +sudo apt install libcunit1-dev ninja-build ``` -Install meson using uv: +On macOS, you can run `brew install cunit ninja` +or `sudo port install cunit ninja`. + +You can install meson using uv: ```bash -$ uv tool install meson +uv tool install meson ``` An exact version of clang-format is required because formatting rules @@ -716,7 +733,7 @@ with a custom configuration. This is checked as part of the {ref}`prek checks `. To manually format all files run: ```bash -$ uv run prek --all-files +uv run prek --all-files ``` If you are doing this in the ``c`` directory, use @@ -728,7 +745,7 @@ prek searching for configuration within subdirectories. To avoid this, tell prek where to find its config explicitly: ```bash -$ uv run prek --all-files -c prek.toml +uv run prek --all-files -c prek.toml ``` @@ -741,17 +758,19 @@ is defined in `meson.build`. To set up the initial build directory, run ```bash -$ cd c -$ meson setup build +cd c +meson setup build ``` -To setup a debug build add `--buildtype=debug` to the above command. This will set the `TSK_TRACE_ERRORS` +To setup a debug build add `--buildtype=debug` to the above command. +(Re-running the command with this argument will have the desired effect.) +This will set the `TSK_TRACE_ERRORS` flag, which will print error messages to `stderr` when errors occur which is useful for debugging. To compile the code run ```bash -$ ninja -C build +ninja -C build ``` All the tests and other artefacts are in the build directory. Individual test @@ -759,7 +778,7 @@ suites can be run, via (e.g.) `./build/test_trees`. To run all of the tests, run ```bash -$ ninja -C build test +ninja -C build test ``` For vim users, the [mesonic](https://www.vim.org/scripts/script.php?script_id=5378) plugin @@ -790,7 +809,14 @@ To just run a specific test on its own, provide this test name as a command line argument, e.g.: ```bash -$ ./build/test_tables test_node_table +./build/test_tables test_node_table +``` + +After making sure tests pass, you should next run the tests through valgrind, +to check for memory leaks, for instance: + +```bash +valgrind ./build/test_tables test_node_table ``` While 100% test coverage is not feasible for C code, we aim to cover all code @@ -805,20 +831,20 @@ To generate and view coverage reports for the C tests locally: Compile with coverage enabled: ```bash - $ cd c - $ meson build -D b_coverage=true - $ ninja -C build + cd c + meson build -D b_coverage=true + ninja -C build ``` Run the tests: ```bash - $ ninja -C build test + ninja -C build test ``` Generate coverage data: ```bash - $ cd build - $ find ../tskit/*.c -type f -printf "%f\n" | xargs -i gcov -pb libtskit.a.p/tskit_{}.gcno ../tskit/{} + cd build + find ../tskit/*.c -type f -printf "%f\n" | xargs -i gcov -pb libtskit.a.p/tskit_{}.gcno ../tskit/{} ``` The generated `.gcov` files can then be viewed directly with `cat filename.c.gcov`. @@ -826,10 +852,10 @@ Lines prefixed with `#####` were never executed, lines with numbers show executi `lcov` can be used to create browsable HTML coverage reports: ```bash - $ sudo apt-get install lcov # if needed - $ lcov --capture --directory build-gcc --output-file coverage.info - $ genhtml coverage.info --output-directory coverage_html - $ firefox coverage_html/index.html + sudo apt-get install lcov # if needed + lcov --capture --directory build-gcc --output-file coverage.info + genhtml coverage.info --output-directory coverage_html + firefox coverage_html/index.html ``` ### Coding conventions @@ -963,20 +989,20 @@ module and how it is built from source. The module is built automatically by The simplest way to do this is to run `make` in the `python` directory: ```bash -$ make +make ``` If `make` is not available, you can run the same command manually: ```bash -$ uv run python setup.py build_ext --inplace +uv run python setup.py build_ext --inplace ``` It is sometimes useful to specify compiler flags when building the low level module. For example, to make a debug build you can use: ```bash -$ CFLAGS='-Wall -O0 -g' make +CFLAGS='-Wall -O0 -g' make ``` If you need to track down a segfault etc, running some code through gdb can @@ -984,10 +1010,7 @@ be very useful. For example, to run a particular test case, we can do: ```bash -$ gdb python -(gdb) run -m pytest tests/test_python_c.py - - +uv run gdb python (gdb) run -m pytest -vs tests/test_tables.py::TestNodeTable::test_copy Starting program: /usr/bin/python3 run -m pytest tests/test_tables.py::TestNodeTable::test_copy [Thread debugging using libthread_db enabled] @@ -1029,7 +1052,7 @@ Continuous integration is handled by [GitHub Actions](https://help.github.com/en tskit uses shared workflows defined in the [tskit-dev/.github](https://github.com/tskit-dev/.github) repository: -- **lint** — runs prek against all files +- **lint** — runs ruff and clang (using prek) against all files - **python-tests** — runs the pytest suite with coverage on Linux, macOS and Windows - **python-c-tests** — builds the C extension with coverage and runs low-level tests - **c-tests** — runs C unit tests under gcc, clang, and valgrind @@ -1050,6 +1073,9 @@ tskit codebase. Note that this guide covers the most complex case of adding a new function to both the C and Python APIs. +0. Draft a docstring for your function, that describes exactly what the function + takes as arguments and what it returns under what conditions. Update this + docstring as you go along and make modifications. 1. Write your function in Python: in `python/tests/` find the test module that pertains to the functionality you wish to add. For instance, the kc_distance metric was added to @@ -1085,7 +1111,7 @@ the C and Python APIs. the example of other tests, you might need to only add a single line of code here. In this case, the tests are well factored so that we can easily compare the results from both the Python and C versions. -9. Write a docstring for your function in the Python API: for instance, the kc_distance +9. Finalize your docstring and insert it into the Python API: for instance, the kc_distance docstring is in [tskit/python/tskit/trees.py](https://github.com/tskit-dev/tskit/blob/main/python/tskit/trees.py). Ensure that your docstring renders correctly by building the documentation diff --git a/docs/export.md b/docs/export.md index 6dddb73a7c..f5f039e0d1 100644 --- a/docs/export.md +++ b/docs/export.md @@ -41,7 +41,7 @@ If we have a tree sequence file the convenient way to convert to VCF: :::{code-block} bash -$ tskit vcf example.trees > example.vcf +tskit vcf example.trees > example.vcf ::: See the {ref}`sec_export_vcf_compression` section for information @@ -137,14 +137,14 @@ The simplest way to compress the VCF output is to use the and pipe the output to `bgzip`: :::{code-block} bash -$ tskit vcf example.trees | bgzip -c > example.vcf.gz +tskit vcf example.trees | bgzip -c > example.vcf.gz ::: A general way to convert VCF data to various formats is to pipe the text produced by ``tskit`` into ``bcftools`` using the command line interface: :::{code-block} bash -$ tskit vcf example.trees | bcftools view -O b > example.bcf +tskit vcf example.trees | bcftools view -O b > example.bcf ::: If you need more control over the form of the output (or want to work diff --git a/docs/installation.md b/docs/installation.md index 6da52e12a8..ef557c1d02 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -46,7 +46,7 @@ Packages for recent version of Python are available for Linux, OSX and Windows. using: ```bash -$ conda install -c conda-forge tskit +conda install -c conda-forge tskit ``` ### Quick Start @@ -75,7 +75,7 @@ may result in code that is (slightly) faster on your specific hardware. installations. Installation is straightforward: ```bash -$ python3 -m pip install tskit +python3 -m pip install tskit ``` (sec_installation_development_versions)= diff --git a/docs/provenance.md b/docs/provenance.md index d07da38b11..330ad859d4 100644 --- a/docs/provenance.md +++ b/docs/provenance.md @@ -251,4 +251,4 @@ should validate the output JSON against this schema. ```{eval-rst} .. literalinclude:: ../python/tskit/provenance.schema.json :language: json -``` \ No newline at end of file +``` diff --git a/docs/stats.md b/docs/stats.md index 87a2ed5f83..b0f8b7b55f 100644 --- a/docs/stats.md +++ b/docs/stats.md @@ -684,15 +684,397 @@ and boolean expressions (e.g., {math}`(x > 0)`) are interpreted as 0/1. and {math}`v_j` is the covariance of the trait with the j-th covariate. +(sec_stats_multi_site)= + ## Multi site statistics -:::{todo} -Document statistics that use information about correlation between sites, such as -LdCalculator (and perhaps reference {ref}`sec_identity`). Note that if we have a general -framework which has the same calling conventions as the single site stats, -we can rework the sections above. +(sec_stats_two_locus)= + +### Two-locus statistics + +The {meth}`~TreeSequence.ld_matrix` method provides an interface to +a collection of two-locus statistics with predefined summary functions (see +{ref}`sec_stats_two_locus_summary_functions`). +The LD matrix method differs from other +statistics methods in that it provides a unified API with an argument to +specify different two-locus summaries of the data. It otherwise behaves +similarly to most other functions with respect to `sample_sets` and `indexes`. + +Two-locus statistics can be computed using two {ref}`modes `, +either `site` or `branch`, and these should be interpreted in the same way as +these modes in the single-site statistics. That is, the `site` mode computes LD +over observed alleles at pairs of sites, while the `branch` model computes +expected LD conditioned on pairs of trees. + +(sec_stats_two_locus_site)= + +#### Site mode + +The `"site"` mode computes two-locus statistics summarized over alleles between +all pairs of specified sites. The default behavior, leaving `sites` +unspecified, will compute a matrix for all pairs of sites, with one row and +column for each site in the tree sequence (i.e., an {math}`n \times n` matrix +where {math}`n` is the number of sites in the tree sequence). We can also +restrict the output to a subset of sites, either by specifying a single vector +of site indexes for both rows and columns or a pair of vectors for the row +sites and column sites separately. + +The following computes a matrix of the {math}`r^2` measure of linkage +disequilibrium (LD) computed pairwise between the first 4 sites in the tree +sequence among all samples. The `sites` must be given as a list of lists, and +with a single list of sites specified, we obtain a symmetric square matrix. + +```{code-cell} ipython3 +ld = ts.ld_matrix(sites=[[0, 1, 2, 3]]) +print(ld) +``` + +If a list of two lists of site indexes is provided, these specify the row and +column sites. For instance, here we specify 2 rows and 3 columns, which +computes a subset of the matrix shown above. + +```{code-cell} ipython3 +ld = ts.ld_matrix(sites=[[1, 2], [1, 2, 3]]) +print(ld) +``` + +(sec_stats_two_locus_computational_details)= + +#### Computational details + +Because we allow for two-locus statistics to be computed for multi-allelic +data, we need to be able to combine statistical results from each pair of +alleles into one summary for a pair of sites. This does not affect biallelic +data (and so this section can be skipped on first reading). +We use two implementations for +combining results from multiple alleles: `hap_weighted` and `total_weighted`. +These are statistic-specific and not chosen by the user, with choices motivated +by [Zhao (2007)](https://doi.org/10.1017/S0016672307008634). + +Briefly, consider a pair of sites with {math}`n` alleles at the first locus and +{math}`m` alleles at the second. (Whether this includes the ancestral allele +depends on whether the statistic is polarised.) Write {math}`f_{ij}` as the +statistic computed for focal alleles {math}`A_i` and {math}`B_j`. Then the +weighting schemes are defined as: + +- `hap_weighted`: {math}`\sum_{i=1}^{n}\sum_{j=1}^{m}p(A_{i}B_{j})f_{ij}`, + where {math}`p(A_{i}B_{j})` is the frequency of haplotype {math}`A_{i}B_{j}`. + This method was first introduced in [Karlin + (1981)](https://doi.org/10.1111/j.1469-1809.1981.tb00308.x) and reviewed in + [Zhao (2007)](https://doi.org/10.1017/S0016672307008634). + +- `total_weighted`: {math}`\frac{1}{n m}\sum_{i=1}^{n}\sum_{j=1}^{m}f_{ij}`. + This method assigns equal weight to each of the possible pairs of focal + alleles at the two sites, taking the arithmetic mean of statistics over + focal haplotypes. + +Out of all of the available summary functions, only {math}`r^2` uses +`hap_weighted` normalisation, with the remainder using uniform weighting +(`total_weighted`). + +Within this framework, statistics may be either polarised or unpolarised. For +statistics that are polarised, we compute statistic values for pairs of derived +alleles. (For this purpose, the "derived" alleles at a site are all alleles +except that stored as the ``ancestral_state`` for the site.) Unpolarised +statistics compute statistics over all pairs of alleles, derived and ancestral. +In either case, the result is averaged over these values, using one of the +weighting scheme (described below for each statistics). The option for +polarisation is not exposed to the user, and we list which statistics are +polarised below. + +(sec_stats_two_locus_branch)= + +#### Branch mode + +The `"branch"` mode computes expected two-locus statistics between pairs of +trees, conditioned on the marginal topologies and branch lengths of those +trees. The trees for which we compute statistics are specified by positions, +and for a pair of positions we consider all possible haplotypes that could be +generated by a single mutation occurring on each of the two trees. + +For two trees, one with {math}`n` branches and the other with {math}`m` +branches, there are {math}`nm` possible pairs of branches that may carry the +pair of mutations. For each pair, we compute the two-locus statistic, and then +sum these values weighted by the product of the two branch lengths. Given that +the two mutations occur, this accounts for the relative probability that the +two mutations fall on any pair of branches. + +In other words, imagine we place two mutations uniformly, one on each tree, and +then compute the statistic. The branch mode computes the expected value of the +statistic over this process, multiplied by the product of the total branch +lengths of each tree. This weighting accounts for mutational opportunity, so that +the sum of the branch-mode statistic over all positions in a genomic region, +multiplied by a mutation rate, is equal to the expected sum of the two-locus site +statistic over all mutations falling in that region under an infinite-sites model. + +The time complexity of this method is quadratic in the number of samples, +due to the pairwise comparisons of branches from each pair of trees. +By default, this method computes +a symmetric matrix for all pairs of trees, with rows and columns representing +each tree in the tree sequence. Similar to the site method, we can restrict the +output to a subset of trees, either by specifying a vector of positions or +a pair of vectors for row and column positions separately. To select a specific +tree, the specified positions must land in the tree span (`[start, end)`). + +In the following, we compute a matrix of expected {math}`r^2` within and +between the first 4 trees in the tree sequence. The tree breakpoints are +a convenient way to specify those first four trees. + +```{code-cell} ipython3 +ld = ts.ld_matrix( + mode="branch", + positions=[ts.breakpoints(as_array=True)[0:4]] +) +print(ld) +``` + +We note that these values are quite large: as described above, the statistic is +scaled by the product of the total branch lengths of each pair of trees. To +compute the expected {math}`r^2` value for a pair of mutations that each land +uniformly on the pair of trees, we can divide by the product of the total +branch lengths: + +```{code-cell} ipython3 +total_branch_lengths = [tree.total_branch_length for tree in ts.trees()] +prod_branch_lengths = np.outer(total_branch_lengths, total_branch_lengths) +print(ld / prod_branch_lengths[0:4, 0:4]) +``` + +To compute the average {math}`r^2` for a uniformly chosen pair of mutations, we also +weight by tree span: + +```{code-cell} ipython3 +tree_spans = np.array([t.span for t in ts.trees()]) +total_opportunity = np.sum(tree_spans * total_branch_lengths) +all_ld = ts.ld_matrix(mode="branch") +mean_ld = np.sum(all_ld * np.outer(tree_spans, tree_spans)) / total_opportunity ** 2 +print("mean infinite-sites LD:", mean_ld) +``` + +As with the `"site"` mode above, we can specify the row and column trees +separately. + +```{code-cell} ipython3 +breakpoints = ts.breakpoints(as_array=True) +ld = ts.ld_matrix( + mode="branch", + positions=[breakpoints[[0]], breakpoints[0:4]] +) +print(ld) +``` + +(sec_stats_two_locus_sample_sets)= + +#### Sample Sets + +Without specifying `sample_sets` or `indexes`, the `ld_matrix()` method +computes statistics over a single sample set that includes all samples in the +tree sequence. The API allows for the specification of a subset or multiple +subsets of samples, so that a separate LD matrix can be computed for each. If +`sample_sets` is specified as a single list of samples, then a single LD matrix +is returned. A list of lists of samples will return a 3D array containing an LD +matrix for each list of samples. + +Some LD statistics can be computed between sample sets (two-way statistics are +specified below), in which case `indexes` must be specified that reference the +indexes of the `sample_sets`, which must be a list of lists of sample nodes. +This results in an LD matrix computed for each list of indexes. The statistics +are selected in the same way (with the `stat` argument), and these are limited +to a handful of statistics (see +{ref}`sec_stats_two_locus_summary_functions_two_way`). The dimension-dropping +rules for the result follow the rest of the tskit stats API in that a single +list or tuple will produce a single two-dimensional matrix, while a list of +these will produce a three-dimensional array, with the first dimension of +length equal to the length of the list. + +For example, to compute the {math}`r^2` LD matrix over a subset of samples in +the tree sequence (such as sample nodes 0 through 7), we would specify the +samples as follows: + +```{code-cell} ipython3 +ts = msprime.sim_ancestry( + 20, + population_size=10000, + sequence_length=1000, + recombination_rate=2e-8, + random_seed=12) +ts = msprime.sim_mutations(ts, rate=2e-8, random_seed=12) + +ld = ts.ld_matrix(mode="site", sample_sets=range(8)) +print(ld) +``` + +We would get the following dimensions with the specified +`sample_sets` and `indexes` arguments. + +``` +# one-way +ts.ld_matrix(sample_sets=None) # -> 2 dimensions +ts.ld_matrix(sample_sets=[0, 1, 2, 3]) # -> 2 dimensions +ts.ld_matrix(sample_sets=[[0, 1, 2, 3]]) # -> 3 dimensions +# two-way +ts.ld_matrix(sample_sets=[[0, 1, 2, 3], [4, 5, 6, 7]], indexes=(0, 1)) # -> 2 dimensions +ts.ld_matrix(sample_sets=[[0, 1, 2, 3], [4, 5, 6, 7]], indexes=[(0, 1)]) # -> 3 dimensions +``` + +#### Why are there `nan` values in the LD matrix? + +For some statistics, it is possible to observe `nan` entries in the LD matrix, +which can be surprising and may numerically impact downstream analyses. A `nan` +entry occurs if the denominator of a ratio statistic (including {math}`r` and +{math}`r^2`) is zero, indicating that one or both of the alleles in the pair is +fixed or absent in the given sample set(s). This can happen for +a number of reasons: + +- Some mutation models allow for reversible mutations, so a back mutation at + a site can result in a single allele despite multiple mutations in the + history of the sample. +- LD is computed for a subsample of individuals, and some sites are not + variable among the sample nodes in the subsample. +- A mutation exists above the root of the local tree, so that all samples carry + the mutation, and one or more sites are not variable. + +The `branch` mode will also return `nan` values for ratio statistics if there +are branches in either tree on which a mutation would not result in +a polymorphism within a sample set. + +:::{warning} +This means there are two common situations in which many or all LD values will be `nan`. +These are: + +1. A branch-mode ratio statistic computed on less than the full set of samples + will always be `nan`, since part of the trees are ancestral to none of the samples. +2. A site-mode ratio statistic will be `nan` at any sites at which there are alleles found + in the entire set of samples that are not seen in the provided sample set. + +This behavior **may change in the future**, +because possibly more natural behavior not currently implemented +would be to ignore the branches/alleles not ancestral +to any of the provided samples. ::: +(sec_stats_two_locus_sample_one_way_stats)= + +#### One-way Statistics + +One-way statistics are summaries of two loci in a single sample set, using +a triple of haplotype counts {math}`\{n_{AB}, n_{Ab}, n_{aB}\}` and the size of +the sample set {math}`n`, where the capitalized and lowercase letters in our +notation represent alternate alleles. + +(sec_stats_two_locus_sample_two_way_stats)= + +#### Two-way Statistics + +Two-way statistics are summaries of haplotype counts between two sample sets, +which operate on the three haplotype counts (as in one-way stats, above) +computed from each sample set, indexed by `(i, j)`. These statistics take on +a different meaning from their one-way counterparts. For instance `stat="D2"` +over a pair of sample sets computes {math}`D_i D_j`, which is the product of +the covariance measure of LD within each sample set and is related to the +covariance of {math}`D` between sample sets. + +Only a subset of our summary functions are two-way statistics (see +{ref}`sec_two_locus_summary_functions_two_way`). Note that the unbiased two-way +statistics expect non-overlapping sample sets (see [Ragsdale and Gravel +(2020)](https://doi.org/10.1093/molbev/msz265)), and we do not make any +assertions about the sample sets and assume that `i` and `j` represent disjoint +sets of samples (see also the note in {meth}`~TreeSequence.divergence`). + +(sec_stats_two_locus_summary_functions)= + +#### Summary Functions + +(sec_stats_two_locus_summary_functions_one_way)= + +##### One-way + +The two-locus summary functions all take haplotype counts and sample set size +as input. Each of our summary functions has the signature +{math}`f(n_{AB}, n_{Ab}, n_{aB}, n)`, converting to haplotype frequencies +{math}`\{p_{AB}, p_{Ab}, p_{aB}\}` by dividing by {math}`n`. Below, +{math}`n_{ab} = n - n_{AB} - n_{Ab} - n_{aB}`, {math}`n_A = n_{AB} + n_{Ab}` +and {math}`n_B = n_{AB} + n_{aB}`, with frequencies {math}`p` found by dividing +by {math}`n`. + +Our convention is to use {math}`A,B` to denote derived alleles, and {math}`a,b` +ancestral alleles (or other alleles, if the site is multi-allelic). For +polarised statistics, we average statistics over all non-ancestral alleles. For +unpolarised statistics, the labeling is arbitrary as we average over all +alleles (derived and ancestral). + +`D` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = p_{AB}p_{ab} - p_{Ab}p_{aB} \, (=p_{AB} - p_A p_B)` + + This statistic is polarised, as the unpolarised result, which averages over + allele labelings, is zero. Uses the `total` weighting method. + +`D_prime` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = \frac{D}{D_{\max}}`, + + where {math}`D_{\max} = \begin{cases} + \min\{p_A (1-p_B), p_B (1-p_B)\} & \textrm{if }D>=0 \\ + \min\{p_A p_B, (1-p_B) (1-p_B)\} & \textrm{if }D<0 + \end{cases}` + + and {math}`D` is defined above. Polarised, `total` weighted. + +`D2` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = D^2` + + and {math}`D` is defined above. Unpolarised, `total` weighted. + +`Dz` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = D (1 - 2 p_A) (1 - 2 p_B)`, + + where {math}`D` is defined above. Unpolarised, `total` weighted. + +`pi2` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = p_A (1-p_A) p_B (1-p_B)` + + Unpolarised, `total` weighted. + +`r` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = \frac{D}{\sqrt{p_A (1-p_A) p_B (1-p_B)}}`, + + where {math}`D` is defined above. Polarised, `total` weighted. + +`r2` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = \frac{D^{2}}{p_A (1-p_A) p_B (1-p_B))}`, + + where {math}`D` is defined above. Unpolarised, `haplotype` weighted. + +Unbiased two-locus statistics from the Hill-Robertson (1968) system are +computed from haplotype counts. Definitions of these unbiased estimators can +be found in [Ragsdale and Gravel +(2020)](https://doi.org/10.1093/molbev/msz265). They require at least 4 samples +to be valid and are specified as `stat="D2_unbiased"`, `"Dz_unbiased"`, or +`"pi2_unbiased"`. + +(sec_two_locus_summary_functions_two_way)= + +(sec_stats_two_locus_summary_functions_two_way)= + +##### Two-way + +Two-way statistics are indexed by sample sets {math}`i, j` and compute values +using haplotype counts within pairs of sample sets. + +`D2` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = D_i D_j`, + + where {math}`D_i` denotes {math}`D` computed within sample set {math}`i`, + and {math}`D` is defined above. Unpolarised, `total` weighted. + +`r2` +: {math}`f(n_{AB}, n_{Ab}, n_{aB}, n) = r_i r_j`, + + where {math}`r_i` denotes {math}`r` computed within sample set {math}`i`, + and {math}`r` is defined above. Unpolarised, `haplotype` weighted. + +And `D2_unbiased`, which can be found in [Ragsdale and Gravel +(2020)](https://doi.org/10.1093/molbev/msz265). + (sec_stats_notes)= diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 916412cc81..bc9b6217da 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -7,6 +7,12 @@ In development - Add ``json+struct`` metadata codec that allows storing binary data using a struct schema alongside JSON metadata. (:user:`benjeffery`, :pr:`3306`) +**Features** + +- Add ``TreeSequence.ld_matrix`` stats method and documentation, for computing + two-locus statistics in site and branch mode. + (:user:`lkirk`, :user:`apragsdale`, :pr:`3416`) + -------------------- [1.0.2] - 2026-03-06 -------------------- diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 0e0c1c5ed5..d61776c74d 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -7916,6 +7916,7 @@ parse_sites(TreeSequence *self, PyObject *sites, npy_intp *out_dim) if (array == NULL) { goto out; } + PyArray_CLEARFLAGS(array, NPY_ARRAY_WRITEABLE); *out_dim = PyArray_DIM(array, 0); } @@ -7940,12 +7941,289 @@ parse_positions(TreeSequence *self, PyObject *positions, npy_intp *out_dim) if (array == NULL) { goto out; } + PyArray_CLEARFLAGS(array, NPY_ARRAY_WRITEABLE); *out_dim = PyArray_DIM(array, 0); } out: return array; } +typedef struct { + PyArrayObject *sample_set_sizes; + PyObject *summary_func; + PyObject *norm_func; +} two_locus_general_stat_params; + +static int +general_two_locus_norm_func(tsk_size_t K, const double *X, tsk_size_t result_dim, + tsk_size_t n_a, tsk_size_t n_b, double *Y, void *params) +{ + int ret = TSK_PYTHON_CALLBACK_ERROR; + PyObject *arglist = NULL; + PyObject *result = NULL; + PyArrayObject *n_a_scalar = NULL; + PyArrayObject *n_b_scalar = NULL; + PyArrayObject *X_array = NULL; + PyArrayObject *Y_array = NULL; + two_locus_general_stat_params *tl_params = params; + PyObject *summary_func = tl_params->norm_func; + PyArrayObject *ss_sizes = tl_params->sample_set_sizes; + npy_intp X_dims[2] = { 3, K }; + npy_intp X_strides[2] = { sizeof(double), sizeof(double) * 3 }; + + // Create a read only view of X as a numpy array. X is transposed from its + // native memory layout (K, 3) -> (3, K). More detailed comment below. + X_array = (PyArrayObject *) PyArray_New(&PyArray_Type, 2, X_dims, NPY_FLOAT64, + X_strides, (void *) X, -1, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_ALIGNED, NULL); + if (X_array == NULL) { + goto out; + } + n_a_scalar + = (PyArrayObject *) PyArray_Scalar(&n_a, PyArray_DescrFromType(NPY_INT64), NULL); + if (n_a_scalar == NULL) { + goto out; + } + n_b_scalar + = (PyArrayObject *) PyArray_Scalar(&n_b, PyArray_DescrFromType(NPY_INT64), NULL); + if (n_b_scalar == NULL) { + goto out; + } + arglist = Py_BuildValue("OOOO", X_array, ss_sizes, n_a_scalar, n_b_scalar); + if (arglist == NULL) { + goto out; + } + result = PyObject_CallObject(summary_func, arglist); + if (result == NULL) { + goto out; + } + Y_array = (PyArrayObject *) PyArray_FromAny( + result, PyArray_DescrFromType(NPY_FLOAT64), 0, 0, NPY_ARRAY_IN_ARRAY, NULL); + if (Y_array == NULL) { + goto out; + } + if (PyArray_NDIM(Y_array) != 1) { + PyErr_Format(PyExc_ValueError, + "Array returned by norm function callback is %d dimensional; " + "must be 1D", + (int) PyArray_NDIM(Y_array)); + goto out; + } + if (PyArray_DIM(Y_array, 0) != (npy_intp) result_dim) { + PyErr_Format(PyExc_ValueError, + "Array returned by norm function callback is of length %d; must be %d", + PyArray_DIM(Y_array, 0), result_dim); + goto out; + } + /* Copy the contents of the return Y array into Y */ + memcpy(Y, PyArray_DATA(Y_array), result_dim * sizeof(*Y)); + ret = 0; +out: + Py_XDECREF(X_array); + Py_XDECREF(arglist); + Py_XDECREF(result); + Py_XDECREF(Y_array); + Py_XDECREF(n_a_scalar); + Py_XDECREF(n_b_scalar); + return ret; +} + +static int +general_two_locus_count_stat_func( + tsk_size_t K, const double *X, tsk_size_t result_dim, double *Y, void *params) +{ + int ret = TSK_PYTHON_CALLBACK_ERROR; + PyObject *arglist = NULL; + PyObject *result = NULL; + PyArrayObject *X_array = NULL; + PyArrayObject *Y_array = NULL; + two_locus_general_stat_params *tl_params = params; + PyObject *summary_func = tl_params->summary_func; + PyArrayObject *ss_sizes = tl_params->sample_set_sizes; + npy_intp X_dims[2] = { 3, K }; + npy_intp X_strides[2] = { sizeof(double), sizeof(double) * 3 }; + + // Create a transposed, read only view of X as a numpy array. The native + // memory layout of X is (K, 3), we wrap it in a numpy array with dimensions + // (3, K), creating row arrays of haplotype counts so that we can easily + // decompose the results. For example: `pAB, pAb, paB = X / n` which works + // with K>1. Itemsize is -1 because we specify the dtype. NB: we do not set + // NPY_ARRAY_WRITEABLE, so X_array is read only. + X_array = (PyArrayObject *) PyArray_New(&PyArray_Type, 2, X_dims, NPY_FLOAT64, + X_strides, (void *) X, -1, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_ALIGNED, NULL); + if (X_array == NULL) { + goto out; + } + arglist = Py_BuildValue("OO", X_array, ss_sizes); + if (arglist == NULL) { + goto out; + } + result = PyObject_CallObject(summary_func, arglist); + if (result == NULL) { + goto out; + } + Y_array = (PyArrayObject *) PyArray_FromAny( + result, PyArray_DescrFromType(NPY_FLOAT64), 0, 0, NPY_ARRAY_IN_ARRAY, NULL); + if (Y_array == NULL) { + goto out; + } + if (PyArray_NDIM(Y_array) != 1) { + PyErr_Format(PyExc_ValueError, + "Array returned by summary function callback is %d dimensional; must be 1D", + (int) PyArray_NDIM(Y_array)); + goto out; + } + if (PyArray_DIM(Y_array, 0) != (npy_intp) result_dim) { + PyErr_Format(PyExc_ValueError, + "Array returned by summary function callback is of length %d; must be %d", + PyArray_DIM(Y_array, 0), result_dim); + goto out; + } + /* Copy the contents of the return Y array into Y */ + memcpy(Y, PyArray_DATA(Y_array), result_dim * sizeof(*Y)); + ret = 0; +out: + Py_XDECREF(X_array); + Py_XDECREF(arglist); + Py_XDECREF(result); + Py_XDECREF(Y_array); + return ret; +} + +static PyObject * +TreeSequence_two_locus_count_stat(TreeSequence *self, PyObject *args, PyObject *kwds) +{ + PyObject *ret = NULL; + static char *kwlist[] = { "sample_set_sizes", "sample_sets", "summary_func", + "norm_func", "output_dim", "polarised", "row_sites", "col_sites", + "row_positions", "column_positions", "mode", NULL }; + two_locus_general_stat_params *params; + PyObject *summary_func = NULL; + PyObject *norm_func = NULL; + unsigned int output_dim; + PyObject *sample_set_sizes = NULL; + PyObject *sample_sets = NULL; + PyObject *row_sites = NULL; + PyObject *col_sites = NULL; + PyObject *row_positions = NULL; + PyObject *col_positions = NULL; + char *mode = NULL; + PyArrayObject *sample_set_sizes_array = NULL; + PyArrayObject *sample_sets_array = NULL; + PyArrayObject *row_sites_array = NULL; + PyArrayObject *col_sites_array = NULL; + PyArrayObject *row_positions_array = NULL; + PyArrayObject *col_positions_array = NULL; + PyArrayObject *result_matrix = NULL; + tsk_id_t *row_sites_parsed = NULL; + tsk_id_t *col_sites_parsed = NULL; + double *row_positions_parsed = NULL; + double *col_positions_parsed = NULL; + npy_intp result_dim[3] = { 0, 0, 0 }; + tsk_size_t num_sample_sets; + tsk_flags_t options = 0; + int polarised = 0; + int err; + + if (TreeSequence_check_state(self) != 0) { + goto out; + } + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOOIiOOOO|s", kwlist, + &sample_set_sizes, &sample_sets, &summary_func, &norm_func, &output_dim, + &polarised, &row_sites, &col_sites, &row_positions, &col_positions, &mode)) { + Py_XINCREF(summary_func); + Py_XINCREF(norm_func); + goto out; + } + Py_INCREF(summary_func); + Py_INCREF(norm_func); + if (!PyCallable_Check(summary_func)) { + PyErr_SetString(PyExc_TypeError, "summary_func must be callable"); + goto out; + } + if (!PyCallable_Check(norm_func)) { + PyErr_SetString(PyExc_TypeError, "norm_func must be callable"); + goto out; + } + if (parse_stats_mode(mode, &options) != 0) { + goto out; + } + if (polarised) { + options |= TSK_STAT_POLARISED; + } + + if (parse_sample_sets(sample_set_sizes, &sample_set_sizes_array, sample_sets, + &sample_sets_array, &num_sample_sets) + != 0) { + goto out; + } + PyArray_CLEARFLAGS(sample_set_sizes_array, NPY_ARRAY_WRITEABLE); + + if (options & TSK_STAT_SITE) { + if (row_positions != Py_None || col_positions != Py_None) { + PyErr_SetString(PyExc_ValueError, "Cannot specify positions in site mode"); + goto out; + } + row_sites_array = parse_sites(self, row_sites, &(result_dim[0])); + col_sites_array = parse_sites(self, col_sites, &(result_dim[1])); + if (row_sites_array == NULL || col_sites_array == NULL) { + goto out; + } + row_sites_parsed = PyArray_DATA(row_sites_array); + col_sites_parsed = PyArray_DATA(col_sites_array); + } else if (options & TSK_STAT_BRANCH) { + if (row_sites != Py_None || col_sites != Py_None) { + PyErr_SetString(PyExc_ValueError, "Cannot specify sites in branch mode"); + goto out; + } + row_positions_array = parse_positions(self, row_positions, &(result_dim[0])); + col_positions_array = parse_positions(self, col_positions, &(result_dim[1])); + if (col_positions_array == NULL || row_positions_array == NULL) { + goto out; + } + row_positions_parsed = PyArray_DATA(row_positions_array); + col_positions_parsed = PyArray_DATA(col_positions_array); + } + + result_dim[2] = output_dim; + result_matrix = (PyArrayObject *) PyArray_ZEROS(3, result_dim, NPY_FLOAT64, 0); + if (result_matrix == NULL) { + PyErr_NoMemory(); + goto out; + } + + params = &(two_locus_general_stat_params) { + .sample_set_sizes = sample_set_sizes_array, + .summary_func = summary_func, + .norm_func = norm_func, + }; + err = tsk_treeseq_two_locus_general_count_stat(self->tree_sequence, num_sample_sets, + PyArray_DATA(sample_set_sizes_array), PyArray_DATA(sample_sets_array), + output_dim, general_two_locus_count_stat_func, params, + general_two_locus_norm_func, result_dim[0], row_sites_parsed, + row_positions_parsed, result_dim[1], col_sites_parsed, col_positions_parsed, + options, PyArray_DATA(result_matrix)); + + if (err == TSK_PYTHON_CALLBACK_ERROR) { + goto out; + } else if (err != 0) { + handle_library_error(err); + goto out; + } + ret = (PyObject *) result_matrix; + result_matrix = NULL; +out: + Py_XDECREF(summary_func); + Py_XDECREF(norm_func); + Py_XDECREF(row_sites_array); + Py_XDECREF(col_sites_array); + Py_XDECREF(row_positions_array); + Py_XDECREF(col_positions_array); + Py_XDECREF(sample_sets_array); + Py_XDECREF(sample_set_sizes_array); + Py_XDECREF(result_matrix); + return ret; +} + static PyObject * TreeSequence_ld_matrix(TreeSequence *self, PyObject *args, PyObject *kwds, two_locus_count_stat_method *method) @@ -8831,6 +9109,11 @@ static PyMethodDef TreeSequence_methods[] = { .ml_meth = (PyCFunction) TreeSequence_general_stat, .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = "Runs the general stats algorithm for a given summary function." }, + { .ml_name = "two_locus_count_stat", + .ml_meth = (PyCFunction) TreeSequence_two_locus_count_stat, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc + = "Runs the general two locus stats algorithm for a given summary function." }, { .ml_name = "diversity", .ml_meth = (PyCFunction) TreeSequence_diversity, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/test_ld_matrix.py b/python/tests/test_ld_matrix.py index 4d6e47ddcc..584e1a4791 100644 --- a/python/tests/test_ld_matrix.py +++ b/python/tests/test_ld_matrix.py @@ -2063,38 +2063,36 @@ def compute_branch_stat( ts for ts in get_example_tree_sequences() if ts.id - not in { - "no_samples", - "empty_ts", - # We must skip these cases so that tests run in a reasonable - # amount of time. To get more complete testing, these filters - # can be commented out. (runtime ~1hr) - "gap_0", - "gap_0.1", - "gap_0.5", - "gap_0.75", - "n=2_m=32_rho=0", - "n=10_m=1_rho=0", - "n=10_m=1_rho=0.1", - "n=10_m=2_rho=0", - "n=10_m=2_rho=0.1", - "n=10_m=32_rho=0", - "n=10_m=32_rho=0.1", - "n=10_m=32_rho=0.5", + in { + # We run only these cases so that tests run in a reasonable + # amount of time. All examples takes ~1hr. + "decapitate_recomb", + "gap_at_end", + "all_nodes_samples", + "internal_nodes_samples", + "mixed_internal_leaf_samples", + "bottleneck_n=3_mutated", + "bottleneck_n=10_mutated", + "rev_node_order", + "empty_tree", + "n=3_m=2_rho=0.5", + "n=3_m=32_rho=0", + "n=3_m=32_rho=0.1", + "n=2_m=1_rho=0", + "n=2_m=1_rho=0.1", + "n=2_m=1_rho=0.5", + "n=2_m=2_rho=0", + "n=2_m=2_rho=0.1", + "n=2_m=2_rho=0.5", + "n=2_m=32_rho=0.1", + "n=2_m=32_rho=0.5", + "n=3_m=1_rho=0", + "n=3_m=1_rho=0.5", + "n=3_m=2_rho=0", + "n=10_m=1_rho=0.5", + "n=10_m=2_rho=0.5", # we keep one n=100 case to ensure bit arrays are working - "n=100_m=1_rho=0.1", - "n=100_m=1_rho=0.5", - "n=100_m=2_rho=0", - "n=100_m=2_rho=0.1", - "n=100_m=2_rho=0.5", - "n=100_m=32_rho=0", - "n=100_m=32_rho=0.1", - "n=100_m=32_rho=0.5", - "all_fields", - "back_mutations", - "multichar", - "multichar_no_metadata", - "bottleneck_n=100_mutated", + "n=100_m=1_rho=0", } ], ) @@ -2398,3 +2396,386 @@ def test_multipopulation_r2_varying_unequal_set_sizes(genotypes, sample_sets, ex norm_hap_weighted_ij(1, state, max(a) + 1, max(b) + 1, norm[i, j], params) np.testing.assert_allclose((result * norm).sum(), expected) + + +class GeneralStatFuncs: + """ + Summary functions take X, n as parameters where X is a matrix of haplotype + counts per sample set and n is a vector of sample set sizes. X has shape (3, k) + and n has shape (k, ), where k is the number of sample sets. The rows of X + contain haplotype counts for AB, Ab, aB (capitalized == derived). + + X: shape=(3, k) + sample sets + count AB [[ #ss1, #ss2, ... ] + count Ab [ #ss1, #ss2, ... ] + count aB [ #ss1, #ss2, ... ]] + + n: shape=(k, ) + [ #ss1, #ss2, ... ] + """ + + @staticmethod + def D(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + return pAB - (pA * pB) + + @staticmethod + def D2(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + return (pAB - (pA * pB)) ** 2 + + @staticmethod + def r2(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + D = pAB - (pA * pB) + denom = pA * pB * (1 - pA) * (1 - pB) + with suppress_overflow_div0_warning(): + return D**2 / denom + + @staticmethod + def r(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + D = pAB - (pA * pB) + denom = pA * pB * (1 - pA) * (1 - pB) + with suppress_overflow_div0_warning(): + return D / np.sqrt(denom) + + @staticmethod + def D_prime(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + D = pAB - (pA * pB) + denom = np.vstack( + [ + np.min([pA * (1 - pB), (1 - pA) * pB], axis=0), + np.min([pA * pB, (1 - pA) * (1 - pB)], axis=0), + ] + ) + with suppress_overflow_div0_warning(): + return D / denom[(D < 0).astype(int), range(len(D))] + + @staticmethod + def Dz(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + D = pAB - (pA * pB) + return D * (1 - 2 * pA) * (1 - 2 * pB) + + @staticmethod + def pi2(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + return pA * (1 - pA) * pB * (1 - pB) + + @staticmethod + def D2_unbiased(X, n): + AB, Ab, aB = X + ab = n - X.sum(0) + return ( + ((aB**2) * (Ab - 1) * Ab) + + ((ab - 1) * ab * (AB - 1) * AB) + - (aB * Ab * (Ab + (2 * ab * AB) - 1)) + ) / (n * (n - 1) * (n - 2) * (n - 3)) + + @staticmethod + def Dz_unbiased(X, n): + AB, Ab, aB = X + ab = n - X.sum(0) + return ( + (((AB * ab) - (Ab * aB)) * (aB + ab - AB - Ab) * (Ab + ab - AB - aB)) + - ((AB * ab) * (AB + ab - Ab - aB - 2)) + - ((Ab * aB) * (Ab + aB - AB - ab - 2)) + ) / (n * (n - 1) * (n - 2) * (n - 3)) + + @staticmethod + def pi2_unbiased(X, n): + AB, Ab, aB = X + ab = n - X.sum(0) + return ( + ((AB + Ab) * (aB + ab) * (AB + aB) * (Ab + ab)) + - ((AB * ab) * (AB + ab + (3 * Ab) + (3 * aB) - 1)) + - ((Ab * aB) * (Ab + aB + (3 * AB) + (3 * ab) - 1)) + ) / (n * (n - 1) * (n - 2) * (n - 3)) + + # Two-way statistics have the _ij suffix. + @staticmethod + def r2_ij(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + # keepdims preserves the output shape of (1, ) + D2_ij = np.prod(pAB - (pA * pB), keepdims=True) + denom = np.prod(np.sqrt(pA * pB * (1 - pA) * (1 - pB)), keepdims=True) + with suppress_overflow_div0_warning(): + return D2_ij / denom + + @staticmethod + def D2_ij(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + return np.prod(pAB - (pA * pB), keepdims=True) + + @staticmethod + def D2_ij_unbiased(X, n): + AB, Ab, aB = X + ab = n - X.sum(0) + return [ + (Ab[0] * aB[0] - AB[0] * ab[0]) + * (Ab[1] * aB[1] - AB[1] * ab[1]) + / (n[0] * (n[0] - 1) * n[1] * (n[1] - 1)) + ] + + @staticmethod + def D2_ii_ij_jj_unbiased(X, n): + AB, Ab, aB = X + ab = n - X.sum(0) + + # unbiased estimator for equal sample sets + ii, jj = ( + AB * (AB - 1) * ab * (ab - 1) + + Ab * (Ab - 1) * aB * (aB - 1) + - 2 * AB * Ab * aB * ab + ) / (n * (n - 1) * (n - 2) * (n - 3)) + # unbiased estimator for disjoint sample sets + ij = ( + (Ab[0] * aB[0] - AB[0] * ab[0]) + * (Ab[1] * aB[1] - AB[1] * ab[1]) + / (n[0] * (n[0] - 1) * n[1] * (n[1] - 1)) + ) + return [ii, ij, jj] + + +class GeneralStatNormFuncs: + @staticmethod + def hap_norm(X, n, nA, nB): + """Stat from 1 sample set -> 1 result""" + return X[0] / n + + @staticmethod + def k_way_hap_norm(X, n, nA, nB): + """Stat from k sample sets -> 1 result""" + return X[0].sum(keepdims=True) / n.sum() + + @staticmethod + def assert_no_norm_func(*_): + """Normalisation is not required in branch mode and with biallelic sites.""" + raise Exception("Normalisation function should not be called") + + @classmethod + def choose(cls, stat, mode, ts): + """ + Choose norm function based on stat, mode, presence of multiallelic sites + """ + is_multiallelic = max({len(s.mutations) for s in ts.sites()}) > 1 + match (stat, mode, is_multiallelic): + case ("r2", "site", True): + return cls.hap_norm + case ("r2_ij", "site", True): + return cls.k_way_hap_norm + case (_, "branch", _): # branch stats do not need a norm func + return cls.assert_no_norm_func + case (_, _, False): # biallelic sites should not use the norm func + return cls.assert_no_norm_func + case _: # total_norm is default (1 / (nA * nB)). handles multi-way stats + return None + + +@pytest.fixture(scope="module") +def ts_10_samp_with_sites_fixture(): + ts = tsutil.get_sim_example( + sample_size=10, + sequence_length=15, + recombination_rate=0.1, + mutation_rate=0.1, + seed=123, + ) + assert ts.num_sites > 0, "sites are required" + assert ts.num_samples == 10 # Samples directly indexed in tests below + assert max({len(s.mutations) for s in ts.sites()}) == 1, "sites must be biallelic" + return ts + + +@pytest.fixture(scope="module") +def ts_multiallelic_fixture(): + ts = msprime.sim_mutations( + msprime.sim_ancestry( + 2, recombination_rate=0.1, sequence_length=100, random_seed=123 + ), + rate=0.1, + random_seed=123, + ) + # Need at least 4 samples to test unbiased statistics + assert ts.num_samples >= 4, "At least 4 samples required" + assert max({len(s.mutations) for s in ts.sites()}) > 1, ( + "At least one multiallelic site required" + ) + return ts + + +@pytest.fixture(scope="module") +def ts_no_sites_fixture(): + ts = msprime.sim_ancestry( + 2, recombination_rate=0.1, sequence_length=100, random_seed=123 + ) + assert ts.num_sites == 0 + return ts + + +@pytest.mark.parametrize("mode", ["site", "branch"]) +@pytest.mark.parametrize( + "ts", + [ts for ts in get_example_tree_sequences() if ts.id in {"no_samples", "empty_ts"}], +) +def test_general_empty_ts(mode, ts): + with pytest.raises(ValueError, match="at least one element"): + ts.two_locus_count_stat([ts.samples()], GeneralStatFuncs.D, 1, mode=mode) + + +def test_general_no_sites(ts_no_sites_fixture): + ts = ts_no_sites_fixture + ldg = ts.two_locus_count_stat([ts.samples()], GeneralStatFuncs.D, 1) + np.testing.assert_array_equal(ldg, np.zeros((1, 0, 0), np.float64)) + + +@pytest.mark.parametrize("mode", ["site", "branch"]) +def test_general_output_dimensions(mode, ts_multiallelic_fixture): + ts = ts_multiallelic_fixture + norm_f = GeneralStatNormFuncs.choose("D", mode, ts) + samples = ts.samples() + expected_dims = dict( + site=(1, ts.num_sites, ts.num_sites), branch=(1, ts.num_trees, ts.num_trees) + )[mode] + result = ts.two_locus_count_stat( + samples, GeneralStatFuncs.D, 1, mode=mode, norm_f=norm_f + ) + assert result.shape == expected_dims + # we expect that dims are the same with `samples` or `[samples]` + result = ts.two_locus_count_stat( + [samples], GeneralStatFuncs.D, 1, mode=mode, norm_f=norm_f + ) + assert result.shape == expected_dims + + expected_dims = dict( + site=(2, ts.num_sites, ts.num_sites), branch=(2, ts.num_trees, ts.num_trees) + )[mode] + result = ts.two_locus_count_stat( + [samples, samples], GeneralStatFuncs.D, 2, mode=mode, norm_f=norm_f + ) + assert result.shape == expected_dims + + +@pytest.mark.parametrize("mode", ["site", "branch"]) +@pytest.mark.parametrize("stat", SUMMARY_FUNCS.keys()) +def test_general_one_way_multi_sample_set(mode, stat, ts_10_samp_with_sites_fixture): + ts = ts_10_samp_with_sites_fixture + norm_f = GeneralStatNormFuncs.choose(stat, mode, ts) + sample_sets = [ts.samples()[0:5], ts.samples()[5:10]] + ldg = ts.two_locus_count_stat( + sample_sets, + getattr(GeneralStatFuncs, stat), + 2, + norm_f=norm_f, + mode=mode, + ) + ld = ts.ld_matrix(sample_sets=sample_sets, stat=stat, mode=mode) + np.testing.assert_array_almost_equal(ldg, ld) + + +@pytest.mark.parametrize("mode", ["site", "branch"]) +@pytest.mark.parametrize("stat", ["r2_ij", "D2_ij", "D2_ij_unbiased"]) +def test_general_two_way(mode, stat, ts_10_samp_with_sites_fixture): + ts = ts_10_samp_with_sites_fixture + general_func = getattr(GeneralStatFuncs, stat) + norm_f = GeneralStatNormFuncs.choose(stat, mode, ts) + sample_sets = [ts.samples()[0:5], ts.samples()[5:10]] + ldg = ts.two_locus_count_stat(sample_sets, general_func, 1, norm_f=norm_f, mode=mode) + ld = ts.ld_matrix( + sample_sets=sample_sets, + stat=stat.replace("_ij", ""), + indexes=[(0, 1)], + mode=mode, + ) + np.testing.assert_array_almost_equal(ldg, ld) + + +# NB: multiallelic testing only needed for sites. branches are biallelic. + + +@pytest.mark.parametrize("stat", SUMMARY_FUNCS.keys()) +def test_general_one_way_multiallelic(stat, ts_multiallelic_fixture): + ts = ts_multiallelic_fixture + general_func = getattr(GeneralStatFuncs, stat) + norm_f = GeneralStatNormFuncs.choose(stat, "site", ts) + polarised = POLARIZATION[SUMMARY_FUNCS[stat]] + ldg = ts.two_locus_count_stat( + [ts.samples()], general_func, 1, norm_f=norm_f, polarised=polarised + ) + ld = ts.ld_matrix(stat=stat) + # ld_matrix drops dims, expand for comparison + np.testing.assert_array_almost_equal(ldg, np.expand_dims(ld, 0)) + + +@pytest.mark.parametrize("stat", SUMMARY_FUNCS.keys()) +def test_general_one_way_multiallelic_multi_sample_set(stat, ts_multiallelic_fixture): + ts = ts_multiallelic_fixture + general_func = getattr(GeneralStatFuncs, stat) + norm_f = GeneralStatNormFuncs.choose(stat, "site", ts) + polarised = POLARIZATION[SUMMARY_FUNCS[stat]] + sample_sets = [ts.samples(), ts.samples()] + ldg = ts.two_locus_count_stat( + sample_sets, general_func, 2, norm_f=norm_f, polarised=polarised + ) + ld = ts.ld_matrix(stat=stat, sample_sets=sample_sets) + np.testing.assert_array_almost_equal(ldg, ld) + + +@pytest.mark.parametrize("stat", ["r2_ij", "D2_ij", "D2_ij_unbiased"]) +def test_general_two_way_multiallelic(stat, ts_multiallelic_fixture): + ts = ts_multiallelic_fixture + general_func = getattr(GeneralStatFuncs, stat) + norm_f = GeneralStatNormFuncs.choose(stat, "site", ts) + sample_sets = [ts.samples(), ts.samples()] + ldg = ts.two_locus_count_stat(sample_sets, general_func, 1, norm_f=norm_f) + ld = ts.ld_matrix( + stat=stat.replace("_ij", ""), indexes=(0, 1), sample_sets=sample_sets + ) + # ld_matrix drops dims, expand for comparison + np.testing.assert_array_almost_equal(ldg, np.expand_dims(ld, 0)) + + +@pytest.mark.parametrize("mode", ["site", "branch"]) +def test_general_multi_outputs(mode): + ts = msprime.sim_mutations( + msprime.sim_ancestry( + 4, recombination_rate=0.1, sequence_length=35, random_seed=123 + ), + rate=0.1, + random_seed=123, + ) + assert ts.num_samples == 8, "8 samples are required" + assert max({len(s.mutations) for s in ts.sites()}) > 2, ( + "At least one multiallelic site required" + ) + A = ts.samples()[0:4] + B = ts.samples()[4:] + + norm_f = GeneralStatNormFuncs.choose("D2_unbiased", mode, ts) + general_func = GeneralStatFuncs.D2_ii_ij_jj_unbiased + ldg = ts.two_locus_count_stat([A, B], general_func, 3, mode=mode, norm_f=norm_f) + ld = ts.ld_matrix( + [A, B], stat="D2_unbiased", indexes=[(0, 0), (0, 1), (1, 1)], mode=mode + ) + np.testing.assert_array_almost_equal(ldg, ld) diff --git a/python/tests/test_python_c.py b/python/tests/test_python_c.py index 15f9967f3f..09370a9a78 100644 --- a/python/tests/test_python_c.py +++ b/python/tests/test_python_c.py @@ -138,6 +138,23 @@ def get_example_migration_tree_sequence(self): ) return ts.ll_tree_sequence + def get_example_tree_sequence_multiallelic(self, sample_size=10): + ts = msprime.sim_mutations( + msprime.sim_ancestry( + sample_size, + recombination_rate=0.1, + sequence_length=100, + ploidy=1, + random_seed=123, + ), + rate=0.1, + random_seed=123, + ) + assert max({len(s.mutations) for s in ts.sites()}) > 2, ( + "At least one multiallelic site required" + ) + return ts.ll_tree_sequence + def verify_iterator(self, iterator): """ Checks that the specified non-empty iterator implements the @@ -1987,6 +2004,246 @@ def test_ld_matrix_multipop(self, stat_method_name): with pytest.raises(_tskit.LibraryError, match="TSK_ERR_UNSUPPORTED_STAT_MODE"): stat_method(ss_sizes, ss, indexes, col_sites, row_sites, None, None, "node") + def test_two_locus_count_stat(self): + """Test two_locus_count_stat on biallelic data (no norm function)""" + ts = self.get_example_tree_sequence(10) + ss = ts.get_samples() # sample sets + ss_sizes = np.array([len(ss)], dtype=np.uint32) + row_sites = np.arange(ts.get_num_sites(), dtype=np.int32) + col_sites = row_sites + row_pos = ts.get_breakpoints()[:-1] + col_pos = row_pos + row_sites_list = list(range(ts.get_num_sites())) + col_sites_list = row_sites_list + row_pos_list = list(map(float, ts.get_breakpoints()[:-1])) + col_pos_list = row_pos_list + + def stat_func(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + return pAB - (pA * pB) + + def norm_func(*_): + raise Exception # norm function will not be used + + method = ts.two_locus_count_stat + site_args = row_sites, col_sites, None, None, "site" + branch_args = None, None, row_pos, col_pos, "branch" + # happy path + a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *site_args) + assert a.shape == (ts.get_num_sites(), ts.get_num_sites(), 1) + a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *branch_args) + assert a.shape == (ts.get_num_trees(), ts.get_num_trees(), 1) + # happy path - sample sets as lists are also valid + site_list_args = row_sites_list, col_sites_list, None, None, "site" + branch_list_args = None, None, row_pos_list, col_pos_list, "branch" + a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *site_list_args) + assert a.shape == (ts.get_num_sites(), ts.get_num_sites(), 1) + a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *branch_list_args) + assert a.shape == (ts.get_num_trees(), ts.get_num_trees(), 1) + # happy path - default values for site and position lists + a = method( + ss_sizes, ss, stat_func, norm_func, 1, True, None, None, None, None, "site" + ) + assert a.shape == (ts.get_num_sites(), ts.get_num_sites(), 1) + a = method( + ss_sizes, ss, stat_func, norm_func, 1, True, None, None, None, None, "branch" + ) + assert a.shape == (ts.get_num_trees(), ts.get_num_trees(), 1) + # CPython API errors + with pytest.raises(ValueError, match="Sum of sample_set_sizes"): + bad_ss = np.array([], dtype=np.int32) + method(ss_sizes, bad_ss, stat_func, norm_func, 1, True, *site_args) + with pytest.raises(TypeError, match="cast array data"): + bad_ss = np.array(ts.get_samples(), dtype=np.uint32) + method(ss_sizes, bad_ss, stat_func, norm_func, 1, True, *site_args) + with pytest.raises(ValueError, match="Unrecognised stats mode"): + bad_args = row_sites, col_sites, None, None, "bla" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_args) + with pytest.raises(TypeError, match="at most"): + method(ss_sizes, ss, stat_func, norm_func, 1, True, *site_args, "extraarg") + with pytest.raises(ValueError, match="invalid literal"): + bad_sites = ["abadsite", 0, 3, 2] + bad_site_args = bad_sites, col_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(TypeError): + bad_sites = [None, 0, 3, 2] + bad_site_args = bad_sites, col_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(TypeError): + bad_sites = [{}, 0, 3, 2] + bad_site_args = bad_sites, col_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(TypeError, match="Cannot cast array data"): + bad_sites = np.array([0, 1, 2], dtype=np.uint32) + bad_site_args = bad_sites, col_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(ValueError, match="invalid literal"): + bad_sites = ["abadsite", 0, 3, 2] + bad_site_args = row_sites, bad_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(TypeError): + bad_sites = [None, 0, 3, 2] + bad_site_args = row_sites, bad_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(TypeError): + bad_sites = [{}, 0, 3, 2] + bad_site_args = row_sites, bad_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(TypeError, match="Cannot cast array data"): + bad_sites = np.array([0, 1, 2], dtype=np.uint32) + bad_site_args = row_sites, bad_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(ValueError): + bad_pos = ["abadpos", 0.1, 0.2, 2.0] + bad_branch_args = None, None, bad_pos, col_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(TypeError): + bad_pos = [{}, 0.1, 0.2, 2.0] + bad_branch_args = None, None, bad_pos, col_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(ValueError): + bad_pos = ["abadpos", 0, 3, 2] + bad_branch_args = None, None, row_pos, bad_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(TypeError): + bad_pos = [{}, 0, 3, 2] + bad_branch_args = None, None, row_pos, bad_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(ValueError, match="Cannot specify positions in site mode"): + bad_site_args = None, None, row_pos, col_pos, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(ValueError, match="Cannot specify sites in branch mode"): + bad_branch_args = row_sites, col_sites, None, None, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(TypeError, match="summary_func must be callable"): + method(ss_sizes, ss, "uncallable", norm_func, 1, True, *site_args) + with pytest.raises(TypeError, match="norm_func must be callable"): + method(ss_sizes, ss, stat_func, "uncallable", 1, True, *site_args) + with pytest.raises(ValueError, match="summary function.*must be 1D"): + method(ss_sizes, ss, lambda *_: 1, norm_func, 1, True, *site_args) + with pytest.raises(ValueError, match="summary function.*length 2; must be 1"): + method(ss_sizes, ss, lambda *_: [1, 2], norm_func, 1, True, *site_args) + with pytest.raises(ValueError, match="could not convert string to float"): + method(ss_sizes, ss, lambda *_: ["nonfloat"], norm_func, 1, True, *site_args) + with pytest.raises(ValueError, match="assignment destination is read-only"): + + def bad_stat_func(X, n): + X[0] = [1] + return [1] + + method(ss_sizes, ss, bad_stat_func, norm_func, 1, True, *site_args) + # Exceptions within stat_func are correctly raised. + for exception in [ValueError, TypeError]: + + def stat_func_except(*_): + raise exception("test") # noqa: B023 + + with pytest.raises(exception, match="test"): + method(ss_sizes, ss, stat_func_except, norm_func, 1, True, *site_args) + # C API errors + with pytest.raises(tskit.LibraryError, match="TSK_ERR_BAD_RESULT_DIMS"): + method(ss_sizes, ss, stat_func, norm_func, 0, True, *site_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_UNSORTED_SITES"): + bad_sites = np.array([1, 0, 2], dtype=np.int32) + bad_site_args = bad_sites, col_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_UNSORTED_SITES"): + bad_sites = np.array([1, 0, 2], dtype=np.int32) + bad_site_args = row_sites, bad_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_DUPLICATE_SITES"): + bad_sites = np.array([1, 1, 2], dtype=np.int32) + bad_site_args = bad_sites, col_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_DUPLICATE_SITES"): + bad_sites = np.array([1, 1, 2], dtype=np.int32) + bad_site_args = row_sites, bad_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_SITE_OUT_OF_BOUNDS"): + bad_sites = np.array([-1, 0, 2], dtype=np.int32) + bad_site_args = bad_sites, col_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_SITE_OUT_OF_BOUNDS"): + bad_sites = np.array([-1, 0, 2], dtype=np.int32) + bad_site_args = row_sites, bad_sites, None, None, "site" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_site_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_UNSORTED_POSITIONS"): + bad_pos = np.array([0.7, 0, 0.8], dtype=np.float64) + bad_branch_args = None, None, bad_pos, col_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_UNSORTED_POSITIONS"): + bad_pos = np.array([0.7, 0, 0.8], dtype=np.float64) + bad_branch_args = None, None, row_pos, bad_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_DUPLICATE_POSITIONS"): + bad_pos = np.array([0.7, 0.7, 0.8], dtype=np.float64) + bad_branch_args = None, None, bad_pos, col_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_DUPLICATE_POSITIONS"): + bad_pos = np.array([0.7, 0.7, 0.8], dtype=np.float64) + bad_branch_args = None, None, row_pos, bad_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_POSITION_OUT_OF_BOUNDS"): + bad_pos = np.array([-0.1, 0.7, 0.8], dtype=np.float64) + bad_branch_args = None, None, bad_pos, col_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_POSITION_OUT_OF_BOUNDS"): + bad_pos = np.array([-0.1, 0.7, 0.8], dtype=np.float64) + bad_branch_args = None, None, row_pos, bad_pos, "branch" + method(ss_sizes, ss, stat_func, norm_func, 1, True, *bad_branch_args) + + def test_two_locus_count_stat_multialleliic(self): + """ + Test two_locus_count_stat on multiallelic sites to test the behavior of + the norm function. + """ + ts = self.get_example_tree_sequence_multiallelic() + + def stat_func(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + return pAB - (pA * pB) + + def norm_func(X, n, nA, nB): + return X[0].sum(keepdims=True) / n.sum() + + ss = ts.get_samples() # sample sets + ss_sizes = np.array([len(ss)], dtype=np.uint32) + row_sites = np.arange(ts.get_num_sites(), dtype=np.int32) + col_sites = row_sites + method = ts.two_locus_count_stat + site_args = row_sites, col_sites, None, None, "site" + + # happy path + a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *site_args) + assert a.shape == (ts.get_num_sites(), ts.get_num_sites(), 1) + # CPython API errors + with pytest.raises(ValueError, match="norm function.*must be 1D"): + method(ss_sizes, ss, stat_func, lambda *_: 1, 1, True, *site_args) + with pytest.raises( + TypeError, match="takes 1 positional argument but 2 were given" + ): + method(ss_sizes, ss, lambda _: 1, norm_func, 1, True, *site_args) + with pytest.raises(ValueError, match="norm function.*length 2; must be 1"): + method(ss_sizes, ss, stat_func, lambda *_: [1, 2], 1, True, *site_args) + with pytest.raises( + TypeError, match="takes 1 positional argument but 4 were given" + ): + method(ss_sizes, ss, stat_func, lambda _: [1, 2], 1, True, *site_args) + with pytest.raises(ValueError, match="could not convert string to float"): + method(ss_sizes, ss, stat_func, lambda *_: ["nonfloat"], 1, True, *site_args) + # Exceptions within stat_func are correctly raised. + for exception in [ValueError, TypeError]: + + def norm_func_except(*_): + raise exception("test") # noqa: B023 + + with pytest.raises(exception, match="test"): + method(ss_sizes, ss, stat_func, norm_func_except, 1, True, *site_args) + def test_kc_distance_errors(self): ts1 = self.get_example_tree_sequence(10) with pytest.raises(TypeError): diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 0037e06391..48ff72d0d7 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -2463,6 +2463,28 @@ def get_back_mutation_examples(): yield insert_branch_mutations(ts) +@functools.lru_cache +def get_sim_example( + sample_size, sequence_length, recombination_rate, mutation_rate, seed +): + recomb_map = msprime.RecombinationMap.uniform_map( + sequence_length, recombination_rate + ) + ts = msprime.simulate( + recombination_map=recomb_map, + mutation_rate=mutation_rate, + random_seed=seed, + population_configurations=[ + msprime.PopulationConfiguration(sample_size), + msprime.PopulationConfiguration(0), + ], + migration_matrix=[[0, 1], [1, 0]], + ) + ts = insert_random_ploidy_individuals(ts, 4, seed=seed) + ts = add_random_metadata(ts, seed=seed) + return ts + + def make_example_tree_sequences(custom_max=None): yield from get_decapitated_examples(custom_max=custom_max) yield from get_gap_examples(custom_max=custom_max) @@ -2475,22 +2497,14 @@ def make_example_tree_sequences(custom_max=None): for n in n_list: for m in [1, 2, 32]: for rho in [0, 0.1, 0.5]: - recomb_map = msprime.RecombinationMap.uniform_map(m, rho, num_loci=m) - ts = msprime.simulate( - recombination_map=recomb_map, + ts = get_sim_example( + sample_size=n, + sequence_length=m, + recombination_rate=rho, mutation_rate=0.1, - random_seed=seed, - population_configurations=[ - msprime.PopulationConfiguration(n), - msprime.PopulationConfiguration(0), - ], - migration_matrix=[[0, 1], [1, 0]], - ) - ts = insert_random_ploidy_individuals(ts, 4, seed=seed) - yield ( - f"n={n}_m={m}_rho={rho}", - add_random_metadata(ts, seed=seed), + seed=seed, ) + yield (f"n={n}_m={m}_rho={rho}", ts) seed += 1 for name, ts in get_bottleneck_examples(custom_max=custom_max): yield ( diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 45d2da59e0..d5359b4f1a 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -8249,19 +8249,7 @@ def parse_positions(self, positions): ) return row_positions, col_positions - def __two_locus_sample_set_stat( - self, - ll_method, - sample_sets, - sites=None, - positions=None, - mode=None, - ): - if sample_sets is None: - sample_sets = self.samples() - row_sites, col_sites = self.parse_sites(sites) - row_positions, col_positions = self.parse_positions(positions) - + def __convert_sample_sets(self, sample_sets): # First try to convert to a 1D numpy array. If we succeed, then we strip off # the corresponding dimension from the output. drop_dimension = False @@ -8283,7 +8271,23 @@ def __two_locus_sample_set_stat( raise ValueError("Sample sets must contain at least one element") flattened = util.safe_np_int_cast(np.hstack(sample_sets), np.int32) + return drop_dimension, flattened, sample_set_sizes + def __two_locus_sample_set_stat( + self, + ll_method, + sample_sets, + sites=None, + positions=None, + mode=None, + ): + if sample_sets is None: + sample_sets = self.samples() + row_sites, col_sites = self.parse_sites(sites) + row_positions, col_positions = self.parse_positions(positions) + drop_dimension, flattened, sample_set_sizes = self.__convert_sample_sets( + sample_sets + ) result = ll_method( sample_set_sizes, flattened, @@ -10927,15 +10931,238 @@ def impute_unknown_mutations_time( mutations_time[unknown] = self.nodes_time[self.mutations_node[unknown]] return mutations_time - def ld_matrix( + def two_locus_count_stat( self, - sample_sets=None, + sample_sets, + f, + result_dim, + norm_f=None, + polarised=False, sites=None, positions=None, mode="site", + ): + """ + Compute two-locus statistics with a user-defined python function that + operates on haplotype counts. Statistics can be computed in ``site`` + mode (see :ref:`sec_stats_two_locus_site`) or ``branch`` mode (see + :ref:`sec_stats_two_locus_branch`). On each pair of sites or trees, the + summary function is called with haplotype counts for all provided sample + sets. The summary function (``f``) must accept two parameters: ``X``, a + matrix with shape (3, k) and ``n``, a vector with shape (k,), where k is + the number of sample sets provided. ``X`` is a read-only matrix whose + rows contain haplotype counts (AB, Ab, aB) per sample set and ``n`` is a + read-only vector of sample set sizes. ``f`` and ``norm_f`` must return a + list of results with length ``result_dim``. + + What follows is an example of computing ``D`` from a tree sequence. The + result will be equivalent to using ``ts.ld_matrix(stat="D")`` (see + :ref:`sec_stats_two_locus` for usage of the built-in LD matrix + calculation, and :ref:`sec_stats_two_locus_summary_functions` for + available statistics). In the example summary function, we convert + counts to proportions, then compute ``D``, returning a numpy array with + length equal to the number of sample sets. + + .. code-block:: python + + def D(X, n): + pAB, pAb, paB = X / n + pA = pAb + pAB + pB = paB + pAB + return pAB - (pA * pB) + + The summary function is called for each pair of sites or trees, + producing results that must be combined when multiallelic sites are + present (``site`` mode only), so summary function results must need to + be normalised in order to be aggragated for all pairs of alleles between + both sites. Branch statistics and biallelic sites do not require any + normalisation, and ``norm_f`` is only called if one of the two sites + under consideration is multiallelic. See + :ref:`sec_stats_two_locus_computational_details` for further information + about normalisation. ``norm_f`` is a normalisation function that must + accept four parameters: ``X`` and ``n`` are the same inputs that ``f`` + accepts, along with ``nA`` and ``nB``, which hold the count of ``A`` + alleles and ``B`` alleles. For example, if ``A`` is biallelic and ``B`` + is triallelic, ``nA=2`` and ``nB=3``. ``f`` must return a list of + results with length ``result_dim``. The default normalisation function + is identical to ``total_norm`` shown in the example below. ``hap_norm`` + is required for normalising :math:`r^2`. Both of these examples return a + numpy array with length equal to the number of sample sets. + + .. code-block:: python + + def total_norm(X, n, nA, nB): + [1 / (nA * nB)] * result_dim + + def hap_norm(X, n, nA, nB): + X[0] / n + + A simple call (without specifying normalisation) would look like this + + .. code-block:: python + + ts.two_locus_count_stat([ts.samples()], D, 1, polarised=True) + + :param list sample_sets: A list of lists of Node IDs, specifying the + groups of nodes to compute the statistic with. + :param f: A function that takes two arguments - a two-dimensional array + with shape (3, k) and a one-dimensional array with shape (k, ) where + k is the number of sample sets. + :param int result_dim: The length of ``f`` and ``norm_f``'s return value. + :param norm_f: A function that takes four arguments - the first two are + the same as ``f``, the second two are scalars representing the + number of A and B alleles, respectively. If ``None``, then defaults + to the "total" normalization described above. + :param bool polarised: Whether to leave the ancestral state out of + computations: see :ref:`sec_stats` for more details. + :param list sites: A list of lists of sites over which to compute an + LD matrix. Can be specified as a list of lists to control the row + and column sites. Only available in "site" mode. Specify as + ``[row_sites, col_sites]`` or ``[all_sites]``. + Defaults to all sites. More information can be found in the + docstring of :meth:`.ld_matrix` + :param list positions: A list of lists of genomic positions where + expected LD is computed based on tree topologies and branch + lengths. Only applicable in "branch" mode. Specify as a list of + two lists to control the row and column positions, as + ``[row_positions, col_positions]``, or ``[all_positions]``. More + information can be found in the docstring of :meth:`.ld_matrix` + Defaults to the leftmost coordinates of all trees and computes + LD between all pairs of trees. + :param str mode: A string giving the "type" of the statistic to be + computed (defaults to "site"). + :return: A ndarray with shape equal to shape=(k, m, m) where + k=result_dim, m=(num_sites or num_trees, restricted by ``sites`` or + ``positions``). + """ + row_sites, col_sites = self.parse_sites(sites) + row_positions, col_positions = self.parse_positions(positions) + _, sample_sets, sample_set_sizes = self.__convert_sample_sets(sample_sets) + if norm_f is None: + # produce the same number of dims as result dimensions with [val] * dim + def norm_f(X, n, nA, nB): + return [1 / (nA * nB)] * result_dim + + result = self._ll_tree_sequence.two_locus_count_stat( + sample_set_sizes, + sample_sets, + f, + norm_f, + result_dim, + polarised, + row_sites, + col_sites, + row_positions, + col_positions, + mode, + ) + # Orient the data so that the first dimension is the result_dim so that + # we get one LD matrix per result dimension + return np.moveaxis(result, -1, 0) + + def ld_matrix( + self, + sample_sets=None, + mode="site", stat="r2", + sites=None, + positions=None, indexes=None, ): + r""" + + Returns a matrix of the specified two-locus statistic (default + :math:`r^2`) computed from sample allelic states or branch lengths. + The resulting linkage disequilibrium (LD) matrix represents either the + two-locus statistic as computed between all pairs of specified + ``sites`` (``"site"`` mode, producing a + ``len(sites)``-by-``len(sites)`` sized matrix), or as computed from the + branch structures at marginal trees between pairs of trees at all + specified ``positions`` (``"branch"`` mode, producing a + ``len(positions)``-by-``len(positions)`` sized matrix). + + The sites considered for ``"site"`` mode defaults to all sites (which may + result in a very large matrix!), but can be restricted using + the ``sites`` argument. Sites must be passed as a list of lists, + specifying the ``[row_sites, col_sites]``, resulting in a + rectangular matrix, or by specifying a single list of ``[sites]``, in + which a square matrix will be produced (see + :ref:`sec_stats_two_locus_site` for examples). Here, ``sites``, + ``row_sites``, and ``col_sites`` are each lists of site indexes. + + Similarly, in the ``"branch"`` mode, the ``positions`` argument specifies + genomic coordinates at which the expectation for the two-locus statistic + is computed, given the local tree structure. + (See :ref:`sec_stats_two_locus_branch` for explanation of in what sense + this is an expectation.) This defaults to computing + the LD for each pair of distinct trees (this is equivalent to passing in + the leftmost coordinates of each tree's span, since intervals are closed on + the left and open on the right). Similar to the site mode, a nested list + of row and column positions can be specified separately (resulting in a + rectangular matrix) or a single list of a specified positions results + in a square matrix (see :ref:`sec_stats_two_locus_branch` for + examples). Like ``sites``, the ``positions`` must be specified as a list + of lists. + + Some LD statistics are defined for both within a single set of samples + and for two sample sets. If the ``indexes`` argument is specified, then + ``indexes`` specifies the indexes of the sample sets in the + ``sample_sets`` list between which to compute LD. For instance, this + results in a 3D array whose ``[k,:,:]``-th slice contains LD values + between ``sample_sets[i]`` and ``sample_sets[j]``, where ``(i, j)`` is + the ``k``-th element of ``indexes``. + + For more on how the ``indexes`` and ``sample_sets`` interact with the + output dimensions, see the :ref:`sec_stats_two_locus_sample_sets` + section. Statistics are defined in the + :ref:`sec_stats_two_locus_summary_functions_two_way` section. + + **Available Stats** (use ``Stat Name`` in the ``stat`` keyword + argument). Statistics marked as "multi sample set" allow + (but do not require) computation from two sample sets + via the ``indexes`` argument. + + ======================= ========== ================ ============== + Stat Polarised Multi Sample Set Stat Name + ======================= ========== ================ ============== + :math:`r^2` n y "r2" + :math:`r` y n "r" + :math:`D^2` n y "D2" + :math:`D` y n "D" + :math:`D'` y n "D_prime" + :math:`D_z` n n "Dz" + :math:`\pi_2` n n "pi2" + :math:`\widehat{D^2}` n y "D2_unbiased" + :math:`\widehat{D_z}` n n "Dz_unbiased" + :math:`\widehat{\pi_2}` n n "pi2_unbiased" + ======================= ========== ================ ============== + + :param list sample_sets: A list, or a list of lists of sample node IDs, + specifying the groups of nodes to compute the statistic with. Defaults + to all samples. + :param str mode: A string giving the "type" of the statistic to be + computed. Defaults to "site", can be "site" or "branch". + :param str stat: A string giving the selected two-locus statistic to + compute. Defaults to "r2". + :param list sites: A list of lists of sites over which to compute an + LD matrix. Can be specified as a list of lists to control the row + and column sites. Only available in "site" mode. Specify as + ``[row_sites, col_sites]`` or ``[all_sites]``. + Defaults to all sites. + :param list positions: A list of lists of genomic positions where + expected LD is computed based on tree topologies and branch + lengths. Only applicable in "branch" mode. Specify as a list of + two lists to control the row and column positions, as + ``[row_positions, col_positions]``, or ``[all_positions]``. + Defaults to the leftmost coordinates of all trees and computes + LD between all pairs of trees. + :param list indexes: A list of 2-tuples or a single 2-tuple, specifying + the indexes of two sample sets over which to compute a two-way LD + statistic. Only :math:`r^2`, :math:`D^2`, and :math:`\widehat{D^2}` + are implemented for two-way statistics. + :return: A 2D or 3D array of LD matrices. + :rtype: numpy.ndarray + """ one_way_stats = { "D": self._ll_tree_sequence.D_matrix, "D2": self._ll_tree_sequence.D2_matrix,