diff --git a/pheweb/serve/components/colocalization/README.md b/pheweb/serve/components/colocalization/README.md index f106e0e1..3ee79490 100644 --- a/pheweb/serve/components/colocalization/README.md +++ b/pheweb/serve/components/colocalization/README.md @@ -5,6 +5,8 @@ DuckDB database, enriching it with parsed and cleaned fields for analysis. Instructions for loading data to the previous version can be found [here](README.v1.md). + + ## Prerequisites Ensure the following tools are available: @@ -13,6 +15,7 @@ Ensure the following tools are available: - Bash shell - Compressed TSV input files - mysql client +- python 3 Assuming you're using Linux and using the mysql client: @@ -79,13 +82,94 @@ echo "CREDSET_DATA_PATH : $CREDSET_DATA_PATH" zcat $CREDSET_DATA_PATH | head -n 1 ``` +## Preprocess data for ingestion + +In R14, some changes were made into the colocalization data: +1. dataset column was separated into dataset, quant, tissue in both the colocQC and credset data +2. hit1_info and hit2_info was separated into hit1_beta, hit1_p, hit2_beta, hit2_p +3. new eQTL Catalog data had non-numerical credible set ids. +This means that the dataset+trait+region+cs columns are no longer a unique identifier for a colocalization resource. +To make the data work with the schema used in pheweb, a new dataset_key column was created by joining dataset, tissue, quant columns, which is used as the "dataset" column in the database. Dataset, tissue and quant columns are then used as the dataset_label, dataset_sample, dataset_methods columns. + +Here are preprocessing scripts for processing the data into ingestable form: +process.py +```python +import gzip +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("input") +parser.add_argument("output") +args = parser.parse_args() + +with gzip.open(args.input,"rt",encoding="utf-8") as f, gzip.open(args.output,"wt",encoding="utf-8") as of: + input_header = f.readline().strip().split("\t") + input_hdi = {a:i for i,a in enumerate(input_header)} + output_header = ["dataset1","dataset2","tissue1","tissue2","quant1","quant2","trait1","trait2","region1","region2","cs1","cs2","nsnps","hit1","hit2","PP.H0.abf","PP.H1.abf","PP.H2.abf","PP.H3.abf","PP.H4.abf","low_purity1","low_purity2","nsnps1","nsnps2","cs1_log10bf","cs2_log10bf","clpp","clpa","cs1_size","cs2_size","cs_overlap","topInOverlap","probmass_1","probmass_2","hit1_beta","hit1_p","hit2_beta","hit2_p","colocRes"] + of.write("\t".join(output_header+["dataset_key1","dataset_key2"])+"\n") + cs_count = 0 + for i,line in enumerate(f): + cols = line.strip().split("\t") + #fix cs + if cols[input_hdi["cs2"]].startswith(f"{cols[input_hdi['trait2']]}_L"): + cols[input_hdi["cs2"]] = cols[input_hdi["cs2"]].replace(f"{cols[input_hdi['trait2']]}_L","") + cs_count +=1 + #write outputs + key1 = f"{cols[input_hdi['dataset1']]}--{cols[input_hdi['tissue1']]}--{cols[input_hdi['quant1']]}" + key2 = f"{cols[input_hdi['dataset2']]}--{cols[input_hdi['tissue2']]}--{cols[input_hdi['quant2']]}" + out_line = [cols[input_hdi[a]] for a in output_header] + of.write("\t".join(out_line+[key1,key2])+"\n") + if i%100000 == 0: + print(f"{i} lines processed, {cs_count} cs fixed") +``` + +process_cs.py +```python +import gzip +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("input") +parser.add_argument("output") +args = parser.parse_args() + +with gzip.open(args.input,"rt",encoding="utf-8") as f, gzip.open(args.output,"wt",encoding="utf-8") as of: + input_header = f.readline().strip().split("\t") + input_hdi = {a:i for i,a in enumerate(input_header)} + output_header = ["trait","region","rsid","cs","low_purity","p","beta","se","mlogp","cs_specific_prob","dataset","tissue","quant"] + of.write("\t".join(output_header+["dataset_key"])+"\n") + cs_count = 0 + for i,line in enumerate(f): + cols = line.strip().split("\t") + # check credset + if cols[input_hdi["cs"]].startswith(f"{cols[input_hdi['trait']]}_L"): + cols[input_hdi["cs"]] = cols[input_hdi["cs"]].replace(f"{cols[input_hdi['trait']]}_L","") + cs_count +=1 + #write outputs + key = f"{cols[input_hdi['dataset']]}--{cols[input_hdi['tissue']]}--{cols[input_hdi['quant']]}" + out_line = [cols[input_hdi[a]] for a in output_header] + of.write("\t".join(out_line+[key])+"\n") + if i%100000 == 0: + print(f"{i} lines processed, {cs_count} cs fixed") +``` + +parse datasets: +```sh +export PROCESSED_COLOC=${WORK_DIRECTORY}/processed_coloc_${TABLE_VERSION}.tsv.gz +export PROCESSED_CS=${WORK_DIRECTORY}/processed_cs_${TABLE_VERSION}.tsv.gz + +python3 process.py ${COLOC_DATA_PATH} ${PROCESSED_COLOC} +python3 process_cs.py ${COLOC_DATA_PATH} ${PROCESSED_COLOC} + +``` + ## Cloud settings If you are working in a GCP environment set these variables to reflect your environment. ```bash -export GS_PATH="gs://r13-data-green/pheweb/coloc_susie" +export GS_PATH="gs://r14-data-green/pheweb/colocalization" export INSTANCE_NAME="production-releases-pheweb-database" ``` @@ -95,7 +179,7 @@ Define the environment variables: ```bash export DB_PATH="${WORK_DIRECTORY}/colocalization.db" -export DUCKDB_CMD="env COLOC_DATA_PATH=${COLOC_DATA_PATH} CREDSET_DATA_PATH=${CREDSET_DATA_PATH} duckdb $DB_PATH" +export DUCKDB_CMD="env COLOC_DATA_PATH=${PROCESSED_COLOC} CREDSET_DATA_PATH=${PROCESSED_CS} duckdb $DB_PATH" export CONNECTION_STRING="user=${MYSQL_USER} port=${MYSQL_PORT} database=${MYSQL_DATABASE} password=${MYSQL_PASSWORD} host=${MYSQL_HOST}" echo "CONNECTION_STRING : $CONNECTION_STRING" @@ -128,239 +212,165 @@ mysql --defaults-group-suffix=${GROUP_SUFFIX} <