Using GlobalRefgetStore¶
GlobalRefgetStore allows a user to store sequence collections on disk and retrieve sequences and substrings of sequences. It also allows a user to retrieve sequences from specific regions given an input BED file.
In [ ]:
Copied!
%pip install refget
import os
import tempfile
%pip install refget
import os
import tempfile
Requirement already satisfied: gtars in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (0.3.0) Requirement already satisfied: pytest>=8.3.4 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from gtars) (8.4.1) Requirement already satisfied: pytest-cov>=6.0.0 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from gtars) (6.2.1) Requirement already satisfied: maturin>=1.8.1 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from gtars) (1.9.3) Requirement already satisfied: tomli>=1.1.0 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from maturin>=1.8.1->gtars) (2.2.1) Requirement already satisfied: exceptiongroup>=1 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from pytest>=8.3.4->gtars) (1.3.0) Requirement already satisfied: iniconfig>=1 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from pytest>=8.3.4->gtars) (2.1.0) Requirement already satisfied: packaging>=20 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from pytest>=8.3.4->gtars) (25.0) Requirement already satisfied: pluggy<2,>=1.5 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from pytest>=8.3.4->gtars) (1.6.0) Requirement already satisfied: pygments>=2.7.2 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from pytest>=8.3.4->gtars) (2.19.2) Requirement already satisfied: typing-extensions>=4.6.0 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from exceptiongroup>=1->pytest>=8.3.4->gtars) (4.14.1) Requirement already satisfied: coverage>=7.5 in /home/drc/GITHUB/refgenie-docs/.venv/lib/python3.10/site-packages (from coverage[toml]>=7.5->pytest-cov>=6.0.0->gtars) (7.10.2)
In [ ]:
Copied!
from refget.refget_store import GlobalRefgetStore, StorageMode
from refget.digest_functions import fasta_to_seq_digests
from refget.refget_store import GlobalRefgetStore, StorageMode
from refget.digest_functions import fasta_to_seq_digests
In [26]:
Copied!
temp_dir_obj = tempfile.TemporaryDirectory()
temp_dir_path = temp_dir_obj.name
temp_dir_obj = tempfile.TemporaryDirectory()
temp_dir_path = temp_dir_obj.name
In [27]:
Copied!
# 1. Prepare a dummy FASTA file
fasta_content = (
">chr1\n"
"ATGCATGCATGCAGTCGTAGC\n"
">chr2\n"
"GGGGAAAA\n"
)
source_fasta_path = os.path.join(temp_dir_path, "source.fa")
with open(source_fasta_path, "w") as f:
f.write(fasta_content)
# 1. Prepare a dummy FASTA file
fasta_content = (
">chr1\n"
"ATGCATGCATGCAGTCGTAGC\n"
">chr2\n"
"GGGGAAAA\n"
)
source_fasta_path = os.path.join(temp_dir_path, "source.fa")
with open(source_fasta_path, "w") as f:
f.write(fasta_content)
In [ ]:
Copied!
# 2. Digest the FASTA to get collection info and digest
collection = fasta_to_seq_digests(source_fasta_path)
collection_digest = collection.digest
print(f"Source FASTA digested. Collection digest: {collection_digest}\n")
# 2. Digest the FASTA to get collection info and digest
collection = fasta_to_seq_digests(source_fasta_path)
collection_digest = collection.digest
print(f"Source FASTA digested. Collection digest: {collection_digest}\n")
Source FASTA digested. Collection digest: aep4zTAZpy0wOIfWEQ5o8oykl9M47YVJ Processing FASTA file: /tmp/tmposoumopu/source.fa lvl1 digest: aep4zTAZpy0wOIfWEQ5o8oykl9M47YVJ
In [29]:
Copied!
# 3. Initialize GlobalRefgetStore in Encoded mode
store = GlobalRefgetStore(StorageMode.Encoded)
print(f"Initialized store: {store}\n")
# 3. Initialize GlobalRefgetStore in Encoded mode
store = GlobalRefgetStore(StorageMode.Encoded)
print(f"Initialized store: {store}\n")
Initialized store: SeqColStore object: >Sequences (n=0): >Collections (n=1): 1. Collection Digest: "DEFAULT_REFGET_SEQUENCE_COLLECTI"
In [30]:
Copied!
# 4. Import FASTA into the store
store.import_fasta(source_fasta_path)
print("FASTA imported into the store.\n")
# 4. Import FASTA into the store
store.import_fasta(source_fasta_path)
print("FASTA imported into the store.\n")
FASTA imported into the store. Loading farg index... from path with cache: reading from file: "/tmp/tmposoumopu/source.fa" Farg file path: "/tmp/tmposoumopu/source.farg" Computing digests...: "/tmp/tmposoumopu/source.farg" Processing FASTA file: /tmp/tmposoumopu/source.fa lvl1 digest: aep4zTAZpy0wOIfWEQ5o8oykl9M47YVJ Writing farg file: "/tmp/tmposoumopu/source.farg" Farg file written to "/tmp/tmposoumopu/source.farg" Preparing to load sequences into refget SeqColStore... Digest result: SequenceMetadata { name: "chr1", length: 21, sha512t24u: "k4ZQ03Uo81xEjENxmvAkcLnTez9OFlQE", md5: "0de429429b73df6142414eb38293c84e", alphabet: Dna2bit } Storing encoded sequence. Name: chr1; Alphabet: dna2bit; Digest: k4ZQ03Uo81xEjENxmvAkcLnTez9OFlQE Digest result: SequenceMetadata { name: "chr2", length: 8, sha512t24u: "OyXJErGtjgcIVSdobGkHE3sBdQ5faDTf", md5: "362f911bd5c5ffb1ca50a3910774a764", alphabet: Dna2bit } Storing encoded sequence. Name: chr2; Alphabet: dna2bit; Digest: OyXJErGtjgcIVSdobGkHE3sBdQ5faDTf Finished loading sequences into refget SeqColStore.
In [31]:
Copied!
# 5. Get a sequence by its ID (using the digest from the first sequence in collection)
seq_digest_chr1 = collection[0].metadata.sha512t24u
record_chr1 = store.get_sequence_by_id(seq_digest_chr1)
if record_chr1:
print(f"Retrieved sequence by ID: {record_chr1.metadata.name}, length {record_chr1.metadata.length}")
# Note: record_chr1.data might be None if store mode is Encoded and data isn't decoded automatically
print(f" Sequence (full): {store.get_substring(seq_digest_chr1, 0, record_chr1.metadata.length)}\n")
# 5. Get a sequence by its ID (using the digest from the first sequence in collection)
seq_digest_chr1 = collection[0].metadata.sha512t24u
record_chr1 = store.get_sequence_by_id(seq_digest_chr1)
if record_chr1:
print(f"Retrieved sequence by ID: {record_chr1.metadata.name}, length {record_chr1.metadata.length}")
# Note: record_chr1.data might be None if store mode is Encoded and data isn't decoded automatically
print(f" Sequence (full): {store.get_substring(seq_digest_chr1, 0, record_chr1.metadata.length)}\n")
Retrieved sequence by ID: chr1, length 21 Sequence (full): ATGCATGCATGCAGTCGTAGC
In [34]:
Copied!
# 6. Get a substring
sub_seq = store.get_substring(seq_digest_chr1, 5, 15)
print(f"Substring from chr1[5:15]: {sub_seq}\n") # Expected: TGCATGCAGT
# 6. Get a substring
sub_seq = store.get_substring(seq_digest_chr1, 5, 15)
print(f"Substring from chr1[5:15]: {sub_seq}\n") # Expected: TGCATGCAGT
Substring from chr1[5:15]: TGCATGCAGT
In [35]:
Copied!
# 7. Prepare a BED file for region retrieval
bed_content = (
"chr1\t0\t10\n"
"chr2\t2\t6\n"
"chr_nonexistent\t0\t5\n" # This entry will be skipped
)
bed_path = os.path.join(temp_dir_path, "regions.bed")
with open(bed_path, "w") as f:
f.write(bed_content)
# 7. Prepare a BED file for region retrieval
bed_content = (
"chr1\t0\t10\n"
"chr2\t2\t6\n"
"chr_nonexistent\t0\t5\n" # This entry will be skipped
)
bed_path = os.path.join(temp_dir_path, "regions.bed")
with open(bed_path, "w") as f:
f.write(bed_content)
In [36]:
Copied!
# 8. Retrieve sequences from BED file to a list
retrieved_list = store.get_seqs_bed_file_to_vec(collection_digest, bed_path)
print("Retrieved sequences from BED file (as list):")
for rs in retrieved_list:
print(f" - {rs}")
print("\n")
# 8. Retrieve sequences from BED file to a list
retrieved_list = store.get_seqs_bed_file_to_vec(collection_digest, bed_path)
print("Retrieved sequences from BED file (as list):")
for rs in retrieved_list:
print(f" - {rs}")
print("\n")
Retrieved sequences from BED file (as list): - chr1|0-10: ATGCATGCAT - chr2|2-6: GGAA
Warning: Skipping line 4 because sequence 'chr_nonexistent' not found in collection 'aep4zTAZpy0wOIfWEQ5o8oykl9M47YVJ'.
In [37]:
Copied!
# 9. Retrieve sequences from BED file and write to new FASTA
output_fasta_path = os.path.join(temp_dir_path, "output_regions.fa")
store.get_seqs_bed_file(collection_digest, bed_path, output_fasta_path)
print(f"Retrieved sequences from BED file written to: {output_fasta_path}")
with open(output_fasta_path, "r") as f:
print("Content of output FASTA:\n" + f.read())
print("\n")
# 9. Retrieve sequences from BED file and write to new FASTA
output_fasta_path = os.path.join(temp_dir_path, "output_regions.fa")
store.get_seqs_bed_file(collection_digest, bed_path, output_fasta_path)
print(f"Retrieved sequences from BED file written to: {output_fasta_path}")
with open(output_fasta_path, "r") as f:
print("Content of output FASTA:\n" + f.read())
print("\n")
Retrieved sequences from BED file written to: /tmp/tmposoumopu/output_regions.fa Content of output FASTA: >chr1 21 dna2bit k4ZQ03Uo81xEjENxmvAkcLnTez9OFlQE 0de429429b73df6142414eb38293c84e ATGCATGCAT >chr2 8 dna2bit OyXJErGtjgcIVSdobGkHE3sBdQ5faDTf 362f911bd5c5ffb1ca50a3910774a764 GGAA
Warning: Skipping line 4 because sequence 'chr_nonexistent' not found in collection 'aep4zTAZpy0wOIfWEQ5o8oykl9M47YVJ'.
In [38]:
Copied!
# 10. Write store to a new directory
saved_store_path = os.path.join(temp_dir_path, "my_refget_store")
store.write_store_to_directory(saved_store_path, "{digest_prefix}/{digest}.gz") # Custom template
print(f"Store saved to: {saved_store_path}\n")
# 10. Write store to a new directory
saved_store_path = os.path.join(temp_dir_path, "my_refget_store")
store.write_store_to_directory(saved_store_path, "{digest_prefix}/{digest}.gz") # Custom template
print(f"Store saved to: {saved_store_path}\n")
Store saved to: /tmp/tmposoumopu/my_refget_store Writing store to directory: /tmp/tmposoumopu/my_refget_store; Using seqdata path template: {digest_prefix}/{digest}.gz Writing farg file: "/tmp/tmposoumopu/my_refget_store/collections/aep4zTAZpy0wOIfWEQ5o8oykl9M47YVJ.farg" Writing farg file: "/tmp/tmposoumopu/my_refget_store/sequences.farg"
In [39]:
Copied!
# 11. Load store from the directory
loaded_store = GlobalRefgetStore.load_from_directory(saved_store_path)
print(f"Store successfully loaded from: {saved_store_path}")
# 11. Load store from the directory
loaded_store = GlobalRefgetStore.load_from_directory(saved_store_path)
print(f"Store successfully loaded from: {saved_store_path}")
Store successfully loaded from: /tmp/tmposoumopu/my_refget_store
In [40]:
Copied!
# Clean up tempdir manually
temp_dir_obj.cleanup()
# Clean up tempdir manually
temp_dir_obj.cleanup()