Skip to content

Package refget documentation

create_refget_router

create_refget_router(sequences=False, collections=True, pangenomes=False)

Create a FastAPI router for the sequence collection API. This router provides endpoints for retrieving and comparing sequence collections. You can choose which endpoints to include by setting the sequences, collections, or pangenomes flags.

Parameters:

Name Type Description Default
sequences bool

Include sequence endpoints

False
collections bool

Include sequence collection endpoints

True
pangenomes bool

Include pangenome endpoints

False

Returns:

Type Description
APIRouter

A FastAPI router with the specified endpoints

Examples:

app.include_router(create_refget_router(sequences=False, pangenomes=False))
Source code in refget/refget_router.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def create_refget_router(
    sequences: bool = False, collections: bool = True, pangenomes: bool = False
):
    """
    Build a FastAPI router exposing the selected refget endpoint groups.

    Each flag switches one group of endpoints on or off. Enabled groups are
    included in a fixed order: sequences, then collections, then pangenomes.

    Args:
        sequences (bool): Include sequence endpoints
        collections (bool): Include sequence collection endpoints
        pangenomes (bool): Include pangenome endpoints

    Returns:
        (APIRouter): A FastAPI router with the specified endpoints

    Examples:
        ```
        app.include_router(create_refget_router(sequences=False, pangenomes=False))
        ```
    """

    # One row per endpoint group: (enabled?, log message, sub-router to mount).
    endpoint_groups = (
        (sequences, "Adding sequence endpoints...", seq_router),
        (collections, "Adding collection endpoints...", seqcol_router),
        (pangenomes, "Adding pangenome endpoints...", pangenome_router),
    )

    combined_router = APIRouter()
    for enabled, message, sub_router in endpoint_groups:
        if not enabled:
            continue
        _LOGGER.info(message)
        combined_router.include_router(sub_router)
    return combined_router

fasta_to_seqcol_dict

fasta_to_seqcol_dict(fasta_file_path, digest_function=sha512t24u_digest)

Convert a FASTA file into a Sequence Collection object.

Parameters:

Name Type Description Default
fasta_file_path str

Path to the FASTA file

required
digest_function DigestFunction

Digest function to use. Defaults to sha512t24u_digest.

sha512t24u_digest

Returns:

Type Description
dict

A canonical sequence collection object

Source code in refget/utilities.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def fasta_to_seqcol_dict(
    fasta_file_path: str,
    digest_function: DigestFunction = sha512t24u_digest,
) -> SeqColDict:
    """
    Convert a FASTA file into a Sequence Collection object.

    The returned dict has the keys ``lengths``, ``names`` and ``sequences``
    (all in FASTA record order), plus ``sorted_name_length_pairs`` and
    ``sorted_sequences`` (both sorted).

    Args:
        fasta_file_path (str): Path to the FASTA file
        digest_function (DigestFunction, optional): Digest function to use. Defaults to sha512t24u_digest.
            NOTE: this argument is currently not used by the function body; the
            sequence digests come from ``fasta_to_seq_digests`` and the
            name/length pairs are stored as canonical strings, not digests.

    Returns:
        (dict): A canonical sequence collection object
    """

    fasta_seq_digests = fasta_to_seq_digests(fasta_file_path)
    seqcol_dict = {
        "lengths": [],
        "names": [],
        "sequences": [],
        "sorted_name_length_pairs": [],
        "sorted_sequences": [],
    }
    for s in fasta_seq_digests:
        seq_name = s.id
        seq_length = s.length
        seq_digest = "SQ." + s.sha512t24u
        # Canonical string of the {length, name} pair; stored undigested.
        nlp = {"length": seq_length, "name": seq_name}
        snlp_digest = canonical_str(nlp)
        seqcol_dict["lengths"].append(seq_length)
        seqcol_dict["names"].append(seq_name)
        seqcol_dict["sorted_name_length_pairs"].append(snlp_digest)
        seqcol_dict["sequences"].append(seq_digest)
        seqcol_dict["sorted_sequences"].append(seq_digest)

    # Both "sorted_*" attributes must be order-independent. Previously only
    # the name/length pairs were sorted, leaving "sorted_sequences" in FASTA
    # order (i.e. identical to "sequences").
    seqcol_dict["sorted_name_length_pairs"].sort()
    seqcol_dict["sorted_sequences"].sort()
    return seqcol_dict

fasta_to_digest

fasta_to_digest(fa_file_path, inherent_attrs=['names', 'sequences'])

Given a fasta file path, return a digest

Parameters:

Name Type Description Default
fa_file_path str | Path

Path to the fasta file

required
inherent_attrs Optional[list]

Attributes to include in the digest.

['names', 'sequences']

Returns:

Type Description
str

The top-level digest for this sequence collection

Source code in refget/utilities.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def fasta_to_digest(
    fa_file_path: str | Path, inherent_attrs: Optional[list] = ["names", "sequences"]
) -> str:
    """
    Compute the top-level sequence collection digest of a fasta file.

    The file is first converted with ``fasta_to_seqcol_dict`` and the result
    is handed to ``seqcol_digest`` together with ``inherent_attrs``.

    Args:
        fa_file_path (str | Path): Path to the fasta file
        inherent_attrs (Optional[list], optional): Attributes to include in the digest.

    Returns:
        (str): The top-level digest for this sequence collection
    """
    return seqcol_digest(fasta_to_seqcol_dict(fa_file_path), inherent_attrs)

SequenceClient

SequenceClient(urls=['https://www.ebi.ac.uk/ena/cram'], raise_errors=None)

Bases: RefgetClient

A client for interacting with a refget sequences API.

Initializes the sequences client.

Parameters:

Name Type Description Default
urls list

A list of base URLs of the sequences API. Defaults to ["https://www.ebi.ac.uk/ena/cram"].

['https://www.ebi.ac.uk/ena/cram']
raise_errors bool

Whether to raise errors or log them. Defaults to None, which will guess.

None

Attributes: urls (list): The list of base URLs of the sequences API.

Source code in refget/clients.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def __init__(self, urls=["https://www.ebi.ac.uk/ena/cram"], raise_errors=None):
    """
    Initializes the sequences client.

    Args:
        urls (list | str, optional): A list of base URLs of the sequences API. A single
            URL string is also accepted and treated as a one-element list.
            Defaults to ["https://www.ebi.ac.uk/ena/cram"].
        raise_errors (bool, optional): Whether to raise errors or log them. Defaults to None,
            which resolves to ``__name__ == "__main__"`` as evaluated in this module.
    Attributes:
        urls (list): The list of base URLs of the sequences API, without trailing slashes.
        raise_errors (bool): The resolved error-handling flag.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(urls, str):
        urls = [urls]
    # Remove trailing slashes from input URLs (builds a new list, so the
    # shared default list is never mutated).
    self.urls = [url.rstrip("/") for url in urls]
    # If raise_errors is None, set it to True if the client is not being used as a library.
    # NOTE(review): this checks this module's own __name__, so it is True only
    # when this module itself is the entry point -- confirm that is intended.
    if raise_errors is None:
        raise_errors = __name__ == "__main__"
    self.raise_errors = raise_errors

get_metadata

get_metadata(digest)

Retrieves metadata for a given sequence digest.

Parameters:

Name Type Description Default
digest str

The digest of the sequence.

required

Returns:

Type Description
dict

The metadata.

Source code in refget/clients.py
78
79
80
81
82
83
84
85
86
87
88
89
def get_metadata(self, digest):
    """
    Look up the metadata record of a sequence.

    Requests ``/sequence/{digest}/metadata`` through ``_try_urls`` using the
    configured base URLs and this client's ``raise_errors`` setting.

    Args:
        digest (str): The digest of the sequence.

    Returns:
        (dict): The metadata.
    """
    return _try_urls(
        self.urls,
        f"/sequence/{digest}/metadata",
        raise_errors=self.raise_errors,
    )

get_sequence

get_sequence(digest, start=None, end=None)

Retrieves a sequence for a given digest.

Parameters:

Name Type Description Default
digest str

The digest of the sequence.

required

Returns:

Type Description
str

The sequence.

Source code in refget/clients.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def get_sequence(self, digest, start=None, end=None):
    """
    Fetch the sequence identified by a digest, optionally restricted to a range.

    Args:
        digest (str): The digest of the sequence.
        start (optional): Sent as the ``start`` query parameter when not None.
        end (optional): Sent as the ``end`` query parameter when not None.

    Returns:
        (str): The sequence.
    """
    # Only forward the bounds the caller actually supplied.
    bounds = (("start", start), ("end", end))
    query_params = {key: value for key, value in bounds if value is not None}

    return _try_urls(
        self.urls,
        f"/sequence/{digest}",
        params=query_params,
        raise_errors=self.raise_errors,
    )

SequenceCollectionClient

SequenceCollectionClient(urls=['https://seqcolapi.databio.org'], raise_errors=None)

Bases: RefgetClient

A client for interacting with a refget sequence collections API.

Initializes the sequence collection client.

Parameters:

Name Type Description Default
urls list

A list of base URLs of the sequence collection API. Defaults to ["https://seqcolapi.databio.org"].

['https://seqcolapi.databio.org']

Attributes:

Name Type Description
urls list

The list of base URLs of the sequence collection API.

Source code in refget/clients.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def __init__(self, urls=["https://seqcolapi.databio.org"], raise_errors=None):
    """
    Initializes the sequence collection client.

    Args:
        urls (list | str, optional): A list of base URLs of the sequence collection API.
            A single URL string is also accepted and treated as a one-element list.
            Defaults to ["https://seqcolapi.databio.org"].
        raise_errors (bool, optional): Whether to raise errors or log them. Defaults to None,
            which resolves to ``__name__ == "__main__"`` as evaluated in this module.

    Attributes:
        urls (list): The list of base URLs of the sequence collection API, without trailing slashes.
        raise_errors (bool): The resolved error-handling flag.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(urls, str):
        urls = [urls]
    # Remove trailing slashes from input URLs (builds a new list, so the
    # shared default list is never mutated).
    self.urls = [url.rstrip("/") for url in urls]
    # If raise_errors is None, set it to True if the client is not being used as a library.
    # NOTE(review): this checks this module's own __name__, so it is True only
    # when this module itself is the entry point -- confirm that is intended.
    if raise_errors is None:
        raise_errors = __name__ == "__main__"
    self.raise_errors = raise_errors

compare

compare(digest1, digest2)

Compares two sequence collections.

Parameters:

Name Type Description Default
digest1 str

The digest of the first sequence collection.

required
digest2 str

The digest of the second sequence collection.

required

Returns:

Type Description
dict

The JSON response containing the comparison of the two sequence collections.

Source code in refget/clients.py
142
143
144
145
146
147
148
149
150
151
152
153
154
def compare(self, digest1, digest2):
    """
    Request a comparison of two sequence collections.

    Args:
        digest1 (str): The digest of the first sequence collection.
        digest2 (str): The digest of the second sequence collection.

    Returns:
        (dict): The JSON response containing the comparison of the two sequence collections.
    """
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here.
    return _try_urls(self.urls, f"/comparison/{digest1}/{digest2}")

get_attribute

get_attribute(attribute, digest, level=2)

Retrieves a specific attribute for a given digest and detail level.

Parameters:

Name Type Description Default
attribute str

The attribute to retrieve.

required
digest str

The digest of the attribute.

required

Returns:

Type Description
dict

The JSON response containing the attribute.

Source code in refget/clients.py
128
129
130
131
132
133
134
135
136
137
138
139
140
def get_attribute(self, attribute, digest, level=2):
    """
    Fetch a single attribute value by attribute name and digest.

    Args:
        attribute (str): The attribute to retrieve.
        digest (str): The digest of the attribute.
        level (int, optional): Accepted but not used when building the request.

    Returns:
        (dict): The JSON response containing the attribute.
    """
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here.
    return _try_urls(self.urls, f"/attribute/collection/{attribute}/{digest}")

get_collection

get_collection(digest, level=2)

Retrieves a sequence collection for a given digest and detail level.

Parameters:

Name Type Description Default
digest str

The digest of the sequence collection.

required
level int

The level of detail for the sequence collection. Defaults to 2.

2

Returns:

Type Description
dict

The JSON response containing the sequence collection.

Source code in refget/clients.py
114
115
116
117
118
119
120
121
122
123
124
125
126
def get_collection(self, digest, level=2):
    """
    Fetch a sequence collection by digest at the requested level of detail.

    Args:
        digest (str): The digest of the sequence collection.
        level (int, optional): The level of detail for the sequence collection. Defaults to 2.

    Returns:
        (dict): The JSON response containing the sequence collection.
    """
    # The level is embedded directly in the path's query string.
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here.
    return _try_urls(self.urls, f"/collection/{digest}?level={level}")

list_attributes

list_attributes(attribute, page=None, page_size=None)

Lists all available values for a given attribute with optional paging support.

Parameters:

Name Type Description Default
attribute str

The attribute to list values for.

required
page int

The page number to retrieve. Defaults to None.

None
page_size int

The number of items per page. Defaults to None.

None

Returns:

Type Description
dict

The JSON response containing the list of available values for the attribute.

Source code in refget/clients.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def list_attributes(self, attribute, page=None, page_size=None):
    """
    List the available values of one attribute, with optional paging.

    Args:
        attribute (str): The attribute to list values for.
        page (int, optional): The page number to retrieve. Defaults to None.
        page_size (int, optional): The number of items per page. Defaults to None.

    Returns:
        (dict): The JSON response containing the list of available values for the attribute.
    """
    # Paging options are sent only when the caller supplied them.
    paging = (("page", page), ("page_size", page_size))
    params = {key: value for key, value in paging if value is not None}

    # NOTE(review): self.raise_errors is not forwarded to _try_urls here.
    return _try_urls(self.urls, f"/list/attributes/{attribute}", params=params)

list_collections

list_collections(page=None, page_size=None, attribute=None, attribute_digest=None)

Lists all available sequence collections with optional paging and attribute filtering support.

Parameters:

Name Type Description Default
page int

The page number to retrieve. Defaults to None.

None
page_size int

The number of items per page. Defaults to None.

None
attribute str

The attribute to filter by. Defaults to None.

None
attribute_digest str

The attribute digest to filter by. Defaults to None.

None

Returns:

Type Description
dict

The JSON response containing the list of available sequence collections.

Source code in refget/clients.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def list_collections(self, page=None, page_size=None, attribute=None, attribute_digest=None):
    """
    List available sequence collections, with optional paging and attribute filter.

    The filtered endpoint is used only when both ``attribute`` and
    ``attribute_digest`` are truthy; otherwise all collections are listed.

    Args:
        page (int, optional): The page number to retrieve. Defaults to None.
        page_size (int, optional): The number of items per page. Defaults to None.
        attribute (str, optional): The attribute to filter by. Defaults to None.
        attribute_digest (str, optional): The attribute digest to filter by. Defaults to None.

    Returns:
        (dict): The JSON response containing the list of available sequence collections.
    """
    # Paging options are sent only when the caller supplied them.
    paging = (("page", page), ("page_size", page_size))
    params = {key: value for key, value in paging if value is not None}

    endpoint = (
        f"/list/collections/{attribute}/{attribute_digest}"
        if attribute and attribute_digest
        else "/list/collections"
    )

    # NOTE(review): self.raise_errors is not forwarded to _try_urls here.
    return _try_urls(self.urls, endpoint, params=params)

service_info

service_info()

Retrieves information about the service.

Returns:

Type Description
dict

The service information.

Source code in refget/clients.py
203
204
205
206
207
208
209
210
211
def service_info(self):
    """
    Query the ``/service-info`` endpoint of the configured servers.

    Returns:
        (dict): The service information.
    """
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here.
    return _try_urls(self.urls, "/service-info")

RefgetDBAgent

RefgetDBAgent(engine=None, postgres_str=None, schema=f'{SCHEMA_FILEPATH}/seqcol.json', inherent_attrs=['names', 'lengths', 'sequences'])

Bases: object

Primary aggregator agent, interface to all other agents

Parameterize it via these environment variables: - POSTGRES_HOST - POSTGRES_DB - POSTGRES_USER - POSTGRES_PASSWORD

Source code in refget/agents.py
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
def __init__(
    self,
    engine: Optional[SqlalchemyDatabaseEngine] = None,
    postgres_str: Optional[str] = None,
    schema=f"{SCHEMA_FILEPATH}/seqcol.json",
    inherent_attrs: List[str] = ["names", "lengths", "sequences"],
):  # = "sqlite:///foo.db"
    """
    Set up the database engine, create tables, load the schema and build the sub-agents.

    Args:
        engine: An existing engine to use as-is. When None, an engine is
            created from ``postgres_str``.
        postgres_str: Connection string/URL handed to ``create_engine``. When
            falsy (and no engine is given), a ``postgresql`` URL is assembled
            from the POSTGRES_HOST, POSTGRES_DB, POSTGRES_USER and
            POSTGRES_PASSWORD environment variables.
        schema: Path passed to ``load_json``. When falsy, no schema is loaded
            and ``schema_dict`` is set to None.
        inherent_attrs: Fallback list of inherent attributes, used when no
            schema is given or the schema has no ``["ga4gh"]["inherent"]`` entry.

    Raises:
        Exception: Any error from ``create_engine`` or from creating the
            tables is logged and then re-raised.
    """
    if engine is None:
        if not postgres_str:
            # Configure via environment variables
            postgres_str = URL.create(
                "postgresql",
                username=os.getenv("POSTGRES_USER"),
                password=os.getenv("POSTGRES_PASSWORD"),
                host=os.getenv("POSTGRES_HOST"),
                database=os.getenv("POSTGRES_DB"),
            )

        try:
            engine = create_engine(postgres_str, echo=False)
        except Exception as e:
            failure_messages = (
                f"Error: {e}",
                "Unable to connect to database",
                "Please check that you have set the database credentials correctly in the environment variables",
                f"Database engine string: {postgres_str}",
            )
            for message in failure_messages:
                _LOGGER.error(message)
            raise e
    self.engine = engine

    # Make sure every table known to SQLModel exists before any agent uses it.
    try:
        SQLModel.metadata.create_all(self.engine)
    except Exception as e:
        _LOGGER.error(f"Error: {e}")
        _LOGGER.error("Unable to create tables in the database")
        raise e

    # Read schema
    if not schema:
        _LOGGER.warning("No schema provided; using defaults")
        self.schema_dict = None
        self.inherent_attrs = inherent_attrs
    else:
        self.schema_dict = load_json(schema)
        _LOGGER.debug(f"Schema: {self.schema_dict}")
        try:
            # The schema's own list of inherent attributes takes precedence.
            self.inherent_attrs = self.schema_dict["ga4gh"]["inherent"]
        except KeyError:
            self.inherent_attrs = inherent_attrs
            _LOGGER.warning(
                f"No 'inherent' attributes found in schema; using defaults: {inherent_attrs}"
            )

    # Sub-agents; the pangenome agent receives this aggregator itself rather
    # than the engine, so it is built after the sequence/seqcol agents exist.
    self.__sequence = SequenceAgent(self.engine)
    self.__seqcol = SequenceCollectionAgent(self.engine, self.inherent_attrs)
    self.__pangenome = PangenomeAgent(self)
    self.__attribute = AttributeAgent(self.engine)

truncate

truncate()

Delete all records from the database

Source code in refget/agents.py
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
def truncate(self):
    """
    Delete all records from the database.

    Returns:
        The ``rowcount`` reported for the SequenceCollection delete.
    """
    # Cleared after the collections, in this order.
    # SortedNameLengthPairsAttr is not deleted here.
    remaining_tables = (
        Pangenome,
        NamesAttr,
        LengthsAttr,
        SequencesAttr,
        NameLengthPairsAttr,
        SortedSequencesAttr,
    )

    with Session(self.engine) as session:
        collections_deleted = session.exec(delete(SequenceCollection))
        for table in remaining_tables:
            session.exec(delete(table))

        session.commit()
        return collections_deleted.rowcount

SequenceCollectionAgent

SequenceCollectionAgent(engine, inherent_attrs=None)

Bases: object

Agent for interacting with database of sequence collection

Source code in refget/agents.py
145
146
147
def __init__(self, engine, inherent_attrs=None):
    """
    Store the database engine and the inherent attribute list on the agent.

    Args:
        engine: Database engine; used by this agent's methods to open sessions.
        inherent_attrs: Inherent attribute names, stored unchanged (may be None).
    """
    self.engine = engine
    self.inherent_attrs = inherent_attrs

add

add(seqcol)

Add a sequence collection to the database, given a SequenceCollection object

Source code in refget/agents.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
def add(self, seqcol: SequenceCollection) -> SequenceCollection:
    """
    Add a sequence collection to the database, given a SequenceCollection object.

    If a collection with the same digest is already stored, that stored
    object is returned and nothing is written. Otherwise a new collection
    row is created, each attribute row is reused when its digest already
    exists (or created when it does not), and everything is committed.

    Args:
        seqcol (SequenceCollection): The collection to store.

    Returns:
        (SequenceCollection): The existing stored collection, or the newly
            created one.
    """
    # (attribute model, attribute name on seqcol), in the order they are
    # looked up and later linked. sorted_name_length_pairs is a transient
    # attribute and is only stored as a digest on the collection itself.
    attribute_specs = (
        (NamesAttr, "names"),
        (SequencesAttr, "sequences"),
        (SortedSequencesAttr, "sorted_sequences"),
        (LengthsAttr, "lengths"),
        (NameLengthPairsAttr, "name_length_pairs"),
    )

    with Session(self.engine, expire_on_commit=False) as session:
        with session.no_autoflush:
            existing = session.get(SequenceCollection, seqcol.digest)
            if existing:  # already exists
                return existing

            # Fresh collection row, not yet linked to any attribute.
            new_collection = SequenceCollection(
                digest=seqcol.digest,
                sorted_name_length_pairs_digest=seqcol.sorted_name_length_pairs_digest,
            )

            # Check if attributes exist; only create them if they don't
            attribute_rows = []
            for model, attr_name in attribute_specs:
                source_attr = getattr(seqcol, attr_name)
                row = session.get(model, source_attr.digest)
                if not row:
                    row = model(**source_attr.model_dump())
                    session.add(row)
                attribute_rows.append(row)

            # Link the attributes back to the sequence collection
            for row in attribute_rows:
                row.collection.append(new_collection)

            session.commit()
            return new_collection

add_from_dict

add_from_dict(seqcol_dict)

Add a sequence collection from a seqcol dictionary

Source code in refget/agents.py
237
238
239
240
241
242
243
244
def add_from_dict(self, seqcol_dict: dict):
    """
    Build a SequenceCollection from a seqcol dictionary and store it.

    The dictionary is converted with ``SequenceCollection.from_dict`` using
    this agent's ``inherent_attrs``; the result is passed to ``add`` and
    whatever ``add`` returns is returned.
    """
    collection = SequenceCollection.from_dict(seqcol_dict, self.inherent_attrs)
    _LOGGER.info(f"SeqCol: {collection}")
    _LOGGER.debug(f"SeqCol name_length_pairs: {collection.name_length_pairs.value}")
    return self.add(collection)

add_from_fasta_pep

add_from_fasta_pep(pep, fa_root)

Given a PEP project and a root directory containing the fasta files, load the fasta files into the refget database.

Args: - pep (peppy.Project): PEP project whose samples each name a fasta file - fa_root (str): Root directory containing the fasta files

Source code in refget/agents.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
def add_from_fasta_pep(self, pep: peppy.Project, fa_root):
    """
    Load every fasta file referenced by a PEP project into the refget database.

    Each sample's ``fasta`` value is joined onto ``fa_root`` and loaded with
    ``add_from_fasta_file``; progress and per-file timing are logged.

    Args:
    - pep (peppy.Project): PEP project whose samples each have a ``fasta`` attribute
    - fa_root (str): Root directory containing the fasta files

    Returns:
        dict mapping each sample's ``fasta`` value to the digest of the
        collection loaded from it.
    """
    import time

    sample_count = len(pep.samples)
    digests_by_fasta = {}

    for position, sample in enumerate(pep.samples, start=1):
        fasta_name = sample.fasta
        fa_path = os.path.join(fa_root, fasta_name)
        _LOGGER.info(f"Loading {fa_path} ({position} of {sample_count})")

        # Time just the load of this one file.
        began = time.time()
        digests_by_fasta[fasta_name] = self.add_from_fasta_file(fa_path).digest
        elapsed_time = time.time() - began

        _LOGGER.info(f"Loaded in {elapsed_time:.2f} seconds")

    return digests_by_fasta

SequenceAgent

SequenceAgent(engine)

Bases: object

Agent for interacting with database of sequences

Source code in refget/agents.py
77
78
def __init__(self, engine):
    """
    Store the database engine on the agent.

    Args:
        engine: Database engine, kept as ``self.engine``.
    """
    self.engine = engine