Skip to content

Package refget documentation

create_refget_router

create_refget_router(sequences=False, collections=True, pangenomes=False)

Create a FastAPI router for the sequence collection API. This router provides endpoints for retrieving and comparing sequence collections. You can choose which endpoints to include by setting the sequences, collections, or pangenomes flags.

Parameters:

Name Type Description Default
sequences bool

Include sequence endpoints

False
collections bool

Include sequence collection endpoints

True
pangenomes bool

Include pangenome endpoints

False

Returns:

Type Description
APIRouter

A FastAPI router with the specified endpoints

Examples:

app.include_router(create_refget_router(sequences=False, pangenomes=False))
Source code in refget/refget_router.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def create_refget_router(
    sequences: bool = False, collections: bool = True, pangenomes: bool = False
):
    """
    Build a FastAPI router exposing the selected refget endpoint groups.

    Each flag independently switches one group of endpoints on or off. Enabled
    groups are attached in a fixed order: sequences, collections, pangenomes.

    Args:
        sequences (bool): Include sequence endpoints
        collections (bool): Include sequence collection endpoints
        pangenomes (bool): Include pangenome endpoints

    Returns:
        (APIRouter): A FastAPI router with the specified endpoints

    Examples:
        ```
        app.include_router(create_refget_router(sequences=False, pangenomes=False))
        ```
    """
    router = APIRouter()
    # (flag, log message, sub-router) for every optional endpoint group
    endpoint_groups = (
        (sequences, "Adding sequence endpoints...", seq_router),
        (collections, "Adding collection endpoints...", seqcol_router),
        (pangenomes, "Adding pangenome endpoints...", pangenome_router),
    )
    for enabled, message, sub_router in endpoint_groups:
        if enabled:
            _LOGGER.info(message)
            router.include_router(sub_router)
    return router

fasta_to_seqcol_dict

fasta_to_seqcol_dict(fasta_file_path, digest_function=sha512t24u_digest)

Convert a FASTA file into a Sequence Collection object.

Parameters:

Name Type Description Default
fasta_file_path str

Path to the FASTA file

required
digest_function DigestFunction

Digest function to use. Defaults to sha512t24u_digest.

sha512t24u_digest

Returns:

Type Description
dict

A canonical sequence collection object

Source code in refget/utilities.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def fasta_to_seqcol_dict(
    fasta_file_path: str,
    digest_function: DigestFunction = sha512t24u_digest,
) -> SeqColDict:
    """
    Convert a FASTA file into a Sequence Collection object.

    Args:
        fasta_file_path (str): Path to the FASTA file
        digest_function (DigestFunction, optional): Digest function to use.
            Defaults to sha512t24u_digest. NOTE: the parameter is currently
            accepted but not applied to any value in this function.

    Returns:
        (dict): A canonical sequence collection object with the keys
            "lengths", "names", "sequences", "sorted_name_length_pairs" and
            "sorted_sequences". The first three keep the order of the FASTA
            records; the two "sorted_*" lists are sorted.
    """
    seqcol_dict = {
        "lengths": [],
        "names": [],
        "sequences": [],
        "sorted_name_length_pairs": [],
        "sorted_sequences": [],
    }
    for s in fasta_to_seq_digests(fasta_file_path):
        seq_name = s.metadata.name
        seq_length = s.metadata.length
        seq_digest = "SQ." + s.metadata.sha512t24u
        # The canonical string of the {length, name} pair is stored as-is; it
        # is not passed through digest_function.
        snlp = canonical_str({"length": seq_length, "name": seq_name})
        seqcol_dict["lengths"].append(seq_length)
        seqcol_dict["names"].append(seq_name)
        seqcol_dict["sorted_name_length_pairs"].append(snlp)
        seqcol_dict["sequences"].append(seq_digest)
        seqcol_dict["sorted_sequences"].append(seq_digest)
    seqcol_dict["sorted_name_length_pairs"].sort()
    # Fix: this list used to be left in FASTA order despite its name.
    seqcol_dict["sorted_sequences"].sort()
    return seqcol_dict

fasta_to_digest

fasta_to_digest(fa_file_path, inherent_attrs=['names', 'sequences'])

Given a fasta file path, return a digest

Parameters:

Name Type Description Default
fa_file_path str | Path

Path to the fasta file

required
inherent_attrs Optional[list]

Attributes to include in the digest.

['names', 'sequences']

Returns:

Type Description
str

The top-level digest for this sequence collection

Source code in refget/utilities.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def fasta_to_digest(
    fa_file_path: str | Path, inherent_attrs: Optional[list] = ["names", "sequences"]
) -> str:
    """
    Compute the top-level sequence collection digest of a fasta file.

    The file is first converted with fasta_to_seqcol_dict, and the resulting
    dict is then handed to seqcol_digest together with inherent_attrs.

    Args:
        fa_file_path (str | Path): Path to the fasta file
        inherent_attrs (Optional[list], optional): Attributes to include in the digest.

    Returns:
        (str): The top-level digest for this sequence collection
    """
    return seqcol_digest(fasta_to_seqcol_dict(fa_file_path), inherent_attrs)

SequenceClient

SequenceClient(urls=['https://www.ebi.ac.uk/ena/cram'], raise_errors=None)

Bases: RefgetClient

A client for interacting with a refget sequences API.

Initializes the sequences client.

Parameters:

Name Type Description Default
urls list

A list of base URLs of the sequences API. Defaults to ["https://www.ebi.ac.uk/ena/cram"].

['https://www.ebi.ac.uk/ena/cram']
raise_errors bool

Whether to raise errors or log them. Defaults to None, which will guess.

None

Attributes: urls (list): The list of base URLs of the sequences API.

Source code in refget/clients.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def __init__(self, urls=["https://www.ebi.ac.uk/ena/cram"], raise_errors=None):
    """
    Initializes the sequences client.

    Args:
        urls (list | str, optional): Base URL(s) of the sequences API. A single URL may be
            given as a plain string. Defaults to ["https://www.ebi.ac.uk/ena/cram"].
        raise_errors (bool, optional): Whether to raise errors or log them. Defaults to None, which will guess.
    Attributes:
        urls (list): The list of base URLs of the sequences API, without trailing slashes.
        raise_errors (bool): The resolved error-handling flag.
    """
    # A bare string would otherwise be iterated character by character below
    if isinstance(urls, str):
        urls = [urls]
    # Remove trailing slashes from input URLs
    self.urls = [url.rstrip("/") for url in urls]
    # If raise_errors is None, guess. NOTE: `__name__` is the name of the module
    # defining this class, so the guess is True only when that module itself is
    # executed as a script.
    if raise_errors is None:
        raise_errors = __name__ == "__main__"
    self.raise_errors = raise_errors

get_metadata

get_metadata(digest)

Retrieves metadata for a given sequence digest.

Parameters:

Name Type Description Default
digest str

The digest of the sequence.

required

Returns:

Type Description
dict

The metadata.

Source code in refget/clients.py
78
79
80
81
82
83
84
85
86
87
88
89
def get_metadata(self, digest):
    """
    Fetch the metadata record of a sequence.

    The request goes to the `/sequence/{digest}/metadata` endpoint via
    _try_urls, using this client's `urls` and `raise_errors` settings.

    Args:
        digest (str): The digest of the sequence.

    Returns:
        (dict): The metadata.
    """
    return _try_urls(
        self.urls, f"/sequence/{digest}/metadata", raise_errors=self.raise_errors
    )

get_sequence

get_sequence(digest, start=None, end=None)

Retrieves a sequence for a given digest.

Parameters:

Name Type Description Default
digest str

The digest of the sequence.

required

Returns:

Type Description
str

The sequence.

Source code in refget/clients.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def get_sequence(self, digest, start=None, end=None):
    """
    Fetch a sequence, or part of one, by digest.

    Args:
        digest (str): The digest of the sequence.
        start (optional): Sent as the `start` query parameter when not None.
        end (optional): Sent as the `end` query parameter when not None.

    Returns:
        (str): The sequence.
    """
    # Only bounds that were actually supplied become query parameters
    bounds = (("start", start), ("end", end))
    query_params = {key: value for key, value in bounds if value is not None}
    return _try_urls(
        self.urls,
        f"/sequence/{digest}",
        params=query_params,
        raise_errors=self.raise_errors,
    )

SequenceCollectionClient

SequenceCollectionClient(urls=['https://seqcolapi.databio.org'], raise_errors=None)

Bases: RefgetClient

A client for interacting with a refget sequence collections API.

Initializes the sequence collection client.

Parameters:

Name Type Description Default
urls list

A list of base URLs of the sequence collection API. Defaults to ["https://seqcolapi.databio.org"].

['https://seqcolapi.databio.org']

Attributes:

Name Type Description
urls list

The list of base URLs of the sequence collection API.

Source code in refget/clients.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def __init__(self, urls=["https://seqcolapi.databio.org"], raise_errors=None):
    """
    Initializes the sequence collection client.

    Args:
        urls (list | str, optional): Base URL(s) of the sequence collection API. A single URL
            may be given as a plain string. Defaults to ["https://seqcolapi.databio.org"].
        raise_errors (bool, optional): Whether to raise errors or log them. Defaults to None, which will guess.

    Attributes:
        urls (list): The list of base URLs of the sequence collection API, without trailing slashes.
        raise_errors (bool): The resolved error-handling flag.
    """
    # A bare string would otherwise be iterated character by character below
    if isinstance(urls, str):
        urls = [urls]
    # Remove trailing slashes from input URLs
    self.urls = [url.rstrip("/") for url in urls]
    # If raise_errors is None, guess. NOTE: `__name__` is the name of the module
    # defining this class, so the guess is True only when that module itself is
    # executed as a script.
    if raise_errors is None:
        raise_errors = __name__ == "__main__"
    self.raise_errors = raise_errors

compare

compare(digest1, digest2)

Compares two sequence collections.

Parameters:

Name Type Description Default
digest1 str

The digest of the first sequence collection.

required
digest2 str

The digest of the second sequence collection.

required

Returns:

Type Description
dict

The JSON response containing the comparison of the two sequence collections.

Source code in refget/clients.py
142
143
144
145
146
147
148
149
150
151
152
153
154
def compare(self, digest1, digest2):
    """
    Request a comparison of two sequence collections from the API.

    Args:
        digest1 (str): The digest of the first sequence collection.
        digest2 (str): The digest of the second sequence collection.

    Returns:
        (dict): The JSON response containing the comparison of the two sequence collections.
    """
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here,
    # unlike the SequenceClient methods — confirm this is intended.
    return _try_urls(self.urls, f"/comparison/{digest1}/{digest2}")

get_attribute

get_attribute(attribute, digest, level=2)

Retrieves a specific attribute for a given digest and detail level.

Parameters:

Name Type Description Default
attribute str

The attribute to retrieve.

required
digest str

The digest of the attribute.

required

Returns:

Type Description
dict

The JSON response containing the attribute.

Source code in refget/clients.py
128
129
130
131
132
133
134
135
136
137
138
139
140
def get_attribute(self, attribute, digest, level=2):
    """
    Fetch the value of one attribute by the attribute's digest.

    Args:
        attribute (str): The attribute to retrieve.
        digest (str): The digest of the attribute.
        level (int, optional): Accepted but not used by this method.

    Returns:
        (dict): The JSON response containing the attribute.
    """
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here,
    # unlike the SequenceClient methods — confirm this is intended.
    return _try_urls(self.urls, f"/attribute/collection/{attribute}/{digest}")

get_collection

get_collection(digest, level=2)

Retrieves a sequence collection for a given digest and detail level.

Parameters:

Name Type Description Default
digest str

The digest of the sequence collection.

required
level int

The level of detail for the sequence collection. Defaults to 2.

2

Returns:

Type Description
dict

The JSON response containing the sequence collection.

Source code in refget/clients.py
114
115
116
117
118
119
120
121
122
123
124
125
126
def get_collection(self, digest, level=2):
    """
    Fetch a sequence collection by digest at the requested detail level.

    The level is embedded directly in the endpoint as a `?level=` query string.

    Args:
        digest (str): The digest of the sequence collection.
        level (int, optional): The level of detail for the sequence collection. Defaults to 2.

    Returns:
        (dict): The JSON response containing the sequence collection.
    """
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here,
    # unlike the SequenceClient methods — confirm this is intended.
    return _try_urls(self.urls, f"/collection/{digest}?level={level}")

list_attributes

list_attributes(attribute, page=None, page_size=None)

Lists all available values for a given attribute with optional paging support.

Parameters:

Name Type Description Default
attribute str

The attribute to list values for.

required
page int

The page number to retrieve. Defaults to None.

None
page_size int

The number of items per page. Defaults to None.

None

Returns:

Type Description
dict

The JSON response containing the list of available values for the attribute.

Source code in refget/clients.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def list_attributes(self, attribute, page=None, page_size=None):
    """
    List the available values of an attribute, optionally one page at a time.

    Args:
        attribute (str): The attribute to list values for.
        page (int, optional): The page number to retrieve. Defaults to None.
        page_size (int, optional): The number of items per page. Defaults to None.

    Returns:
        (dict): The JSON response containing the list of available values for the attribute.
    """
    # Paging options left at None are omitted from the request entirely
    paging = (("page", page), ("page_size", page_size))
    params = {key: value for key, value in paging if value is not None}
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here,
    # unlike the SequenceClient methods — confirm this is intended.
    return _try_urls(self.urls, f"/list/attributes/{attribute}", params=params)

list_collections

list_collections(page=None, page_size=None, attribute=None, attribute_digest=None)

Lists all available sequence collections with optional paging and attribute filtering support.

Parameters:

Name Type Description Default
page int

The page number to retrieve. Defaults to None.

None
page_size int

The number of items per page. Defaults to None.

None
attribute str

The attribute to filter by. Defaults to None.

None
attribute_digest str

The attribute digest to filter by. Defaults to None.

None

Returns:

Type Description
dict

The JSON response containing the list of available sequence collections.

Source code in refget/clients.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def list_collections(self, page=None, page_size=None, attribute=None, attribute_digest=None):
    """
    List the available sequence collections, with optional paging and attribute filtering.

    The attribute filter is applied only when both `attribute` and
    `attribute_digest` are truthy; otherwise the unfiltered listing is requested.

    Args:
        page (int, optional): The page number to retrieve. Defaults to None.
        page_size (int, optional): The number of items per page. Defaults to None.
        attribute (str, optional): The attribute to filter by. Defaults to None.
        attribute_digest (str, optional): The attribute digest to filter by. Defaults to None.

    Returns:
        (dict): The JSON response containing the list of available sequence collections.
    """
    # Paging options left at None are omitted from the request entirely
    paging = (("page", page), ("page_size", page_size))
    params = {key: value for key, value in paging if value is not None}

    if not (attribute and attribute_digest):
        endpoint = "/list/collections"
    else:
        endpoint = f"/list/collections/{attribute}/{attribute_digest}"

    # NOTE(review): self.raise_errors is not forwarded to _try_urls here,
    # unlike the SequenceClient methods — confirm this is intended.
    return _try_urls(self.urls, endpoint, params=params)

service_info

service_info()

Retrieves information about the service.

Returns:

Type Description
dict

The service information.

Source code in refget/clients.py
203
204
205
206
207
208
209
210
211
def service_info(self):
    """
    Fetch the service description from the `/service-info` endpoint.

    Returns:
        (dict): The service information.
    """
    # NOTE(review): self.raise_errors is not forwarded to _try_urls here,
    # unlike the SequenceClient methods — confirm this is intended.
    return _try_urls(self.urls, "/service-info")

RefgetDBAgent

RefgetDBAgent(engine=None, postgres_str=None, schema=f'{SCHEMA_FILEPATH}/seqcol.json', inherent_attrs=['names', 'lengths', 'sequences'])

Bases: object

Primary aggregator agent, interface to all other agents

When neither an engine nor a connection string is supplied, the database connection is configured via these environment variables: - POSTGRES_HOST - POSTGRES_DB - POSTGRES_USER - POSTGRES_PASSWORD

Source code in refget/agents.py
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
def __init__(
    self,
    engine: Optional[SqlalchemyDatabaseEngine] = None,
    postgres_str: Optional[str] = None,
    schema=f"{SCHEMA_FILEPATH}/seqcol.json",
    inherent_attrs: List[str] = ["names", "lengths", "sequences"],
):
    """
    Resolve the database engine, create the tables, load the schema and build the sub-agents.

    Args:
        engine: An existing database engine to reuse. When given, postgres_str
            and the environment variables are not consulted.
        postgres_str: Connection string handed to create_engine. When neither
            this nor engine is given, the connection is built from the
            POSTGRES_HOST, POSTGRES_DB, POSTGRES_USER and POSTGRES_PASSWORD
            environment variables.
        schema: Location of the seqcol JSON schema, passed to load_json. A
            falsy value skips schema loading.
        inherent_attrs: Fallback inherent attributes, used when no schema is
            loaded or the schema has no ["ga4gh"]["inherent"] entry.

    Raises:
        Exception: Any error raised while creating the engine or the tables is
            logged and then re-raised.
    """
    if engine is None:
        if not postgres_str:
            # Configure via environment variables
            postgres_str = URL.create(
                "postgresql",
                username=os.getenv("POSTGRES_USER"),
                password=os.getenv("POSTGRES_PASSWORD"),
                host=os.getenv("POSTGRES_HOST"),
                database=os.getenv("POSTGRES_DB"),
            )

        try:
            engine = create_engine(postgres_str, echo=False)
        except Exception as e:
            _LOGGER.error(f"Error: {e}")
            _LOGGER.error("Unable to connect to database")
            _LOGGER.error(
                "Please check that you have set the database credentials correctly in the environment variables"
            )
            _LOGGER.error(f"Database engine string: {postgres_str}")
            raise e
    self.engine = engine

    try:
        SQLModel.metadata.create_all(self.engine)
    except Exception as e:
        _LOGGER.error(f"Error: {e}")
        _LOGGER.error("Unable to create tables in the database")
        raise e

    # Read schema; it takes precedence over the inherent_attrs argument
    if not schema:
        _LOGGER.warning("No schema provided; using defaults")
        self.schema_dict = None
        self.inherent_attrs = inherent_attrs
    else:
        self.schema_dict = load_json(schema)
        _LOGGER.debug(f"Schema: {self.schema_dict}")
        try:
            self.inherent_attrs = self.schema_dict["ga4gh"]["inherent"]
        except KeyError:
            self.inherent_attrs = inherent_attrs
            _LOGGER.warning(
                f"No 'inherent' attributes found in schema; using defaults: {inherent_attrs}"
            )

    # Sub-agents; the pangenome agent receives this aggregator itself
    self.__sequence = SequenceAgent(self.engine)
    self.__seqcol = SequenceCollectionAgent(self.engine, self.inherent_attrs)
    self.__pangenome = PangenomeAgent(self)
    self.__attribute = AttributeAgent(self.engine)

calc_similarities

calc_similarities(digestA, digestB)

Calculates the Jaccard similarity between two sequence collections.

This method retrieves two sequence collections using their digests and then computes jaccard similarities for all attributes.

Parameters:

Name Type Description Default
digestA str

The digest (identifier) for the first sequence collection.

required
digestB str

The digest (identifier) for the second sequence collection.

required

Returns:

Name Type Description
dict

The Jaccard similarity score between the two sequence collections for all present and shared attributes.

Source code in refget/agents.py
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
def calc_similarities(self, digestA, digestB):
    """
    Compute Jaccard similarities between two stored sequence collections.

    Both collections are fetched through `self.seqcol.get` in "level2" format
    and then passed to calc_jaccard_similarities.

    Args:
        digestA (str): The digest (identifier) for the first sequence collection.
        digestB (str): The digest (identifier) for the second sequence collection.

    Returns:
        dict: The Jaccard similarity score between the two sequence collections for all present and shared attributes.

    """
    seqcol_a = self.seqcol.get(digestA, return_format="level2")
    seqcol_b = self.seqcol.get(digestB, return_format="level2")
    similarities = calc_jaccard_similarities(seqcol_a, seqcol_b)
    return similarities

calc_similarities_seqcol_dicts

calc_similarities_seqcol_dicts(seqcolA, seqcolB)

Calculates the Jaccard similarity between two sequence collections.

This method takes two sequence collections that are already in dictionary form and computes Jaccard similarities between them; neither collection is looked up by digest.

Parameters:

Name Type Description Default
seqcolA dict

the first sequence collection in dict format.

required
seqcolB dict

the second sequence collection in dict format.

required

Returns:

Name Type Description
dict

The Jaccard similarity score between the two sequence collections for all present and shared attributes.

Source code in refget/agents.py
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
def calc_similarities_seqcol_dicts(self, seqcolA, seqcolB):
    """
    Compute Jaccard similarities between two sequence collection dicts.

    Both collections are supplied directly as dictionaries and handed to
    calc_jaccard_similarities; neither is looked up by digest.

    Args:
        seqcolA (dict): the first sequence collection in dict format.
        seqcolB (dict): the second sequence collection in dict format.

    Returns:
        dict: The Jaccard similarity score between the two sequence collections for all present and shared attributes.

    """
    similarities = calc_jaccard_similarities(seqcolA, seqcolB)
    return similarities

truncate

truncate()

Delete all records from the database

Source code in refget/agents.py
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
def truncate(self):
    """
    Delete all records from the database

    Rows are deleted from SequenceCollection first, then from Pangenome and
    the attribute tables, all within a single session that is committed once.

    Returns:
        The row count reported for the SequenceCollection delete only.
    """
    # Deleted after SequenceCollection, in this order
    other_models = (
        Pangenome,
        NamesAttr,
        LengthsAttr,
        SequencesAttr,
        NameLengthPairsAttr,
        SortedSequencesAttr,
    )

    with Session(self.engine) as session:
        collections_deleted = session.exec(delete(SequenceCollection))
        for model in other_models:
            session.exec(delete(model))

        session.commit()
        return collections_deleted.rowcount

SequenceCollectionAgent

SequenceCollectionAgent(engine, inherent_attrs=None)

Bases: object

Agent for interacting with database of sequence collection

Source code in refget/agents.py
146
147
148
def __init__(self, engine, inherent_attrs=None):
    """
    Store the database engine and the inherent attribute names.

    Args:
    - engine: Database engine used to open sessions
    - inherent_attrs: Passed to SequenceCollection.from_dict when adding
      collections from dictionaries; defaults to None
    """
    self.engine, self.inherent_attrs = engine, inherent_attrs

add

add(seqcol, update=False)

Add a sequence collection to the database or update it if it exists

Parameters:

Name Type Description Default
seqcol SequenceCollection

The sequence collection to add

required
update bool

If True, update an existing collection if it exists

False

Returns:

Type Description
SequenceCollection

The added or updated sequence collection

Source code in refget/agents.py
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
def add(self, seqcol: SequenceCollection, update: bool = False) -> SequenceCollection:
    """
    Add a sequence collection to the database or update it if it exists

    If a collection with the same digest is already stored and `update` is
    False, the stored collection is returned untouched. Otherwise the five
    attribute records (names, sequences, sorted_sequences, lengths,
    name_length_pairs) are looked up by digest and created when missing, and
    the collection is then either updated in place or created.

    Args:
        seqcol: The sequence collection to add
        update: If True, update an existing collection if it exists

    Returns:
        The added or updated sequence collection
    """
    # expire_on_commit=False keeps the returned object's loaded state usable
    # after the commit below.
    with Session(self.engine, expire_on_commit=False) as session:
        # Autoflush is disabled for the whole lookup/creation sequence.
        with session.no_autoflush:
            existing = session.get(SequenceCollection, seqcol.digest)

            # Already stored and no update requested: nothing to do
            if existing and not update:
                return existing

            # Process attributes (create if needed)
            # Maps attribute name -> (model class, attribute object from seqcol)
            attr_map = {
                "names": (NamesAttr, seqcol.names),
                "sequences": (SequencesAttr, seqcol.sequences),
                "sorted_sequences": (SortedSequencesAttr, seqcol.sorted_sequences),
                "lengths": (LengthsAttr, seqcol.lengths),
                "name_length_pairs": (NameLengthPairsAttr, seqcol.name_length_pairs),
            }

            processed_attrs = {}

            # Create or retrieve attributes: reuse the stored record with the
            # same digest, otherwise build a new one from the incoming object
            for attr_name, (attr_class, attr_obj) in attr_map.items():
                attr = session.get(attr_class, attr_obj.digest)
                if not attr:
                    attr = attr_class(**attr_obj.model_dump())
                    session.add(attr)
                processed_attrs[attr_name] = attr

            if existing and update:
                # Update existing collection

                # Names already attached, used to avoid adding duplicates
                existing_names = [
                    name_model.human_readable_name
                    for name_model in existing.human_readable_names
                ]

                # Only names not yet present are added; none are removed
                for name_model in seqcol.human_readable_names:
                    if name_model.human_readable_name not in existing_names:

                        new_name = HumanReadableNames(
                            human_readable_name=name_model.human_readable_name,
                            digest=existing.digest,
                        )

                        session.add(new_name)

                        existing.human_readable_names.append(new_name)

                for attr_name, attr in processed_attrs.items():
                    # Update attribute reference
                    setattr(existing, f"{attr_name}_digest", attr.digest)

                    # Update relationship by appending this collection to the
                    # attribute's `collection` list. Nothing is removed here.
                    # NOTE(review): if `attr` has no `collection` attribute the
                    # append goes to a throwaway list and is silently lost.
                    getattr(attr, "collection", []).append(existing)

                # Update transient attributes
                existing.sorted_name_length_pairs_digest = (
                    seqcol.sorted_name_length_pairs_digest
                )

                session.commit()
                return existing
            else:
                # Create new collection
                new_collection = SequenceCollection(
                    digest=seqcol.digest,
                    human_readable_names=seqcol.human_readable_names,
                    sorted_name_length_pairs_digest=seqcol.sorted_name_length_pairs_digest,
                )

                # Link attributes to collection (same throwaway-list caveat as
                # in the update branch when `collection` is missing)
                for attr in processed_attrs.values():
                    getattr(attr, "collection", []).append(new_collection)

                session.add(new_collection)
                session.commit()
                return new_collection

add_from_dict

add_from_dict(seqcol_dict, update=False)

Add a sequence collection from a seqcol dictionary

Args: - seqcol_dict (dict): The sequence collection in dictionary form - update (bool): If True, update an existing collection if it exists

Returns: - (SequenceCollection): The added or updated sequence collection

Source code in refget/agents.py
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
def add_from_dict(self, seqcol_dict: dict, update: bool = False) -> SequenceCollection:
    """
    Build a SequenceCollection from a seqcol dictionary and store it

    The dictionary is converted with SequenceCollection.from_dict, using this
    agent's inherent_attrs, and the result is passed to `add`.

    Args:
    - seqcol_dict (dict): The sequence collection in dictionary form
    - update (bool): If True, update an existing collection if it exists

    Returns:
    - (SequenceCollection): The added or updated sequence collection
    """
    collection = SequenceCollection.from_dict(seqcol_dict, self.inherent_attrs)
    _LOGGER.info(f"SeqCol: {collection}")
    _LOGGER.debug(f"SeqCol name_length_pairs: {collection.name_length_pairs.value}")
    stored = self.add(collection, update)
    return stored

add_from_fasta_file

add_from_fasta_file(fasta_file_path, update=False)

Given a path to a fasta file, load the sequences into the refget database.

Args: - fasta_file_path (str): Path to the fasta file - update (bool): If True, update an existing collection if it exists

Returns: - (SequenceCollection): The added or updated sequence collection

Source code in refget/agents.py
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
def add_from_fasta_file(
    self, fasta_file_path: str, update: bool = False
) -> SequenceCollection:
    """
    Load the sequence collection described by a fasta file into the refget database.

    The file is converted with fasta_to_seqcol_dict and the resulting
    dictionary is stored through `add_from_dict`.

    Args:
    - fasta_file_path (str): Path to the fasta file
    - update (bool): If True, update an existing collection if it exists

    Returns:
    - (SequenceCollection): The added or updated sequence collection
    """
    return self.add_from_dict(fasta_to_seqcol_dict(fasta_file_path), update)

add_from_fasta_file_with_name

add_from_fasta_file_with_name(fasta_file_path, human_readable_name, update=False)

Given a path to a fasta file, and a human-readable name, load the sequences into the refget database.

Args: - fasta_file_path (str): Path to the fasta file - human_readable_name (str): human_readable_name - update (bool): If True, update an existing collection if it exists

Returns: - (SequenceCollection): The added or updated sequence collection

Source code in refget/agents.py
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
def add_from_fasta_file_with_name(
    self,
    fasta_file_path: str,
    human_readable_name: str,
    update: bool = False,
) -> SequenceCollection:
    """
    Load a fasta file into the refget database and attach a human-readable name.

    The name is stored under the "human_readable_names" key of the seqcol
    dictionary before the dictionary is handed to `add_from_dict`.

    Args:
    - fasta_file_path (str): Path to the fasta file
    - human_readable_name (str): human_readable_name
    - update (bool): If True, update an existing collection if it exists

    Returns:
    - (SequenceCollection): The added or updated sequence collection
    """
    seqcol_dict = fasta_to_seqcol_dict(fasta_file_path)
    seqcol_dict["human_readable_names"] = human_readable_name
    return self.add_from_dict(seqcol_dict, update)

add_from_fasta_pep

add_from_fasta_pep(pep, fa_root, update=False)

Given a loaded PEP project and a root directory containing the fasta files, load the fasta files into the refget database.

Args: - pep (peppy.Project): The PEP project whose samples list the fasta files - fa_root (str): Root directory containing the fasta files - update (bool): If True, update an existing collection if it exists

Returns: - (dict): A dictionary of the digests of the added sequence collections

Source code in refget/agents.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
def add_from_fasta_pep(self, pep: peppy.Project, fa_root: str, update: bool = False) -> dict:
    """
    Load every fasta file listed in a PEP project into the refget database.

    Each sample's `fasta` value is joined onto fa_root to locate its file. A
    sample with a truthy `sample_name` is stored under that human-readable
    name; the others are stored without one.

    Args:
    - pep (peppy.Project): Project whose samples list the fasta files
    - fa_root (str): Root directory containing the fasta files
    - update (bool): If True, update an existing collection if it exists

    Returns:
    - (dict): Maps each sample's `fasta` value to the digest of its sequence collection
    """
    import time

    sample_count = len(pep.samples)
    digests = {}

    for index, sample in enumerate(pep.samples, 1):
        fa_path = os.path.join(fa_root, sample.fasta)
        _LOGGER.info(f"Loading {fa_path} ({index} of {sample_count})")

        started = time.time()
        if sample.sample_name:
            collection = self.add_from_fasta_file_with_name(
                fa_path, sample.sample_name, update
            )
        else:
            collection = self.add_from_fasta_file(fa_path, update)
        digests[sample.fasta] = collection.digest
        elapsed = time.time() - started

        _LOGGER.info(f"Loaded in {elapsed:.2f} seconds")

    return digests

get

get(digest, return_format='level2', attribute=None, itemwise_limit=None)

Get a sequence collection by digest

Args: - digest (str): The digest of the sequence collection - return_format (str): The format in which to return the sequence collection - attribute (str): Name of an attribute to return, if you just want an attribute - itemwise_limit (int): Limit the number of items returned in itemwise format

Returns: - (SequenceCollection): The sequence collection (in requested format)

Source code in refget/agents.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def get(
    self,
    digest: str,
    return_format: str = "level2",
    attribute: str = None,
    itemwise_limit: int = None,
) -> SequenceCollection:
    """
    Look up a sequence collection by digest and return it in the requested form

    Args:
    - digest (str): The digest of the sequence collection
    - return_format (str): "level2", "level1" or "itemwise"; any other value
      returns the SequenceCollection object itself
    - attribute (str): Name of an attribute to return, if you just want an
      attribute; when given it takes precedence over return_format
    - itemwise_limit (int): Limit the number of items returned in itemwise format

    Returns:
    - (SequenceCollection): The sequence collection (in requested format)

    Raises:
    - ValueError: If no sequence collection with this digest is found
    """
    with Session(self.engine) as session:
        query = select(SequenceCollection).where(SequenceCollection.digest == digest)
        seqcol = session.exec(query).one_or_none()
        if not seqcol:
            raise ValueError(f"SequenceCollection with digest '{digest}' not found")

        # A requested attribute wins over any return_format
        if attribute:
            return getattr(seqcol, attribute).value
        if return_format == "level2":
            return seqcol.level2()
        if return_format == "level1":
            return seqcol.level1()
        if return_format == "itemwise":
            return seqcol.itemwise(itemwise_limit)
        return seqcol

SequenceAgent

SequenceAgent(engine)

Bases: object

Agent for interacting with database of sequences

Source code in refget/agents.py
79
80
def __init__(self, engine):
    """
    Store the database engine this agent works with.

    Args:
    - engine: Database engine kept on the instance as `self.engine`
    """
    self.engine = engine