Package `refget` documentation

SequenceCollection

Bases: SQLModel

A SQLModel/pydantic model that represents a refget sequence collection.

digest `class-attribute` `instance-attribute`

digest = Field(primary_key=True)

Top-level digest of the SequenceCollection.

lengths `class-attribute` `instance-attribute`

lengths = Relationship(back_populates='collection')

Array of sequence lengths.

name_length_pairs `class-attribute` `instance-attribute`

name_length_pairs = Relationship(back_populates='collection')

Array of name-length pairs, representing the coordinate system of the collection.

names `class-attribute` `instance-attribute`

names = Relationship(back_populates='collection')

Array of sequence names.

sequences `class-attribute` `instance-attribute`

sequences = Relationship(back_populates='collection')

Array of sequence digests.

sorted_name_length_pairs_digest `class-attribute` `instance-attribute`

sorted_name_length_pairs_digest = Field()

Digest of the sorted name-length pairs, representing a unique digest of the sort-invariant coordinate system.

sorted_sequences `class-attribute` `instance-attribute`

sorted_sequences = Relationship(back_populates='collection')

Array of sorted sequence digests.

from_PySequenceCollection `classmethod`

from_PySequenceCollection(gtars_seq_col)

Given a PySequenceCollection object (from Rust bindings), create a SequenceCollection object.

Parameters:

Name	Type	Description	Default
`gtars_seq_col`	`PySequenceCollection`	PySequenceCollection object from Rust bindings.	required

Returns:

Type	Description
`SequenceCollection`	The SequenceCollection object.

Source code in refget/models.py

@classmethod
def from_PySequenceCollection(
    cls, gtars_seq_col: gtarsSequenceCollection
) -> "SequenceCollection":
    """
    Build a SequenceCollection from a PySequenceCollection (Rust bindings object).

    The level 1 digests for sequences, names and lengths, as well as the top-level
    digest, are taken from the Rust object; the name-length-pair and sorted
    attributes are derived here.

    Args:
        gtars_seq_col (PySequenceCollection): PySequenceCollection object from Rust bindings.

    Returns:
        (SequenceCollection): The SequenceCollection object.

    Raises:
        RuntimeError: If the Rust sequence collection bindings are not available.
    """
    if not _RUST_BINDINGS_AVAILABLE:
        raise RuntimeError(
            "Rust sequence collection bindings are not available. Cannot use `from_PySequenceCollection`."
        )

    # Walk the records once, collecting the three parallel arrays.
    names_value = []
    lengths_value = []
    bare_digests = []  # sequence digests without the "SQ." prefix
    for record in gtars_seq_col.sequences:
        bare_digests.append(record.metadata.sha512t24u)
        names_value.append(record.metadata.name)
        lengths_value.append(record.metadata.length)

    sequences_value = ["SQ." + bare_digest for bare_digest in bare_digests]

    # The name-length-pair builder gets its own lists, independent of the
    # ones stored on the attribute models below.
    nlp_source = {
        "names": list(names_value),
        "lengths": list(lengths_value),
        "sequences": bare_digests,
    }

    level1 = gtars_seq_col.lvl1

    sequences_attr = SequencesAttr(digest=level1.sequences_digest, value=sequences_value)
    _LOGGER.debug(f"SequencesAttr: {sequences_attr}")

    names_attr = NamesAttr(digest=gtars_seq_col.lvl1.names_digest, value=names_value)
    _LOGGER.debug(f"NamesAttr: {names_attr}")

    lengths_attr = LengthsAttr(digest=gtars_seq_col.lvl1.lengths_digest, value=lengths_value)
    _LOGGER.debug(f"LengthsAttr: {lengths_attr}")

    nlp = build_name_length_pairs(nlp_source)
    nlp_digest = sha512t24u_digest(canonical_str(nlp))
    nlp_attr = NameLengthPairsAttr(digest=nlp_digest, value=nlp)
    _LOGGER.debug(f"NameLengthPairsAttr: {nlp_attr}")

    # Sorted copy of the (prefixed) sequence digests; the original order is kept above.
    ordered_sequences = sorted(sequences_value)
    sorted_sequences_attr = SortedSequencesAttr(
        digest=sha512t24u_digest(canonical_str(ordered_sequences)),
        value=ordered_sequences,
    )
    _LOGGER.debug(f"SortedSequencesAttr: {sorted_sequences_attr}")

    # Digest every name-length pair, sort those digests, then digest the sorted list.
    pair_digests = sorted(sha512t24u_digest(canonical_str(pair)) for pair in nlp)
    sorted_name_length_pairs_digest = sha512t24u_digest(canonical_str(pair_digests))
    _LOGGER.debug(f"Sorted Name Length Pairs Digest: {sorted_name_length_pairs_digest}")

    seqcol = SequenceCollection(
        digest=gtars_seq_col.digest,
        human_readable_names=[],
        sequences=sequences_attr,
        sorted_sequences=sorted_sequences_attr,
        names=names_attr,
        lengths=lengths_attr,
        name_length_pairs=nlp_attr,
        sorted_name_length_pairs_digest=sorted_name_length_pairs_digest,
    )

    _LOGGER.debug(f"Created SequenceCollection from PySequenceCollection: {seqcol}")
    return seqcol

from_dict `classmethod`

from_dict(seqcol_dict, inherent_attrs=['names', 'sequences'])

Given a dict representation of a sequence collection, create a SequenceCollection object. This is the primary way to create a SequenceCollection object.

Parameters:

Name	Type	Description	Default
`seqcol_dict`	`dict`	Dictionary representation of a canonical sequence collection object	required
`inherent_attrs`	`Optional[list]`	Names of the attributes treated as inherent when computing the digest	`['names', 'sequences']`

Returns:

Type	Description
`SequenceCollection`	The SequenceCollection object

Source code in refget/models.py

@classmethod
def from_dict(
    cls, seqcol_dict: dict, inherent_attrs: Optional[list] = ["names", "sequences"]
) -> "SequenceCollection":
    """
    Given a dict representation of a sequence collection, create a SequenceCollection object.
    This is the primary way to create a SequenceCollection object.

    Args:
        seqcol_dict (dict): Dictionary representation of a canonical sequence collection
            object. The "names", "lengths" and "sequences" keys are read directly;
            "human_readable_names" (a list of strings, or a single string) is optional.
        inherent_attrs (Optional[list]): Names of the attributes treated as inherent;
            forwarded to `seqcol_dict_to_level1_dict` when building the level 1
            dict that the top-level digest is computed from.

    Returns:
        (SequenceCollection): The SequenceCollection object
    """
    # The default list object is shared by every call; give the helper its own
    # copy so the shared default can never be mutated.
    if inherent_attrs is not None:
        inherent_attrs = list(inherent_attrs)

    # NOTE: input validation (validate_seqcol) is currently not performed here.
    level1_dict = seqcol_dict_to_level1_dict(seqcol_dict, inherent_attrs)
    seqcol_digest = level1_dict_to_seqcol_digest(level1_dict)

    def _level1_digest(attr_name):
        # Non-inherent attributes are filtered out of the level 1 dict, so
        # fall back to digesting the value directly (same as for lengths below).
        if attr_name in level1_dict:
            return level1_dict[attr_name]
        return sha512t24u_digest(canonical_str(seqcol_dict[attr_name]))

    # Now, build the actual pydantic models
    sequences_attr = SequencesAttr(
        digest=_level1_digest("sequences"), value=seqcol_dict["sequences"]
    )

    names_attr = NamesAttr(digest=_level1_digest("names"), value=seqcol_dict["names"])

    lengths_attr = LengthsAttr(
        digest=sha512t24u_digest(canonical_str(seqcol_dict["lengths"])),
        value=seqcol_dict["lengths"],
    )

    nlp = build_name_length_pairs(seqcol_dict)
    nlp_attr = NameLengthPairsAttr(digest=sha512t24u_digest(canonical_str(nlp)), value=nlp)
    _LOGGER.debug(f"nlp: {nlp}")
    _LOGGER.debug(f"Name-length pairs: {nlp_attr}")

    # sorted_name_length_pairs is a transient attribute: only its level 1
    # digest is stored. It is derived from the already-built name-length
    # pairs (digest each pair, sort the digests, digest the sorted list)
    # rather than rebuilt from seqcol_dict, to save compute.
    snlp_digests = sorted(sha512t24u_digest(canonical_str(pair)) for pair in nlp)
    snlp_digest_level1 = sha512t24u_digest(canonical_str(snlp_digests))

    # sorted() returns a new list, leaving the caller's sequences untouched.
    sorted_sequences_value = sorted(seqcol_dict["sequences"])
    sorted_sequences_digest = sha512t24u_digest(canonical_str(sorted_sequences_value))
    sorted_sequences_attr = SortedSequencesAttr(
        digest=sorted_sequences_digest, value=sorted_sequences_value
    )
    _LOGGER.debug(f"sorted_sequences_value: {sorted_sequences_value}")
    _LOGGER.debug(f"sorted_sequences_digest: {sorted_sequences_digest}")
    _LOGGER.debug(f"sorted_sequences_attr: {sorted_sequences_attr}")

    # "human_readable_names" is normally a list of strings; a single non-empty
    # string is accepted for backward compatibility. Anything else is ignored.
    raw_names = seqcol_dict.get("human_readable_names")
    if isinstance(raw_names, list):
        name_strs = raw_names
    elif isinstance(raw_names, str) and raw_names:
        name_strs = [raw_names]
    else:
        name_strs = []
    human_readable_names_list = [
        HumanReadableNames(human_readable_name=name_str, digest=seqcol_digest)
        for name_str in name_strs
    ]

    seqcol = SequenceCollection(
        digest=seqcol_digest,
        human_readable_names=human_readable_names_list,
        sequences=sequences_attr,
        sorted_sequences=sorted_sequences_attr,
        names=names_attr,
        lengths=lengths_attr,
        name_length_pairs=nlp_attr,
        sorted_name_length_pairs_digest=snlp_digest_level1,
    )

    _LOGGER.debug(f"seqcol: {seqcol}")

    return seqcol

from_fasta_file `classmethod`

from_fasta_file(fasta_file)

Given a FASTA file, create a SequenceCollection object.

Parameters:

Name	Type	Description	Default
`fasta_file`	`str`	Path to a FASTA file	required

Returns:

Type	Description
`SequenceCollection`	The SequenceCollection object

Source code in refget/models.py

@classmethod
def from_fasta_file(cls, fasta_file: str) -> "SequenceCollection":
    """
    Create a SequenceCollection object from a FASTA file.

    The file is first converted to a sequence collection dict, which is then
    handed to `from_dict` with its default settings.

    Args:
        fasta_file (str): Path to a FASTA file

    Returns:
        (SequenceCollection): The SequenceCollection object
    """
    return cls.from_dict(fasta_to_seqcol_dict(fasta_file))

input_validate `classmethod`

input_validate(seqcol_obj)

Given a dict representation of a sequence collection, validate it against the input schema.

Parameters:

Name	Type	Description	Default
`seqcol_obj`	`dict`	Dictionary representation of a canonical sequence collection object	required

Returns:

Type	Description
`bool`	True if the object is valid; an `InvalidSeqColError` is raised otherwise

Source code in refget/models.py

@classmethod
def input_validate(cls, seqcol_obj: dict) -> bool:
    """
    Validate a sequence collection against the input schema (schemas/seqcol.yaml).

    Args:
        seqcol_obj (dict): Dictionary representation of a canonical sequence collection
            object. An object exposing a `level2()` method is also accepted, in which
            case its level 2 representation is what gets validated.

    Returns:
        (bool): True if the object is valid.

    Raises:
        InvalidSeqColError: If the object does not conform to the schema. The
            validation errors, sorted by path, are passed along with the exception.
    """
    schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml")
    schema = load_yaml(schema_path)
    validator = Draft7Validator(schema)

    # Resolve the instance once so that the validity check and the reported
    # errors always refer to the same data. Plain dicts are validated as-is.
    to_level2 = getattr(seqcol_obj, "level2", None)
    instance = to_level2() if callable(to_level2) else seqcol_obj

    # An instance is valid exactly when iter_errors yields nothing, so a
    # single pass serves both as the check and as the error report.
    errors = sorted(validator.iter_errors(instance), key=lambda e: e.path)
    if errors:
        raise InvalidSeqColError("Validation failed", errors)
    return True

itemwise

itemwise(limit=None)

Converts object into a list of dictionaries, one for each sequence in the collection.

Source code in refget/models.py

def itemwise(self, limit=None):
    """
    Return the collection as a list of dicts, one per sequence.

    Each dict has the keys "name", "length" and "sequence". When a truthy
    `limit` is given and the collection holds more sequences than that,
    a ValueError is raised instead.
    """
    over_limit = bool(limit) and len(self.sequences.value) > limit
    if over_limit:
        raise ValueError(f"Too many sequences to format itemwise: {len(self.sequences.value)}")
    # The lengths array drives the iteration; names and sequences are looked up by position.
    return [
        {
            "name": self.names.value[idx],
            "length": seq_length,
            "sequence": self.sequences.value[idx],
        }
        for idx, seq_length in enumerate(self.lengths.value)
    ]

level1

level1()

Converts object into dict of level 1 representation of the SequenceCollection.

Source code in refget/models.py

def level1(self):
    """
    Return the level 1 representation of the SequenceCollection as a dict.

    Each attribute name maps to the digest of that attribute.
    """
    stored_attrs = (
        "lengths",
        "names",
        "sequences",
        "sorted_sequences",
        "name_length_pairs",
    )
    level1_repr = {attr_name: getattr(self, attr_name).digest for attr_name in stored_attrs}
    # This one is kept directly as a digest field rather than as an attribute object.
    level1_repr["sorted_name_length_pairs"] = self.sorted_name_length_pairs_digest
    return level1_repr

level2

level2()

Converts object into dict of level 2 representation of the SequenceCollection.

Source code in refget/models.py

def level2(self):
    """
    Return the level 2 representation of the SequenceCollection as a dict.

    Each attribute name maps to the full value array of that attribute.
    Transient attributes (sorted_name_length_pairs) are deliberately left out
    of the level 2 representation.
    """
    stored_attrs = (
        "lengths",
        "names",
        "sequences",
        "sorted_sequences",
        "name_length_pairs",
    )
    return {attr_name: getattr(self, attr_name).value for attr_name in stored_attrs}

Package refget documentation

SequenceCollection

digest class-attribute instance-attribute

lengths class-attribute instance-attribute

name_length_pairs class-attribute instance-attribute

names class-attribute instance-attribute

sequences class-attribute instance-attribute

sorted_name_length_pairs_digest class-attribute instance-attribute

sorted_sequences class-attribute instance-attribute

from_PySequenceCollection classmethod

from_dict classmethod

from_fasta_file classmethod

input_validate classmethod

itemwise

level1

level2

Package `refget` documentation

digest `class-attribute` `instance-attribute`

lengths `class-attribute` `instance-attribute`

name_length_pairs `class-attribute` `instance-attribute`

names `class-attribute` `instance-attribute`

sequences `class-attribute` `instance-attribute`

sorted_name_length_pairs_digest `class-attribute` `instance-attribute`

sorted_sequences `class-attribute` `instance-attribute`

from_PySequenceCollection `classmethod`

from_dict `classmethod`

from_fasta_file `classmethod`

input_validate `classmethod`