Skip to content

Package refget documentation

SequenceCollection

Bases: SQLModel

A SQLModel/pydantic model that represents a refget sequence collection.

digest class-attribute instance-attribute

digest = Field(primary_key=True)

Top-level digest of the SequenceCollection.

lengths class-attribute instance-attribute

lengths = Relationship(back_populates='collection')

Array of sequence lengths.

name_length_pairs class-attribute instance-attribute

name_length_pairs = Relationship(back_populates='collection')

Array of name-length pairs, representing the coordinate system of the collection.

names class-attribute instance-attribute

names = Relationship(back_populates='collection')

Array of sequence names.

sequences class-attribute instance-attribute

sequences = Relationship(back_populates='collection')

Array of sequence digests.

sorted_name_length_pairs_digest class-attribute instance-attribute

sorted_name_length_pairs_digest = Field()

Digest of the sorted name-length pairs, which uniquely identifies the coordinate system regardless of the order of the sequences.

sorted_sequences class-attribute instance-attribute

sorted_sequences = Relationship(back_populates='collection')

Array of sorted sequence digests.

from_dict classmethod

from_dict(seqcol_dict, inherent_attrs=['names', 'sequences'])

Given a dict representation of a sequence collection, create a SequenceCollection object. This is the primary way to create a SequenceCollection object.

Parameters:

Name Type Description Default
seqcol_dict dict

Dictionary representation of a canonical sequence collection object

required
inherent_attrs Optional[list]

Names of the inherent attributes to digest

['names', 'sequences']

Returns:

Type Description
SequenceCollection

The SequenceCollection object

Source code in refget/models.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
@classmethod
def from_dict(
    cls, seqcol_dict: dict, inherent_attrs: Optional[list] = ["names", "sequences"]
) -> "SequenceCollection":
    """
    Build a SequenceCollection from the dict form of a sequence collection.

    This is the primary way to create a SequenceCollection object.

    Args:
        seqcol_dict (dict): Dictionary representation of a canonical sequence collection
            object; its "names", "sequences" and "lengths" entries are read here
        inherent_attrs (list): Names of the inherent attributes; forwarded to
            `seqcol_dict_to_level1_dict` when building the level 1 dict that is digested

    Returns:
        (SequenceCollection): The SequenceCollection object
    """
    # NOTE: the input is not validated here (no validate_seqcol call).
    l1 = seqcol_dict_to_level1_dict(seqcol_dict, inherent_attrs)
    top_level_digest = level1_dict_to_seqcol_digest(l1)

    # Inherent attributes: their digests come straight from the level 1 dict.
    seq_attr = SequencesAttr(digest=l1["sequences"], value=seqcol_dict["sequences"])
    name_attr = NamesAttr(digest=l1["names"], value=seqcol_dict["names"])

    # Non-inherent attributes are filtered out of the level 1 dict,
    # so their digests have to be computed here.
    lengths_value = seqcol_dict["lengths"]
    length_attr = LengthsAttr(
        digest=sha512t24u_digest(canonical_str(lengths_value)),
        value=lengths_value,
    )

    pairs = build_name_length_pairs(seqcol_dict)
    pairs_digest = sha512t24u_digest(canonical_str(pairs))
    pairs_attr = NameLengthPairsAttr(digest=pairs_digest, value=pairs)
    _LOGGER.debug(f"nlp: {pairs}")
    _LOGGER.debug(f"nlp canonical_str: {canonical_str(pairs)}")
    _LOGGER.debug(f"Name-length pairs: {pairs_attr}")

    # Sorted name-length pairs: each pair is canonicalized to a string, the strings
    # are sorted, and only the digest of that sorted list is kept on the model.
    sorted_pairs = sorted(canonical_str(pair).decode("utf-8") for pair in pairs)
    _LOGGER.debug(f"--- SNLP: {sorted_pairs}")
    sorted_pairs_digest = sha512t24u_digest(canonical_str(sorted_pairs))
    _LOGGER.debug(f"--- SNLP: {sorted_pairs_digest}")

    # sorted() builds a new list, leaving the caller's "sequences" list untouched.
    ordered_sequences = sorted(seqcol_dict["sequences"])
    ordered_sequences_digest = sha512t24u_digest(canonical_str(ordered_sequences))
    ordered_sequences_attr = SortedSequencesAttr(
        digest=ordered_sequences_digest, value=ordered_sequences
    )
    _LOGGER.debug(f"sorted_sequences_value: {ordered_sequences}")
    _LOGGER.debug(f"sorted_sequences_digest: {ordered_sequences_digest}")
    _LOGGER.debug(f"sorted_sequences_attr: {ordered_sequences_attr}")

    collection = SequenceCollection(
        digest=top_level_digest,
        sequences=seq_attr,
        sorted_sequences=ordered_sequences_attr,
        names=name_attr,
        lengths=length_attr,
        name_length_pairs=pairs_attr,
        sorted_name_length_pairs_digest=sorted_pairs_digest,
    )
    _LOGGER.debug(f"seqcol: {collection}")

    return collection

from_fasta_file classmethod

from_fasta_file(fasta_file)

Given a FASTA file, create a SequenceCollection object.

Parameters:

Name Type Description Default
fasta_file str

Path to a FASTA file

required

Returns:

Type Description
SequenceCollection

The SequenceCollection object

Source code in refget/models.py
145
146
147
148
149
150
151
152
153
154
155
156
157
@classmethod
def from_fasta_file(cls, fasta_file: str) -> "SequenceCollection":
    """
    Create a SequenceCollection object from a FASTA file.

    The file is first converted to a sequence collection dict with
    `fasta_to_seqcol_dict`, which is then handed to `from_dict`.

    Args:
        fasta_file (str): Path to a FASTA file

    Returns:
        (SequenceCollection): The SequenceCollection object
    """
    return cls.from_dict(fasta_to_seqcol_dict(fasta_file))

input_validate classmethod

input_validate(seqcol_obj)

Given a dict representation of a sequence collection, validate it against the input schema.

Parameters:

Name Type Description Default
seqcol_obj dict

Dictionary representation of a canonical sequence collection object

required

Returns:

Type Description
bool

True if the object is valid; an InvalidSeqColError is raised otherwise

Source code in refget/models.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
@classmethod
def input_validate(cls, seqcol_obj: dict) -> bool:
    """
    Validate a sequence collection against the input schema (schemas/seqcol.yaml).

    Args:
        seqcol_obj (dict): Dictionary representation of a canonical sequence collection
            object. An object exposing a `level2()` method is also accepted, in which
            case its level 2 representation is the instance that gets validated.

    Returns:
        (bool): True if the object is valid

    Raises:
        InvalidSeqColError: If the object does not conform to the schema; the sorted
            list of validation errors is passed along with the exception
    """
    schema_path = os.path.join(os.path.dirname(__file__), "schemas", "seqcol.yaml")
    schema = load_yaml(schema_path)
    validator = Draft7Validator(schema)

    # Resolve the instance once so that the validity check and the error report
    # describe the same data. Plain dicts (no level2 method) are validated as-is.
    level2 = getattr(seqcol_obj, "level2", None)
    instance = level2() if callable(level2) else seqcol_obj

    if not validator.is_valid(instance):
        errors = sorted(validator.iter_errors(instance), key=lambda e: e.path)
        raise InvalidSeqColError("Validation failed", errors)
    return True

itemwise

itemwise(limit=None)

Converts object into a list of dictionaries, one for each sequence in the collection.

Source code in refget/models.py
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
def itemwise(self, limit=None):
    """
    Return the collection as a list of dictionaries, one per sequence.

    Each dictionary has the keys "name", "length" and "sequence", taken from the
    same index of the names, lengths and sequences arrays.

    Args:
        limit (int): Maximum number of sequences allowed; a falsy value (the default
            None, or 0) disables the check

    Returns:
        (list): One dict per entry of the lengths array

    Raises:
        ValueError: If a limit is given and the collection has more sequences than that
    """
    sequence_count = len(self.sequences.value)
    if limit and sequence_count > limit:
        raise ValueError(f"Too many sequences to format itemwise: {len(self.sequences.value)}")
    # Driven by the lengths array; names and sequences are looked up by index.
    return [
        {
            "name": self.names.value[idx],
            "length": length,
            "sequence": self.sequences.value[idx],
        }
        for idx, length in enumerate(self.lengths.value)
    ]

level1

level1()

Converts object into dict of level 1 representation of the SequenceCollection.

Source code in refget/models.py
238
239
240
241
242
243
244
245
246
247
248
249
def level1(self):
    """
    Converts object into dict of level 1 representation of the SequenceCollection.

    Every value is a digest: the `.digest` of each attribute object, plus the
    stored digest of the sorted name-length pairs.
    """
    attr_names = (
        "lengths",
        "names",
        "sequences",
        "sorted_sequences",
        "name_length_pairs",
    )
    digests = {attr: getattr(self, attr).digest for attr in attr_names}
    # Kept on the model as a plain digest field, not as an attribute object.
    digests["sorted_name_length_pairs"] = self.sorted_name_length_pairs_digest
    return digests

level2

level2()

Converts object into dict of level 2 representation of the SequenceCollection.

Source code in refget/models.py
251
252
253
254
255
256
257
258
259
260
261
262
def level2(self):
    """
    Converts object into dict of level 2 representation of the SequenceCollection.

    Every value is the `.value` of the corresponding attribute object.
    """
    # sorted_name_length_pairs is left out on purpose: transient attributes
    # were removed from the level 2 representation.
    attr_names = (
        "lengths",
        "names",
        "sequences",
        "sorted_sequences",
        "name_length_pairs",
    )
    return {attr: getattr(self, attr).value for attr in attr_names}