Source code for drugforge.data.metadata.master_seqres.fasta_to_seqres

import argparse
from pathlib import Path

from Bio import SeqIO
from Bio.SeqUtils import seq3


################################################################################
[docs] def get_args(): parser = argparse.ArgumentParser(description="") parser.add_argument( "-i", "--in_file", required=True, type=Path, help="Input FASTA file." ) parser.add_argument( "-o", "--out_file", required=True, type=Path, help="Output YAML file." ) return parser.parse_args()
[docs] def main(): args = get_args() seqres_str = "SEQRES: |\n" for r in SeqIO.parse(args.in_file.open("r"), "fasta"): # Based on the format Andre used in Slack rec_chains = r.description.split("|")[1] rec_chains = [c.strip(",") for c in rec_chains.split()[1:]] # Number of residues in the chain(s) rec_len = len(r.seq) # Conver to 3-letter codes and split up rec_seq = seq3(r.seq) rec_residues = [rec_seq[i : i + 3].upper() for i in range(0, len(rec_seq), 3)] # Build record SEQRES string rec_str = "" line_res = [rec_residues[i : i + 13] for i in range(0, len(rec_residues), 13)] for i, res_group in enumerate(line_res): rec_str += " SEQRES " rec_seq += " " * (3 - len(str(i))) + str(i) rec_str += " {}" rec_str += " " + " " * (3 - len(str(rec_len))) + str(rec_len) rec_str += " " + " ".join(res_group) rec_str += "\n" rec_str = "".join([rec_str.format(*([c] * len(line_res))) for c in rec_chains]) seqres_str += rec_str with args.out_file.open("w") as fp: fp.write(seqres_str)
if __name__ == "__main__": main()