-
Notifications
You must be signed in to change notification settings - Fork 0
/
Kevin.py~
95 lines (72 loc) · 3.31 KB
/
Kevin.py~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#coding: utf8
from typing import *
from eliot_main import get_genetic_code, write_csv, read_csv
from Theo import read_flat_file
def get_features(txt):
    """Extract the feature lines from a flat text and return them.

    This function is written by Kévin Merchadou.

    The text is scanned line by line for the header line made of the two
    words ``FEATURES`` and ``Location/Qualifiers``. Every line located after
    the first such header is concatenated, without any separator, into the
    returned string.

    Args:
        txt: flat text with features to extract

    Returns:
        string of features, or an empty string when the text contains no
        features header
    """
    header = "FEATURES Location/Qualifiers"
    lines = txt.split("\n")
    for index, line in enumerate(lines):
        # Collapse whitespace runs before comparing so the header is found
        # whatever padding (spaces, tabs, trailing "\r") surrounds its words.
        if " ".join(line.split()) == header:
            return "".join(lines[index + 1:])
    # No header line: report "no features" instead of failing on an
    # unassigned local variable.
    return ""
def get_genes(features):
    """Return the positions of the chunks of ``features`` starting with "CDS".

    Every " " separator is removed from ``features``; the resulting text is
    then split on " " and the index of each chunk whose first three
    characters are "CDS" is collected.

    NOTE(review): both separators appear here as a single space, which makes
    the second split yield one chunk only. Confirm against the original file
    whether runs of several spaces were intended.

    Args:
        features: text of the features section

    Returns:
        list of chunk indices (int) whose chunk begins with "CDS"
    """
    squeezed = features.replace(" ", "")
    chunks = squeezed.split(" ")
    return [
        position
        for position, chunk in enumerate(chunks)
        if chunk.startswith("CDS")
    ]
# for line in features.split('\n'):
# if line[5:8] == "CDS":
# pos = line.split(" ")[1].split("..")
# product = line+7[22].split("=")[1]
# protein =
# new_cds = {"start": pos[0], "stop" : pos[1], "frame" : pos[0]%3+1 "length": pos[1] - pos[0],"name" : "unknown", "protein" : "xxx", "product" : "unknown"}
# elif line[5] != " " and len(new_cds) > 1:
# list_genes.append(new_cds)
# else:
# new_cds = line[22].split('=')
#def read_gen_bank(filename: str) -> Dict[str, Union[str, List[dict]]]:
# """Parse a GenBank file
# This function is written by Kévin Merchadou.
# Args:
# filename: .gb file to parse
# Returns:
# dictionary of :
# features:
# description: entry title (genbank descriptor DEFINITION)
# type: myBio keywords only - dna, rna, or protein
# data: sequence data only if available otherwise set to ‘xxx’. When the sequence is too large
# the entry does not contain data.
# ID: Identifier (locus)
# length: sequence length
# gbtype: molecule type as described in a genbank entry.
# organism: organism²
# codeTableID: NCBI genetic code table identifier
# list of gene = ORF (dict)
# start: start position (in bp)
# stop: stop position (in bp)
# frame: frame (1, 2, 3, -1, -2, or -3)
# length: gene length (in bp)
# name: gene name if available. By default, ‘unknown’.
# protein: translated protein sequence if available. By default, ‘xxx’.
# product: product name if available. By default, ‘unknown’.
# """
# pass
# Ad-hoc exploration of the features section of "sequence.gb".
# These statements are at module level, so they run whenever the module is
# executed or imported.
features=get_features(read_flat_file("sequence.gb"))
# Remove every " " separator from the features text.
bloc=("").join(features.split(" "))
# NOTE(review): after the join above no " " is left, so this split yields a
# single chunk — confirm the separator literals (runs of several spaces may
# have been intended).
bloc_CDS=bloc.split(" ")
# Take the chunk at index 7 and break it on "/" (presumably one piece per
# qualifier — verify against the input file).
bloc_cleaned=bloc_CDS[7].split("/")
# From piece 9: skip its first 12 characters, keep the text found between the
# first pair of double quotes, then split that text on " ".
a=(bloc_cleaned[9][12:].split('"')[1]).split(" ")
print(a)
# a=(bloc_cleaned[0][17:].strip()).split("..")[1]
# print(a)
# print(int(a[:len(a)-1]))