In [2]:
import pandas
import numpy as np
import sklearn.cluster
import distance
In [3]:
# Load the genome table from a local CSV export.
# NOTE(review): the path is absolute and machine-specific, so the file must
# exist at this exact location on the machine running the notebook.
genomes_df = pandas.read_csv(
    filepath_or_buffer="/Users/johncalvo/Downloads/covid_sequences.csv"
)
# Last expression of the cell: preview the first rows of the table.
genomes_df.head()
Out[3]:
In [4]:
# Select the column that holds the nucleotide sequence of each row.
words = genomes_df.loc[:, "Nucleotides sequence"]
# Plain-text dump of the selected column for a quick visual check.
print(words)
In [5]:
# Convert to a plain NumPy array so that positional indexing (including
# indexing with a list of positions) works.
words = np.asarray(words)

# Grid of pair labels: entry [j][i] is the string "i_j", with i varying along a
# row and j selecting the row.
# Positions run from 0 to len(words) - 1 so the grid has one row and one column
# per sequence and the labels agree with the 0-based positions used to index
# `words` (e.g. words[15]).
words_match_index = np.array(
    [
        [str(i) + '_' + str(j) for i in range(len(words))]
        for j in range(len(words))
    ]
)
In [6]:
# Normalized Levenshtein distance for three hand-picked pairs of sequences,
# addressed by their position in `words`.
# NOTE(review): going by the labels printed in the following cells, position 15
# is SARS-CoV-Whu-1, 16 is HIV, 17 is the original SARS-CoV and 1 is the last
# reported genome; this depends on the row order of the loaded CSV, so confirm.
lev_15_16, lev_1_15, lev_15_17 = (
    distance.levenshtein(words[left], words[right], normalized=True)
    for left, right in ((15, 16), (1, 15), (15, 17))
)
In [7]:
# Report similarity as 1 minus the normalized distance, label on the first
# line and value on the second.
print("Similarity between SARS-CoV-Whu-1 genome and HIV", 1 - lev_15_16, sep="\n")
In [8]:
# Report similarity as 1 minus the normalized distance, label on the first
# line and value on the second.
print(
    "Similarity between the last reported genome and SARS-CoV-Whu-1",
    1 - lev_1_15,
    sep="\n",
)
In [9]:
# Report similarity as 1 minus the normalized distance, label on the first
# line and value on the second.
print(
    "Similarity between SARS-CoV-Whu-1 genome and SARS-CoV original",
    1 - lev_15_17,
    sep="\n",
)
In [22]:
# Pairwise similarity matrix: entry [r][c] is the negated normalized
# Levenshtein distance between words[c] and words[r], so values closer to 0
# mean more alike.
# The distance is evaluated for every ordered pair, i.e. len(words) ** 2 calls.
lev_similarity = np.negative(
    np.array(
        [
            [distance.levenshtein(first, second, normalized=True) for first in words]
            for second in words
        ]
    )
)
In [ ]:
Comments
Post a Comment