-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheckebeddings.py
111 lines (83 loc) · 3.31 KB
/
checkebeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
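# USAGE
# Pickle input (--range is still required by the argument parser):
#   python checkebeddings.py --embedding output/embed.pickle --range 100 --tolerance 0.1
#       --unique output/unique.txt --output output/similarities.txt
# XLSX input (identifiers in column A, vectors in column B, matches written to column C):
#   python checkebeddings.py --xlsx data/urls.xlsx --range 100 --tolerance 0.1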
import numpy as np
import argparse
import pickle
import os
from scipy import spatial
import openpyxl
# Command-line interface.
# --embedding and --xlsx select the input source; when both are given the
# script's if/elif gives --embedding precedence.
ap = argparse.ArgumentParser()
ap.add_argument("-e", "--embedding", required=False,
    help="path to embeddings.pickle file")
ap.add_argument("-x", "--xlsx", required=False,
    help="path to XLSX file")
# The XLSX reader iterates range(1, int(range)), i.e. rows 1 .. range-1.
ap.add_argument("-r", "--range", required=True,
    help="exclusive upper bound of XLSX rows to read (rows 1..range-1)")
ap.add_argument("-t", "--tolerance", required=True,
    help="tolerance of image similarity")
ap.add_argument("-o", "--output", required=False,
    help="path to store similar image info")
ap.add_argument("-u", "--unique", required=False,
    help="path to store unique image info")
args = vars(ap.parse_args())
# Fail early with a usage message instead of a NameError further down when no
# input source was supplied.
if not (args["embedding"] or args["xlsx"]):
    ap.error("one of --embedding or --xlsx is required")
def import_from_workbook(col, row):
    """Return the value of the cell at column ``col`` and row ``row``.

    The cell is looked up on the module-level ``sheet`` object, which must
    already be bound when this function is called.
    """
    cell_ref = col + str(row)
    cell = sheet[cell_ref]
    return cell.value
def write_to_workbook(value, col, row):
    """Store ``value`` in column ``col`` of the module-level ``sheet``.

    The target row is ``row + 1``: the caller in this script passes a
    0-based loop index, which is shifted by one to address the sheet.
    """
    target_row = row + 1
    cell_ref = col + str(target_row)
    sheet[cell_ref] = value
# Load the embeddings into ``dict_urls`` (identifier -> vector) and build
# ``embedding``, the ordered list of identifiers the comparison loop indexes.
dict_urls = {}
if args["embedding"]:
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # pickle files from a trusted source.
    with open(args["embedding"], 'rb') as dbfile:
        loaded = pickle.load(dbfile)
    # Presumably the pickle holds a mapping of image name -> vector; verify
    # against the script that produces it. A plain sequence of vectors is
    # also accepted and keyed by position.
    if isinstance(loaded, dict):
        dict_urls = loaded
    else:
        dict_urls = dict(enumerate(loaded))
elif args["xlsx"]:
    path = args["xlsx"]
    wb = openpyxl.load_workbook(path)
    sheet = wb.active
    # Rows 1 .. range-1: column A is the identifier, column B the vector
    # stored as text such as "[0.1 0.2 ...]".
    for vector in range(1, int(args["range"])):
        getstring = import_from_workbook("B", vector).strip('[]')
        # Collapse newlines and repeated blanks so the text parses as a
        # single space-separated list of numbers.
        getstring = " ".join(getstring.split())
        dict_urls[import_from_workbook("A", vector)] = np.fromstring(getstring, sep=' ')
else:
    raise SystemExit("either --embedding or --xlsx must be provided")
embedding = list(dict_urls.keys())
print(embedding)
print(len(embedding))
# Pairwise comparison: every entry is compared with each later entry using
# cosine distance; pairs closer than ``tolerance`` are treated as similar.
tolerance = float(args["tolerance"])
similarity = {}  # embedding mode: index -> identifiers of later similar entries
unique = []      # embedding mode: indices with no later similar entry
# distance = spatial.distance.cosine(embedding['testimages3\(3).jpg'], embedding['testimages3\(7).jpg'] )
# Iterate over every entry, including the last one: it has no later entries
# to compare against, so in embedding mode it is recorded as unique instead
# of being silently dropped.
for i in range(len(embedding)):
    conflicts = []
    similar_url = ""
    for j in range(i + 1, len(embedding)):
        distance = spatial.distance.cosine(dict_urls[embedding[i]], dict_urls[embedding[j]])
        if distance < tolerance:
            if args["embedding"]:
                conflicts.append(embedding[j])
            elif args["xlsx"]:
                similar_url += '; ' + embedding[j]
    if args["embedding"]:
        if conflicts:
            similarity[i] = conflicts
        else:
            unique.append(i)
    elif args["xlsx"] and similar_url:
        # One write per row with the complete match list; rows without any
        # match are left untouched.
        write_to_workbook(similar_url, 'C', i)
# The workbook (``wb``/``path``) only exists when the XLSX branch was taken,
# so saving must be skipped in embedding mode.
if not args["embedding"] and args["xlsx"]:
    wb.save(path)
# with open(args["unique"], mode='wt', encoding='utf-8') as myfile:
# myfile.write('\n'.join(str(u) for u in unique))
# f = open(args["output"], "w")
# for k in similarity.keys():
# f.write("{}:\n{}\n\n".format(k, similarity[k]))
#____________________________________________________________________________________________
# for i in similarity:
# conflicts = []
# query_val = embedding[i]
# for value in embed_values:
# if(abs(query_val-value)<tolerance):
# conflicts.append(embed_values.index(value)+1)
# similarity[i] = conflicts
# f = open("log.txt", "w")
# f.write("{\n")
# for k in similarity.keys():
# f.write("'{}':'{}'\n".format(k, similarity[k]))
# f.write("}")
# f.close()