By Xiaosen
Released by ELK Geek
"Search by Image" is a relatively common feature in shopping guide websites, and there are many ways to implement it, such as "Hash fingerprint and Hamming distance calculation" and "feature vector and Milvus". In actual scenarios, however, it is difficult to achieve quickness, precision, and simplicity.
This topic describes how to use the Alibaba Cloud Elasticsearch vector search plug-in (aliyun-knn), based on Elasticsearch 6.7. With this plug-in, each image is represented as a 512-dimensional feature vector.
The aliyun-knn plug-in is not available for self-managed Elasticsearch clusters. If you run your own cluster, we recommend open-source Elasticsearch 7.x with the fast-elasticsearch-vector-scoring plug-in instead.
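For reference, open-source Elasticsearch 7.3 and later can also do vector similarity without any plug-in, using the built-in dense_vector field with a script_score query. A minimal sketch follows; note that this is the stock Elasticsearch API rather than the aliyun-knn syntax used in the rest of this topic, and the 512-element query vector is abbreviated:

PUT images_v2
{
  "mappings": {
    "properties": {
      "feature": { "type": "dense_vector", "dims": 512 }
    }
  }
}

GET images_v2/_search
{
  "query": {
    "script_score": {
      "query": { "match_all": {} },
      "script": {
        "source": "cosineSimilarity(params.query_vector, 'feature') + 1.0",
        "params": { "query_vector": [0.06, 0.12, ..., 0.03] }
      }
    }
  }
}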
# Create an image index
PUT images_v2
{
  "aliases": {
    "images": {}
  },
  "settings": {
    "index.codec": "proxima",
    "index.vector.algorithm": "hnsw",
    "index.number_of_replicas": 1,
    "index.number_of_shards": 3
  },
  "mappings": {
    "_doc": {
      "properties": {
        "feature": {
          "type": "proxima_vector",
          "dim": 512
        },
        "relation_id": {
          "type": "keyword"
        },
        "image_path": {
          "type": "keyword"
        }
      }
    }
  }
}
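Once the index exists, each image becomes one document whose feature field holds the 512-dimensional vector. A minimal sketch of writing a single document by hand (the ID, relation_id, path, and the abbreviated vector are placeholder values; the train.py script later in this topic performs this step in bulk):

PUT images_v2/_doc/1
{
  "relation_id": "1000",
  "image_path": "images/demo.jpg",
  "feature": [0.06, 0.12, ..., 0.03]
}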
GET images/_search
{
  "query": {
    "hnsw": {
      "feature": {
        "vector": [255, ..., 255],
        "size": 3,
        "ef": 1
      }
    }
  },
  "from": 0,
  "size": 20,
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "collapse": {
    "field": "relation_id"
  },
  "_source": {
    "includes": [
      "relation_id",
      "image_path"
    ]
  }
}
extract_cnn_vgg16_keras.py
# -*- coding: utf-8 -*-
# Author: yongyuan.name
import numpy as np
from numpy import linalg as LA
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


class VGGNet:
    def __init__(self):
        # weights: 'imagenet'
        # pooling: 'max' or 'avg'
        # input_shape: (width, height, 3), width and height should be >= 48
        self.input_shape = (224, 224, 3)
        self.weight = 'imagenet'
        self.pooling = 'max'
        self.model = VGG16(weights=self.weight,
                           input_shape=self.input_shape,
                           pooling=self.pooling,
                           include_top=False)
        # Warm up the model with a dummy prediction
        self.model.predict(np.zeros((1, 224, 224, 3)))

    def extract_feat(self, img_path):
        """
        Use the VGG16 model to extract features.
        Returns an L2-normalized feature vector.
        """
        img = image.load_img(img_path, target_size=(self.input_shape[0], self.input_shape[1]))
        img = image.img_to_array(img)
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        feat = self.model.predict(img)
        norm_feat = feat[0] / LA.norm(feat[0])
        return norm_feat
# Obtain the image feature
from extract_cnn_vgg16_keras import VGGNet

model = VGGNet()
file_path = "./demo.jpg"
queryVec = model.extract_feat(file_path)
feature = queryVec.tolist()
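As a quick sanity check (a sketch, not part of the original pipeline): because the mapping declares "dim": 512 and extract_feat L2-normalizes its output, the resulting list should have 512 elements with unit norm:

import numpy as np

assert len(feature) == 512       # matches "dim": 512 in the index mapping
print(np.linalg.norm(queryVec))  # prints ~1.0, since extract_feat normalizes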
helper.py
import re
import urllib.request


def strip(path):
    """
    Remove characters that are not allowed in Windows file and folder names.
    :param path:
    :return:
    """
    path = re.sub(r'[?\\*|"<>:/]', '', str(path))
    return path


def getfilename(url):
    """
    Obtain the file name from the last segment of a URL.
    :param url:
    :return:
    """
    filename = url.split('/')[-1]
    filename = strip(filename)
    return filename


def urllib_download(url, filename):
    """
    Download the file at url and save it as filename.
    :param url:
    :param filename:
    :return:
    """
    return urllib.request.urlretrieve(url, filename)
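A brief usage sketch of these helpers (the URL is a placeholder):

from helper import getfilename, urllib_download

url = "http://example.com/photos/demo.jpg"
filename = getfilename(url)  # "demo.jpg", with characters illegal on Windows stripped
urllib_download(url, "./images/" + filename)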
train.py
# coding=utf-8
import mysql.connector
import os
from helper import urllib_download, getfilename
from elasticsearch5 import Elasticsearch, helpers
from extract_cnn_vgg16_keras import VGGNet

model = VGGNet()
http_auth = ("elastic", "123455")
es = Elasticsearch("http://127.0.0.1:9200", http_auth=http_auth)
mydb = mysql.connector.connect(
    host="127.0.0.1",  # Database host IP address
    user="root",       # Database username
    passwd="123456",   # Database password
    database="images"
)
mycursor = mydb.cursor()
image_path = "./images/"


def get_data(page=1):
    # Read one page of image records from MySQL
    page_size = 20
    offset = (page - 1) * page_size
    sql = """
    SELECT id, relation_id, photo FROM images LIMIT {0},{1}
    """
    mycursor.execute(sql.format(offset, page_size))
    myresult = mycursor.fetchall()
    return myresult


def train_image_feature(myresult):
    indexName = "images"
    photo_path = "http://domain/{0}"  # replace "domain" with your image server's domain name
    actions = []
    for x in myresult:
        id = str(x[0])
        relation_id = x[1]
        # photo = x[2].decode(encoding="utf-8")
        photo = x[2]
        full_photo = photo_path.format(photo)
        filename = image_path + getfilename(full_photo)
        if not os.path.exists(filename):
            try:
                urllib_download(full_photo, filename)
            except BaseException as e:
                print("Failed to download image {1} for id {0}".format(id, full_photo))
                continue
        if not os.path.exists(filename):
            continue
        try:
            feature = model.extract_feat(filename).tolist()
            action = {
                "_op_type": "index",
                "_index": indexName,
                "_type": "_doc",
                "_id": id,
                "_source": {
                    "relation_id": relation_id,
                    "feature": feature,
                    "image_path": photo
                }
            }
            actions.append(action)
        except BaseException as e:
            print("Failed to extract features from image {1} for id {0}".format(id, full_photo))
            continue
    # print(actions)
    succeed_num = 0
    for ok, response in helpers.streaming_bulk(es, actions):
        if not ok:
            print(ok)
            print(response)
        else:
            succeed_num += 1
    print("Updated {0} records in this batch".format(succeed_num))
    es.indices.refresh(indexName)


page = 1
while True:
    print("Processing page {0}".format(page))
    myresult = get_data(page=page)
    if not myresult:
        print("No more data. Exiting.")
        break
    train_image_feature(myresult)
    page += 1
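The article does not publish the MySQL schema, but get_data only reads three columns. A minimal sketch of a compatible table (column types and lengths are assumptions):

import mysql.connector

conn = mysql.connector.connect(host="127.0.0.1", user="root",
                               passwd="123456", database="images")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS images (
        id INT PRIMARY KEY AUTO_INCREMENT,  -- used as the Elasticsearch _id
        relation_id VARCHAR(64),            -- groups several photos of one item
        photo VARCHAR(255)                  -- image path on the photo host
    )
""")
conn.commit()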
import requests
import json
import os
import time
from elasticsearch5 import Elasticsearch
from extract_cnn_vgg16_keras import VGGNet
from flask import request  # this snippet is assumed to run inside a Flask request handler

model = VGGNet()
http_auth = ("elastic", "123455")
es = Elasticsearch("http://127.0.0.1:9200", http_auth=http_auth)

# Save the uploaded image
upload_image_path = "./runtime/"
upload_image = request.files.get("image")
upload_image_type = upload_image.content_type.split('/')[-1]
file_name = str(time.time())[:10] + '.' + upload_image_type
file_path = upload_image_path + file_name
upload_image.save(file_path)
# Calculate the feature vector of the image
queryVec = model.extract_feat(file_path)
feature = queryVec.tolist()
# Delete the image
os.remove(file_path)
# Query Elasticsearch by feature vector
body = {
    "query": {
        "hnsw": {
            "feature": {
                "vector": feature,
                "size": 5,
                "ef": 10
            }
        }
    },
    # "collapse": {
    #     "field": "relation_id"
    # },
    "_source": {"includes": ["relation_id", "image_path"]},
    "from": 0,
    "size": 40
}
indexName = "images"
res = es.search(index=indexName, body=body)
# Filter out low-scoring results as appropriate; in our tests, results with
# scores of 0.65 or higher met the requirements.
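To apply that threshold, a minimal post-filtering sketch over the response (the 0.65 value comes from the tests mentioned above; field names follow the _source includes in the query):

threshold = 0.65
matches = [hit["_source"] for hit in res["hits"]["hits"]
           if hit["_score"] >= threshold]
for item in matches:
    print(item["relation_id"], item["image_path"])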
requirements.txt
mysql_connector_repackaged
elasticsearch
Pillow
tensorflow
requests
pandas
Keras
numpy
From a user experience perspective, speed and accuracy are what make a product feel "easy to use". The "search by image" engine built in four simple steps with Alibaba Cloud Elasticsearch vector search (aliyun-knn) is not just easy to use; for the user it is a single-step operation, which is a further advantage.