×
Community Blog PostgreSQL Vector Similarity Recommendation – PASE

PostgreSQL Vector Similarity Recommendation – PASE

This article explains PASE with specific examples and code.

By digoal

Background

Feature Vector Table for the Content:

create table tbl_v_box_n (vid int primary key, vec float4[], score float4);

Feature Vectors Corresponding to the Content: N-dimension (tag capacity). Attach M tags to each piece of content on average. The tag value is a random value within 100.

create or replace function gen_rand_vec(int, int) returns float4[] as $$      
  select array(select (case when random()*$1 <=$2 then (random()*100)::int else 0 end)::float4  from generate_series(1,$1));      
$$ language sql strict;      

Insert 1 million records and 512 tags (in the vector dimension). Each piece of content is weighted in 10 random dimensions and 0 in other dimensions (5,000 central points and 5 in positive and negative random offsets.)

do language plpgsql $$    
declare    
  x float4[];    
begin    
  for i in 1..5000 loop    
     x := gen_rand_vec(512,10);    
     for y in 1..200 loop    
       insert into tbl_v_box_n select (i-1)*200+y, array(select (case when t=0 then 0 else t + 5 - round((10*random())::numeric,2) end)::float4 from unnest(x) t) t;    
     end loop;    
  end loop;    
end;    
$$;  

Create the PASE plug-in:

create extension pase;      

Create a vector index:

CREATE INDEX idx_tbl_v_box_n_1 ON tbl_v_box_n          
USING          
  pase_hnsw(vec)          
WITH          
  (dim = 512, base_nb_num = 16, ef_build = 40, ef_search = 200, base64_encoded = 0);          
    
或    
    
CREATE INDEX idx_tbl_v_box_n_2 ON tbl_v_box_n          
USING          
  pase_ivfflat(vec)          
WITH          
  (clustering_type = 1, distance_type = 0, dimension = 512, base64_encoded = 0, clustering_params = "10,5000");          

For 1 million records, the HNSW index creation takes 1,500 seconds, and IVFFlat takes 1,091 seconds.

Note: The IVFFlat index needs to cluster the central points, so there must be enough data in the table to build the central points when creating the index. Otherwise, the clustered central points may deviate from the future data, leading to performance problems.

If there is a large amount of data to train the central points in advance, the external clustering method can be used. /data/xxxfile is the file of clustered central points trained in advance.

CREATE INDEX idx_tbl_v_box_n_2 ON tbl_v_box_n
USING
  pase_ivfflat(vec)
WITH
  (clustering_type = 0, distance_type = 0, dimension = 512, base64_encoded = 0, clustering_params = "/data/xxxfile");

Create an hll plug-in to record ids hash values of the read content:

create extension hll;      
      
postgres=>  select hll_add(hll_empty(), hll_hash_integer(1));      
         hll_add                
--------------------------      
 \x128b7f8895a3f5af28cafe      
(1 row)      
      
postgres=> select '\x128b7f8895a3f5af28cafe'::hll = hll_add('\x128b7f8895a3f5af28cafe'::hll, hll_hash_integer(1));      
 ?column?       
----------      
 t      
(1 row)      
      
postgres=> select '\x128b7f8895a3f5af28cafe'::hll = hll_add('\x128b7f8895a3f5af28cafe'::hll, hll_hash_integer(2));      
 ?column?       
----------      
 f      
(1 row)      

The user preference tag has 512 dimensions, among which 50 dimensions have a value.

select gen_rand_vec(512,50);      
      
{0,0,0,0,0,0,0,0,0,0,0,0,75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,13,72,0,0,0,0,0,0,0,0,30,0,0,0,0,0,0,0,0,0,0,0,92,0,0,0,0,0,45,0,4,0,0,0,0,0,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,47,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,94,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,0,0,90,0,84,0,0,0,0,0,0,0,0,0,0,62,0,0,0,0,0,0,44,0,0,0,0,0,0,0,0,0,4,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87,0,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,78,0,27,0,0,0,0,0,57,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,73,0,0,0,4,0,0,77,0,74,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,83,0,0,0,0,0,0,0,64,0,85,0,0,0,0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,82,22,0,0,0,0,0,0,48,58,86,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0}      

Forward preferred content to the user and filter the read content:

explain (analyze,verbose,timing,costs,buffers)      
SELECT vid,         
vec <?>  '0,0,0,0,0,0,0,0,0,0,0,0,75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,13,72,0,0,0,0,0,0,0,0,30,0,0,0,0,0,0,0,0,0,0,0,92,0,0,0,0,0,45,0,4,0,0,0,0,0,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,47,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,94,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,0,0,90,0,84,0,0,0,0,0,0,0,0,0,0,62,0,0,0,0,0,0,44,0,0,0,0,0,0,0,0,0,4,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87,0,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,78,0,27,0,0,0,0,0,57,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,73,0,0,0,4,0,0,77,0,74,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,83,0,0,0,0,0,0,0,64,0,85,0,0,0,0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,82,22,0,0,0,0,0,0,48,58,86,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0:40:0'::pase  as distance          
FROM tbl_v_box_n         
where       
'\x128b7f8895a3f5af28cafe'::hll <> hll_add('\x128b7f8895a3f5af28cafe'::hll, hll_hash_integer(vid))      
ORDER BY          
vec <?> '0,0,0,0,0,0,0,0,0,0,0,0,75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,13,72,0,0,0,0,0,0,0,0,30,0,0,0,0,0,0,0,0,0,0,0,92,0,0,0,0,0,45,0,4,0,0,0,0,0,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,47,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,94,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,0,0,90,0,84,0,0,0,0,0,0,0,0,0,0,62,0,0,0,0,0,0,44,0,0,0,0,0,0,0,0,0,4,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87,0,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,78,0,27,0,0,0,0,0,57,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,73,0,0,0,4,0,0,77,0,74,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,83,0,0,0,0,0,0,0,64,0,85,0,0,0,0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,82,22,0,0,0,0,0,0,48,58,86,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0:40:0'::pase        
LIMIT 100;       
explain (analyze,verbose,timing,costs,buffers)      
select array_agg(vid) from (      
SELECT vid       
FROM tbl_v_box_n         
where       
'\x128b7f8895a3f5af28cafe'::hll <> hll_add('\x128b7f8895a3f5af28cafe'::hll, hll_hash_integer(vid))      
ORDER BY          
vec <?> '0,0,0,0,0,0,0,0,0,0,0,0,75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,13,72,0,0,0,0,0,0,0,0,30,0,0,0,0,0,0,0,0,0,0,0,92,0,0,0,0,0,45,0,4,0,0,0,0,0,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,47,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,94,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,0,0,90,0,84,0,0,0,0,0,0,0,0,0,0,62,0,0,0,0,0,0,44,0,0,0,0,0,0,0,0,0,4,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87,0,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,78,0,27,0,0,0,0,0,57,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,73,0,0,0,4,0,0,77,0,74,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,83,0,0,0,0,0,0,0,64,0,85,0,0,0,0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,82,22,0,0,0,0,0,0,48,58,86,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0:40:0'::pase        
LIMIT 100) t;       
  
或 ivfflat  
  
explain (analyze,verbose,timing,costs,buffers)      
select array_agg(vid) from (      
SELECT vid       
FROM tbl_v_box_n         
where       
'\x128b7f8895a3f5af28cafe'::hll <> hll_add('\x128b7f8895a3f5af28cafe'::hll, hll_hash_integer(vid))      
ORDER BY          
vec <#> '0,0,0,0,0,0,0,0,0,0,0,0,0,5.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50.68,0,62.11,0,0,0,0,0,0,0,0,0,0,78.16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,46.44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,81.15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.68,0,71.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15.87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,76.17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,69.94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0:1:0'::pase        
LIMIT 100) t;     

Description:

https://www.alibabacloud.com/help/en/doc-detail/147837.html

<?> is a PASE-type operator, which is used to calculate similarity between the vectors to the left and right of a specific element. The vector to the left must use the float4[] data type, and the vector to the right must use the PASE data type.

The PASE data type is defined in the PASE plug-in and can contain up to three constructors. Take the float4[], 0, 1 part in the preceding third as an example.

  • The first parameter specifies the vector to the right with the float4[] data type.
  • The second parameter does not serve a special purpose and can be set to 0.
  • The third parameter specifies the similarity calculation method, where the value 0 represents the Euclidean distance method and the value 1 represents the dot product method. A dot product is also called an inner product.

The vector to the left must have the same number of dimensions as the vector to the right. Otherwise, the system reports similarity calculation errors.

<#> is an operator that is used by IVFFlat indexes.

You must execute the ORDER BY statement to make an IVFFlat index take effect. An IVFFlat index allows vectors to be sorted in ascending order.

The PASE data type requires three parameters to specify a vector. These parameters are separated by colons (:).

For example, 1,1,1:10:0 includes three parameters:

  • The first parameter specifies the vector to query.
  • The second parameter specifies the query efficiency of IVFFlat with a value range of (0, 1000], in which a larger value indicates higher query accuracy but lower query performance.
  • The third parameter specifies the vector similarity calculation method, where the value 0 represents the Euclidean distance method and the value 1 represents the dot product method. A dot product is also called an inner product. The dot product method requires the normalization of vectors. The order of dot products is opposite to the order of Euclidean distances.

When using the IVFFlat index for querying, you can adjust the value of the second parameter to improve performance. However, you need to pay attention to the accuracy or whether the number of returned records meets the requirements. If the number is within the specified ef_search range but does not meet the requirement, you need to widen the ef_search range.

Stress Testing:

create or replace function test(int) returns void as $$      
declare      
begin      
  for i in 1..$1 loop      
perform array_agg(vid) from (      
SELECT vid       
FROM tbl_v_box_n         
where       
'\x128b7f8895a3f5af28cafe'::hll <> hll_add('\x128b7f8895a3f5af28cafe'::hll, hll_hash_integer(vid))      
ORDER BY          
vec <?> '0,0,0,0,0,0,0,0,0,0,0,0,75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,13,72,0,0,0,0,0,0,0,0,30,0,0,0,0,0,0,0,0,0,0,0,92,0,0,0,0,0,45,0,4,0,0,0,0,0,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,47,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,94,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,0,0,90,0,84,0,0,0,0,0,0,0,0,0,0,62,0,0,0,0,0,0,44,0,0,0,0,0,0,0,0,0,4,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87,0,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,78,0,27,0,0,0,0,0,57,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,73,0,0,0,4,0,0,77,0,74,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,83,0,0,0,0,0,0,0,64,0,85,0,0,0,0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,82,22,0,0,0,0,0,0,48,58,86,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0:40:0'::pase        
LIMIT 100) t;      
  end loop;      
return;      
end;      
$$ language plpgsql strict;      
vi test.sql      
select test(100);      
      
pgbench -M prepared -n -r -P 1 -f ./test.sql -c 20 -j 20 -T 120       
progress: 1.0 s, 18.0 tps, lat 683.974 ms stddev 81.527      
progress: 2.0 s, 21.0 tps, lat 722.688 ms stddev 100.880      
progress: 3.0 s, 24.0 tps, lat 714.391 ms stddev 102.946      
progress: 4.0 s, 28.0 tps, lat 860.122 ms stddev 141.135      
progress: 5.0 s, 24.0 tps, lat 787.278 ms stddev 58.100      
progress: 6.0 s, 21.0 tps, lat 786.168 ms stddev 91.312      
progress: 7.0 s, 30.0 tps, lat 796.017 ms stddev 134.557      
progress: 8.0 s, 24.0 tps, lat 791.886 ms stddev 84.715      
progress: 9.0 s, 23.0 tps, lat 775.147 ms stddev 103.539      
progress: 10.0 s, 27.0 tps, lat 810.352 ms stddev 66.641      
progress: 11.0 s, 30.0 tps, lat 780.018 ms stddev 77.176      
progress: 12.0 s, 23.0 tps, lat 773.011 ms stddev 65.566      
progress: 13.0 s, 26.0 tps, lat 749.862 ms stddev 47.076      
progress: 14.0 s, 24.0 tps, lat 805.899 ms stddev 112.102      
progress: 15.0 s, 27.0 tps, lat 788.106 ms stddev 85.193      
progress: 16.0 s, 24.0 tps, lat 793.259 ms stddev 83.253      
progress: 17.0 s, 26.0 tps, lat 779.958 ms stddev 85.379      
progress: 18.0 s, 26.0 tps, lat 765.178 ms stddev 78.601      

A 4-core machine with QPS about 2,300

Update Stress Testing:

with a as (  
select array(select (case when t=0 then 0 else t + 5 - round((10*random())::numeric,2) end)::float4 from unnest(vec) t) as vv from tbl_v_box_n where vid=1  
)  
update tbl_v_box_n set vec=a.vv from a where vid=1;  
vi up.sql  
  
\set vid random(1,1000000)  
with a as (  
select array(select (case when t=0 then 0 else t + 5 - round((10*random())::numeric,2) end)::float4 from unnest(vec) t) as vv from tbl_v_box_n where vid=:vid  
)  
update tbl_v_box_n set vec=a.vv from a where vid=:vid;  
  
pgbench -M prepared -n -r -P 1 -f ./up.sql -c 20 -j 20 -T 120       

Update method of the user preference tag:

The read content affects user preferences:

now_vec=(vec_old*cnt_old+vec_new*cnt_new)/(cnt_old+new_cnt)      
      
now_cnt=cnt_old+cnt_new      

vec_old: The current feature vector of the user

cnt_old: The number of pieces of content the user has read. This value is accumulated, so it increases. You can adjust this value to reduce the impact of old features.

vec_new: The feature vector central point of the user's newly read content

cnt_new: The number of pieces of content that have been recently read by the user

Process: The content ID is obtained based on the similarity between the user vector and the content vector. After the user reads several pieces of content, query the feature vectors of these content based on IDs, calculate the vector central point (vec_new), and merge it into the user feature vector (vec_old).

Age method: (to prevent user preferences from being framed) Decrease the value of cnt_old when calculating new feature vectors. This is equivalent to reducing the overall weight of the user's old feature vectors, which is like decreasing the denominator (cnt_old + new_cnt). The impact of the old value on the result is reduced when merging the new feature vectors.

The downgrade policy for cnt_old: In addition to adjusting based on the time, you can make decisions based on the user's recent skip rate. If the skip rate increases abnormally, you can also reduce the value of cnt_old.

Example of Dynamic SQL Execution:

create or replace function get_v_ids(
i_uid int8,
i_ef int,
i_limit int, 
i_tbl_prefix text, 
i_suffix text, 
i_idx_ops text
) returns int8[] as $$
declare
  u_hll_w1 hll;
  u_hll_w2 hll;
  u_hll_w7 hll;
  v_vec text;
  res int8[];
  query text;
begin
  set local enable_seqscan=off;
  set local enable_bitmapscan=off;
  set local enable_indexscan=on;
  set local enable_indexonlyscan=on;
  select coalesce(w1,hll_empty()),coalesce(w2,hll_empty()),coalesce(w7,hll_empty()) 
    into u_hll_w1,u_hll_w2,u_hll_w7 from video_user_read where user_id=i_uid; 
  select rtrim(ltrim(vec::text,'{'),'}') into v_vec from u_like_tag where user_id = i_uid; 
  query := format($_$
select array_agg(v_id order by video_weight desc) from
(
  select v_id,video_weight from %I v 
    where %L::hll || hll_hash_bigint(v.v_id) <> %L::hll 
    or %L::hll || hll_hash_bigint(v.v_id) <> %L::hll 
    or %L::hll || hll_hash_bigint(v.v_id) <> %L::hll 
  order by vec %s (%L)::pase 
  limit %s
) t
$_$,
i_tbl_prefix||'_'||i_suffix, 
u_hll_w1,u_hll_w1,
u_hll_w2,u_hll_w2,
u_hll_w7,u_hll_w7,
i_idx_ops,
v_vec||':'||i_ef||':0',
i_limit
);
raise notice '%', query;
execute query into res;
return res;
end;
$$ language plpgsql strict;


select get_v_ids(123::int8, 1, 100, 'tbl', '1', '<#>');
0 0 0
Share on

digoal

282 posts | 24 followers

You may also like

Comments

digoal

282 posts | 24 followers

Related Products