By digoal
The PostgreSQL GiST index supports distance sorting queries, some of which have three requirements:
1) Order by distance
2) Limited distance range
3) The number of records returned by limit
create extension postgis;
create table t_pos(
id int primary key,
pos geometry
);
insert into t_pos
select * from (
select id,
ST_SetSRID(
ST_Point( round((random()*(135.085831-73.406586)+73.406586)::numeric,6),
round((random()*(53.880950-3.408477)+3.408477)::numeric,6)
),
4326
) as pos
from generate_series(1,1000000000) t(id)
) t
order by st_geohash(pos,15);
create index idx_t_pos_1 on t_pos using gist(pos);
select *,
st_distancespheroid(pos, st_setsrid(st_makepoint(120,50),4326), 'SPHEROID["WGS84",6378137,298.257223563]') as dist
from t_pos
where
st_distancespheroid(pos, st_setsrid(st_makepoint(120,50),4326), 'SPHEROID["WGS84",6378137,298.257223563]') < 5000
order by pos <-> st_setsrid(st_makepoint(120,50),4326)
limit 100;
Or,
create table t_age(id int, age int);
insert into t_age select generate_series(1,10000000), random()*120;
create index idx_t_age_1 on t_age using gist (age);
select * from t_age
where
(age <-> 25) <1
order by age <-> 25
limit 100000;
When you use the GiST index in both the SQL statements, what will happen if there are less than 100 records of distance less than 5000, or less than 100 thousand records of age gap smaller than 1?
The answer is:
The entire index will be scanned.
This is because the database does not know if there are any records that meet the where condition.
Since the records are returned by distance from near to far, when a record no longer meets the where condition (the distance is equal to or greater than 5000, or the age gap is equal to or greater than 1), there is no need to scan the rest.
So, this is the point where you can optimize the database kernel.
A waste scan may occur in the subquery, such as limit 1000. Only 1 record meets the condition, and then scanning the rest of 999 records is a waste.
select * from (
select * from t_age
order by age <-> 25
limit 1000
) t
where
(age <-> 25) <1 ;
Compare with the following:
postgres=# explain (analyze,verbose,timing,costs,buffers) select * from t_age where (age <-> 25) <1 order by age <-> 25 limit 100000;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------------------
Limit (cost=0.36..3572.19 rows=100000 width=12) (actual time=0.169..10757.930 rows=83553 loops=1)
Output: id, age, ((age <-> 25))
Buffers: shared hit=9525191
-> Index Scan using idx_t_age_1 on public.t_age (cost=0.36..119061.20 rows=3333333 width=12) (actual time=0.167..10750.016 rows=83553 loops=1)
Output: id, age, (age <-> 25)
Order By: (t_age.age <-> 25)
Filter: ((t_age.age <-> 25) < 1)
Rows Removed by Filter: 9916447
Buffers: shared hit=9525191
Planning Time: 0.153 ms
Execution Time: 10762.824 ms
(11 rows)
postgres=# explain (analyze,verbose,timing,costs,buffers) select * from (
select * from t_age
order by age <-> 25
limit 100000
) t
where
(age <-> 25) <1 ;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------
Subquery Scan on t (cost=0.36..1354.35 rows=33333 width=8) (actual time=0.146..144.873 rows=83553 loops=1)
Output: t.id, t.age
Filter: ((t.age <-> 25) < 1)
Rows Removed by Filter: 16447
Buffers: shared hit=95549
-> Limit (cost=0.36..901.97 rows=100000 width=12) (actual time=0.144..133.330 rows=100000 loops=1)
Output: t_age.id, t_age.age, ((t_age.age <-> 25))
Buffers: shared hit=95549
-> Index Scan using idx_t_age_1 on public.t_age (cost=0.36..90161.39 rows=10000000 width=12) (actual time=0.143..124.430 rows=100000 loops=1)
Output: t_age.id, t_age.age, (t_age.age <-> 25)
Order By: (t_age.age <-> 25)
Buffers: shared hit=95549
Planning Time: 0.108 ms
Execution Time: 148.951 ms
(14 rows)
create or replace function ff(int, int, int) returns setof t_age as $$
declare
v t_age;
i int := 0;
begin
set enable_seqscan=off;
set enable_indexscan=on;
set enable_bitmapscan=off;
for v in
select * from t_age order by age <-> $1
loop
if (v.age <-> $1) >= $2 or (i) >= $3 then return; end if;
i := i+1;
return next v;
end loop;
return;
end;
$$ language plpgsql strict;
postgres=# explain (analyze,verbose,timing,costs,buffers) select * from ff(25,1,1000000);
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------
Function Scan on public.ff (cost=0.22..0.41 rows=1000 width=8) (actual time=159.203..164.049 rows=83553 loops=1)
Output: id, age
Function Call: ff(25, 1, 1000000)
Buffers: shared hit=79848
Planning Time: 0.042 ms
Execution Time: 168.601 ms
(6 rows)
Support filter recognition in sorting the scan to avoid scanning all records when not meeting the limit number.
How to Use FIO to Test the IO Performance of ECS Local SSD and ESSD (Part 1)
digoal - January 21, 2021
digoal - December 23, 2020
digoal - February 14, 2021
digoal - January 19, 2021
digoal - January 21, 2021
digoal - December 11, 2019
Alibaba Cloud PolarDB for PostgreSQL is an in-house relational database service 100% compatible with PostgreSQL and highly compatible with the Oracle syntax.
Learn MoreLeverage cloud-native database solutions dedicated for FinTech.
Learn MoreMigrate your legacy Oracle databases to Alibaba Cloud to save on long-term costs and take advantage of improved scalability, reliability, robust security, high performance, and cloud-native features.
Learn MoreMigrating to fully managed cloud databases brings a host of benefits including scalability, reliability, and cost efficiency.
Learn MoreMore Posts by digoal