How to optimize complex sorting?

221 Views Asked by At

I'm building a language learning platform, here are some tables:

create table users
(  
    id                  bigserial primary key,
    ...
)

create table users_languages
(
    id                   bigserial primary key,
    user_id              bigint    not null
        constraint users_languages_user_id_foreign references users,
    level                varchar(255)         not null,
    lang_code            varchar(10)          not null,
    ...
)

So the users_languages keeps all the langs a user knows, level could be NATIVE or LEARN, lang_code is the ISO code.

I'm building a recommended search feature for a current user, depending on the languages he knows, so the result must be ordered by the following rules:

  1. other user NATIVE lang = current user LEARN lang AND other user LEARN lang = current user NATIVE lang
  2. other user NATIVE lang = current user LEARN lang
  3. other user has at least one of current user langs

Here is what I figured out so far:

           nativeCode := current user native lang_code
           langCodes := current user learn lang_codes


            WITH user_lang_priority AS NOT MATERIALIZED (
                SELECT l.user_id, MIN(CASE 
                    WHEN l.level = 'NATIVE' AND l.lang_code IN(:langCodes) THEN
                      CASE
                        WHEN EXISTS (
                           SELECT ll.id FROM users_languages ll
                           WHERE ll.level != 'NATIVE'
                           AND ll.lang_code = :nativeCode
                           AND ll.user_id = l.user_id
                        ) THEN 1 ELSE 2
                      END
                    WHEN l.lang_code IN(:langCodes) THEN 3
                END) AS priority
                FROM users_languages l
                GROUP BY l.user_id
            )
            SELECT u.*
            FROM users u
            INNER JOIN user_lang_priority lp ON u.id = lp.user_id
            GROUP BY u.id
            ORDER BY lp.priority ASC, u.id DESC

The query seems to be returning the correct results, but it takes up to 2 seconds on a table with about 30k rows. Is there a way to speed it up?

UPD: the query plan

Sort  (cost=777435.38..777508.24 rows=29145 width=762)
"  Sort Key: (min(CASE WHEN (((l.level)::text = 'NATIVE'::text) AND ((l.lang_code)::text = ANY ('{fin,fre,ger}'::text[]))) THEN CASE WHEN (hashed SubPlan 2) THEN 1 ELSE 2 END WHEN ((l.lang_code)::text = ANY ('{rus,fre,ger}'::text[])) THEN 3 ELSE NULL::integer END)), u.id DESC"
  ->  Hash Join  (cost=1677.14..775274.14 rows=29145 width=762)
        Hash Cond: (l.user_id = u.id)
        ->  GroupAggregate  (cost=0.29..773229.32 rows=29145 width=12)
              Group Key: l.user_id
              ->  Index Scan using users_languages_user_id_index on users_languages l  (cost=0.29..3005.29 rows=84970 width=19)
              SubPlan 2
                ->  Bitmap Heap Scan on users_languages ll  (cost=299.15..1487.36 rows=14990 width=8)
                      Recheck Cond: ((lang_code)::text = 'eng'::text)
                      Filter: ((level)::text <> 'NATIVE'::text)
                      ->  Bitmap Index Scan on users_languages_lang_code_index  (cost=0.00..295.40 rows=22814 width=0)
                            Index Cond: ((lang_code)::text = 'eng'::text)
        ->  Hash  (cost=1253.60..1253.60 rows=33860 width=758)
              ->  Seq Scan on users u  (cost=0.00..1253.60 rows=33860 width=758)
3

There are 3 best solutions below

0
user3402754 On

Consider creating indexes on the following columns:

  1. users_languages.user_id: This should improve the join operation with the users table.
  2. users_languages.level: To optimise the CASE statements.
  3. users_languages.lang_code: to optimise the filtering operation and CASE statements.

After creating these indexes, re-run your query and check the performance. You can update the statistics using the ANALYZE command:


ANALYZE users_languages;
ANALYZE users;

Lastly, if the users_languages table is very large and this query is frequently executed, you might consider using a materialized view to precompute the results.

-- Create the Materialized View
CREATE MATERIALIZED VIEW user_lang_priority_mv AS
SELECT l.user_id, MIN(CASE 
   WHEN l.level = 'NATIVE' AND l.lang_code IN(:langCodes) THEN
     CASE
       WHEN EXISTS (
          SELECT ll.id FROM users_languages ll
          WHERE ll.level != 'NATIVE'
          AND ll.lang_code = :nativeCode
          AND ll.user_id = l.user_id
       ) THEN 1 ELSE 2
     END
   WHEN l.lang_code IN(:langCodes) THEN 3
END) AS priority
FROM users_languages l
GROUP BY l.user_id;

-- Create Index on the Materialized View to further enhance performance.
CREATE INDEX user_lang_priority_mv_idx ON user_lang_priority_mv (user_id, priority);

Depending on how frequently the underlying data changes, you'll need to schedule regular refreshes of the materialized view.

0
Fatemeh Hosseini On

try following query(view):

CREATE MATERIALIZED VIEW user_lang_priority AS
SELECT l.user_id, MIN(CASE 
    WHEN l.level = 'NATIVE' AND l.lang_code IN (:langCodes) THEN
        CASE
            WHEN EXISTS (
                SELECT ll.id FROM users_languages ll
                WHERE ll.level != 'NATIVE'
                AND ll.lang_code = :nativeCode
                AND ll.user_id = l.user_id
            ) THEN 1 ELSE 2
        END
    WHEN l.lang_code IN (:langCodes) THEN 3
END) AS priority
FROM users_languages l
GROUP BY l.user_id;
0
ValNik On

If you need search for current user, possible simple query. Here one join and one groupping. I think, they are well supported by indexes.

with current_user_languages as(
  select * from users_languages l
  where user_id=1 -- this is current user Id
)
,cul_intersects as(
select cul.user_id user1, cul.level level1,cul.lang_code code1
  ,l2.user_id user2, l2.level level2,l2.lang_code code2
from current_user_languages cul
left join users_languages l2 
   on l2.lang_code=cul.lang_code and cul.user_id<>l2.user_id
)
,priorities as(
select user1,user2
  ,case when sum(case when level1='LEARN' and level2='NATIVE' then 1 else 0 end)>0 
 and sum(case when level1='NATIVE' and level2='LEARN' then 1 else 0 end)>0 then 1
       when sum(case when level2='NATIVE' and level1='LEARN' then 1 else 0 end)>0 then 2
       when count(*)>0 then 3
   end priority
  ,sum(case when level1='LEARN' and level2='NATIVE' then 1 else 0 end) priority1_1
  ,sum(case when level2='LEARN' and level1='NATIVE' then 1 else 0 end) priority1_2
  ,sum(case when level2='NATIVE' and level1='LEARN' then 1 else 0 end) priority2
  ,count(*) totalIntersect
from cul_intersects
group by user1,user2
)
select user1,user2, priority,priority1,priority2,totalintersect,username 
from priorities p
inner join users u on u.id=p.user2
order by user1,user2

For test data

insert into users values
 (1,'User1')
,(2,'User2')
,(3,'User3')
,(4,'User4')
,(5,'User5')
,(6,'User6')
;
insert into users_languages values
 (1,1,'NATIVE','EN')
,(2,1,'LEARN','FR')
,(3,2,'NATIVE','FR')
,(4,2,'LEARN','EN')
,(5,2,'LEARN','TUR')
,(6,3,'NATIVE','TUR')
,(7,4,'NATIVE','ITA')
,(8,5,'NATIVE','ITA')
,(9,5,'LEARN','FR')
,(10,5,'LEARN','TUR')
,(11,6,'NATIVE','DE')
;

Draft result is

user1 user2 priority priority1 priority2 totalintersect username
1 2 1 1 1 2 User2
1 5 3 0 0 1 User5

Query for all users

with all_intersects as(
select l1.user_id user1, l1.level level1,l1.lang_code code1
  ,l2.user_id user2, l2.level level2,l2.lang_code code2
from users_languages l1
left join users_languages l2 on l2.lang_code=l1.lang_code and l1.user_id<>l2.user_id
)
,priorities as(
select user1,user2
  ,case when sum(case when level1='LEARN' and level2='NATIVE' then 1 else 0 end)>0 
and sum(case when level1='NATIVE' and level2='LEARN' then 1 else 0 end)>0 then 1
       when sum(case when level2='NATIVE' and level1='LEARN' then 1 else 0 end)>0 then 2
       when count(*)>0 then 3
   end priority
  -- this is for clearity
  ,sum(case when level1='LEARN' and level2='NATIVE' then 1 else 0 end) priority1_1
  ,sum(case when level2='LEARN' and level1='NATIVE' then 1 else 0 end) priority1_2
  ,sum(case when level2='NATIVE' and level1='LEARN' then 1 else 0 end) priority2
  ,count(*) totalIntersect
from all_intersects
group by user1,user2
)
select user1,user2, priority,priority1,priority2,totalintersect,username 
from priorities p
inner join users u on u.id=p.user2
order by user1,user2

Result

user1 user2 priority priority1_1 priority1_2 priority2 totalintersect username
1 2 1 1 1 1 2 User2
1 5 3 0 0 0 1 User5
2 1 1 1 1 1 2 User1
2 3 2 1 0 1 1 User3
2 5 3 0 1 0 2 User5
3 2 3 0 1 0 1 User2
3 5 3 0 1 0 1 User5
4 5 3 0 0 0 1 User5
5 1 3 0 0 0 1 User1
5 2 2 1 0 1 2 User2
5 3 2 1 0 1 1 User3
5 4 3 0 0 0 1 User4

fiddle here