[#1570] Experimental feature to improve user search using Levenshtein distance calculation.
Improves results for short queries, especially those containing typos.
This commit is contained in:
parent
7aa6c82937
commit
659e15896e
@ -92,6 +92,7 @@ defmodule Pleroma.User do
|
||||
field(:local, :boolean, default: true)
|
||||
field(:follower_address, :string)
|
||||
field(:following_address, :string)
|
||||
field(:levenshtein_distance, :integer, virtual: true)
|
||||
field(:search_rank, :float, virtual: true)
|
||||
field(:search_type, :integer, virtual: true)
|
||||
field(:tags, {:array, :string}, default: [])
|
||||
|
@ -8,6 +8,8 @@ defmodule Pleroma.User.Search do
|
||||
import Ecto.Query
|
||||
|
||||
@limit 20
|
||||
@levenshtein_max_query_length 5
|
||||
@search_rank_threshold 0
|
||||
|
||||
def search(query_string, opts \\ []) do
|
||||
resolve = Keyword.get(opts, :resolve, false)
|
||||
@ -31,7 +33,10 @@ defmodule Pleroma.User.Search do
|
||||
|
||||
defp format_query(query_string) do
|
||||
# Strip the beginning @ off if there is a query
|
||||
query_string = String.trim_leading(query_string, "@")
|
||||
query_string =
|
||||
query_string
|
||||
|> String.trim()
|
||||
|> String.trim_leading("@")
|
||||
|
||||
with [name, domain] <- String.split(query_string, "@") do
|
||||
encoded_domain =
|
||||
@ -47,15 +52,33 @@ defmodule Pleroma.User.Search do
|
||||
end
|
||||
end
|
||||
|
||||
# Levenshtein-based matching is only used for short queries, i.e. those of at
# most @levenshtein_max_query_length characters.
defp levenshtein_applicable?(query_string),
  do: String.length(query_string) <= @levenshtein_max_query_length
|
||||
|
||||
defp search_query(query_string, for_user, following) do
|
||||
for_user
|
||||
|> base_query(following)
|
||||
|> filter_blocked_user(for_user)
|
||||
|> filter_invisible_users()
|
||||
|> filter_blocked_domains(for_user)
|
||||
|> fts_search(query_string)
|
||||
|> trigram_rank(query_string)
|
||||
query =
|
||||
for_user
|
||||
|> base_query(following)
|
||||
|> filter_blocked_user(for_user)
|
||||
|> filter_invisible_users()
|
||||
|> filter_blocked_domains(for_user)
|
||||
|
||||
query =
|
||||
if levenshtein_applicable?(query_string) do
|
||||
query
|
||||
|> levenshtein_distance(query_string)
|
||||
|> fts_levenshtein_search(query_string)
|
||||
|> trigram_levenshtein_rank(query_string)
|
||||
else
|
||||
query
|
||||
|> fts_search(query_string)
|
||||
|> trigram_rank(query_string)
|
||||
end
|
||||
|
||||
query
|
||||
|> boost_search_rank(for_user)
|
||||
|> filter_by_search_rank()
|
||||
|> subquery()
|
||||
|> order_by(desc: :search_rank)
|
||||
|> maybe_restrict_local(for_user)
|
||||
@ -78,6 +101,25 @@ defmodule Pleroma.User.Search do
|
||||
)
|
||||
end
|
||||
|
||||
# Wraps `query` in a subquery and keeps only the rows that are either within a
# Levenshtein distance of 2 or match the full-text query built from
# `query_string` against name and nickname.
# Reads `levenshtein_distance` from the subquery, so `query` must already
# select that field.
defp fts_levenshtein_search(query, query_string) do
  tsquery = to_tsquery(query_string)

  query
  |> subquery()
  |> where(
    [u],
    fragment(
      """
      ? <= 2 OR
      (to_tsvector('simple', ?) || to_tsvector('simple', ?)) @@ to_tsquery('simple', ?)
      """,
      u.levenshtein_distance,
      u.name,
      u.nickname,
      ^tsquery
    )
  )
end
|
||||
|
||||
defp to_tsquery(query_string) do
|
||||
String.trim_trailing(query_string, "@" <> local_domain())
|
||||
|> String.replace(~r/[!-\/|@|[-`|{-~|:-?]+/, " ")
|
||||
@ -87,6 +129,45 @@ defmodule Pleroma.User.Search do
|
||||
|> Enum.join(" | ")
|
||||
end
|
||||
|
||||
# Trigram-based rank with a bonus for a close Levenshtein distance between the
# query and the nickname.
#
# Wraps `query` in a subquery and merges a `search_rank` into the selection:
#   similarity(query_string, "<nickname> <name>")
#     + 1.0 when levenshtein_distance is 0
#     + 0.5 when levenshtein_distance is 1 and the query is longer than 1 character
#     + 0.2 when levenshtein_distance is 2 and the query is longer than 3 characters
#     + 0   otherwise
#
# Reads `levenshtein_distance` from the subquery, so `query` must already
# select that field.
|
||||
defp trigram_levenshtein_rank(query, query_string) do
|
||||
from(
|
||||
u in subquery(query),
|
||||
select_merge: %{
|
||||
search_rank:
|
||||
fragment(
|
||||
"similarity(?, trim(? || ' ' || coalesce(?, ''))) + \
|
||||
(CASE WHEN ? = 0 THEN 1.0 \
|
||||
WHEN ? = 1 AND length(?) > 1 THEN 0.5
|
||||
WHEN ? = 2 AND length(?) > 3 THEN 0.2
|
||||
ELSE 0 END)",
|
||||
^query_string,
|
||||
u.nickname,
|
||||
u.name,
|
||||
u.levenshtein_distance,
|
||||
u.levenshtein_distance,
|
||||
^query_string,
|
||||
u.levenshtein_distance,
|
||||
^query_string
|
||||
)
|
||||
}
|
||||
)
|
||||
end
|
||||
|
||||
# Merges a `levenshtein_distance` value into the selection: the edit distance
# between `query_string` and the nickname with everything from the first "@"
# onwards removed.
defp levenshtein_distance(query, query_string) do
  select_merge(query, [u], %{
    levenshtein_distance:
      fragment(
        "levenshtein(?, regexp_replace(?, '@.+', ''))",
        ^query_string,
        u.nickname
      )
  })
end
|
||||
|
||||
defp trigram_rank(query, query_string) do
|
||||
from(
|
||||
u in query,
|
||||
@ -185,4 +266,8 @@ defmodule Pleroma.User.Search do
|
||||
end
|
||||
|
||||
defp boost_search_rank(query, _for_user), do: query
|
||||
|
||||
# Wraps `query` in a subquery and drops every row whose computed `search_rank`
# does not exceed @search_rank_threshold.
defp filter_by_search_rank(query) do
  query
  |> subquery()
  |> where([u], u.search_rank > @search_rank_threshold)
end
|
||||
end
|
||||
|
@ -0,0 +1,23 @@
|
||||
# Installs the `fuzzystrmatch` PostgreSQL extension, which provides the
# `levenshtein` SQL function used by user search.
defmodule Pleroma.Repo.Migrations.AddFuzzystrmatchPostgresExtension do
  use Ecto.Migration

  require Logger

  # Warns the operator before attempting to create the extension, since the
  # application's database user may lack the rights to do so.
  def up do
    Logger.warn("ATTENTION ATTENTION ATTENTION\n")

    # The message must name the extension this migration actually creates
    # (fuzzystrmatch), not pg_trgm.
    Logger.warn(
      "This will try to create the fuzzystrmatch extension on your database. If your database user does NOT have the necessary rights, you will have to do it manually and re-run the migrations.\nYou can probably do this by running the following:\n"
    )

    Logger.warn(
      "sudo -u postgres psql pleroma_dev -c \"create extension if not exists fuzzystrmatch\"\n"
    )

    execute("create extension if not exists fuzzystrmatch")
  end

  # Removes the extension again on rollback.
  def down do
    execute("drop extension if exists fuzzystrmatch")
  end
end
|
@ -436,11 +436,14 @@ defmodule Mix.Tasks.Pleroma.UserTest do
|
||||
|
||||
{:ok, user} = User.follow(user, kawen)
|
||||
|
||||
assert [moon.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
|
||||
# One "typo" in the nickname makes `moot` score better than `kawen` despite the name match
|
||||
assert [moon.id, moot.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
|
||||
|
||||
res = User.search("moo") |> Enum.map(& &1.id)
|
||||
assert moon.id in res
|
||||
assert moot.id in res
|
||||
assert kawen.id in res
|
||||
|
||||
assert [moon.id, kawen.id] == User.Search.search("moon fediverse") |> Enum.map(& &1.id)
|
||||
|
||||
assert [kawen.id, moon.id] ==
|
||||
|
@ -37,6 +37,10 @@ defmodule Pleroma.UserSearchTest do
|
||||
assert length(User.search("john", limit: 3, offset: 3)) == 2
|
||||
end
|
||||
|
||||
# Resets the virtual, search-only fields populated by `User.search/2` so that a
# search result can be compared for equality with a freshly inserted user.
# Uses update syntax (instead of `Map.merge/2`) so that a renamed or removed
# field fails loudly rather than being silently added as a stray key.
defp clear_virtual_fields(user) do
  %{user | search_rank: nil, search_type: nil, levenshtein_distance: nil}
end
|
||||
|
||||
test "finds a user by full or partial nickname" do
|
||||
user = insert(:user, %{nickname: "john"})
|
||||
|
||||
@ -44,8 +48,7 @@ defmodule Pleroma.UserSearchTest do
|
||||
assert user ==
|
||||
User.search(query)
|
||||
|> List.first()
|
||||
|> Map.put(:search_rank, nil)
|
||||
|> Map.put(:search_type, nil)
|
||||
|> clear_virtual_fields()
|
||||
end)
|
||||
end
|
||||
|
||||
@ -56,8 +59,8 @@ defmodule Pleroma.UserSearchTest do
|
||||
assert user ==
|
||||
User.search(query)
|
||||
|> List.first()
|
||||
|> Map.put(:search_rank, nil)
|
||||
|> Map.put(:search_type, nil)
|
||||
|> Map.merge(%{search_rank: nil, search_type: nil, levenshtein_distance: nil})
|
||||
|> clear_virtual_fields()
|
||||
end)
|
||||
end
|
||||
|
||||
@ -68,6 +71,23 @@ defmodule Pleroma.UserSearchTest do
|
||||
assert [u2.id, u1.id] == Enum.map(User.search("bar word"), & &1.id)
|
||||
end
|
||||
|
||||
test "considers Levenshtein distance between query and nickname for short queries" do
  clear_config([:instance, :limit_to_local_content], false)

  %{id: expected_id} = insert(:user, %{nickname: "hj@shigusegubu.club"})
  insert(:user, %{nickname: "xyz@sample.com"})
  insert(:user, %{nickname: "zyx@hj.com"})

  # Note: "h.j." and "hhhj" are matched since 4+ char queries allow for 2 typos
  matching_queries = ["hj", "hhj", "h j", "lj", "hi", "jj", "h.j.", "hhhj"]

  Enum.each(matching_queries, fn query ->
    assert [expected_id] == Enum.map(User.search(query), & &1.id)
  end)

  # Queries too far from every nickname must return nothing
  Enum.each(["ajay", "gg"], fn query ->
    assert [] == User.search(query)
  end)
end
|
||||
|
||||
test "finds users, boosting ranks of friends and followers" do
|
||||
u1 = insert(:user)
|
||||
u2 = insert(:user, %{name: "Doe"})
|
||||
|
Loading…
Reference in New Issue
Block a user