[#1570] Experimental feature to improve user search using Levenshtein distance calculation.

Improves short queries, especially containing typos.
This commit is contained in:
Ivan Tashkinov 2020-06-10 21:02:44 +03:00
parent 7aa6c82937
commit 659e15896e
5 changed files with 145 additions and 13 deletions

View File

@ -92,6 +92,7 @@ defmodule Pleroma.User do
field(:local, :boolean, default: true)
field(:follower_address, :string)
field(:following_address, :string)
field(:levenshtein_distance, :integer, virtual: true)
field(:search_rank, :float, virtual: true)
field(:search_type, :integer, virtual: true)
field(:tags, {:array, :string}, default: [])

View File

@ -8,6 +8,8 @@ defmodule Pleroma.User.Search do
import Ecto.Query
@limit 20
@levenshtein_max_query_length 5
@search_rank_threshold 0
def search(query_string, opts \\ []) do
resolve = Keyword.get(opts, :resolve, false)
@ -31,7 +33,10 @@ defmodule Pleroma.User.Search do
defp format_query(query_string) do
# Strip the beginning @ off if there is a query
query_string = String.trim_leading(query_string, "@")
query_string =
query_string
|> String.trim()
|> String.trim_leading("@")
with [name, domain] <- String.split(query_string, "@") do
encoded_domain =
@ -47,15 +52,33 @@ defmodule Pleroma.User.Search do
end
end
defp levenshtein_applicable?(query_string) do
String.length(query_string) <= @levenshtein_max_query_length
end
defp search_query(query_string, for_user, following) do
for_user
|> base_query(following)
|> filter_blocked_user(for_user)
|> filter_invisible_users()
|> filter_blocked_domains(for_user)
|> fts_search(query_string)
|> trigram_rank(query_string)
query =
for_user
|> base_query(following)
|> filter_blocked_user(for_user)
|> filter_invisible_users()
|> filter_blocked_domains(for_user)
query =
if levenshtein_applicable?(query_string) do
query
|> levenshtein_distance(query_string)
|> fts_levenshtein_search(query_string)
|> trigram_levenshtein_rank(query_string)
else
query
|> fts_search(query_string)
|> trigram_rank(query_string)
end
query
|> boost_search_rank(for_user)
|> filter_by_search_rank()
|> subquery()
|> order_by(desc: :search_rank)
|> maybe_restrict_local(for_user)
@ -78,6 +101,25 @@ defmodule Pleroma.User.Search do
)
end
defp fts_levenshtein_search(query, query_string) do
tsquery = to_tsquery(query_string)
from(
u in subquery(query),
where:
fragment(
"""
? <= 2 OR
(to_tsvector('simple', ?) || to_tsvector('simple', ?)) @@ to_tsquery('simple', ?)
""",
u.levenshtein_distance,
u.name,
u.nickname,
^tsquery
)
)
end
defp to_tsquery(query_string) do
String.trim_trailing(query_string, "@" <> local_domain())
|> String.replace(~r/[!-\/|@|[-`|{-~|:-?]+/, " ")
@ -87,6 +129,45 @@ defmodule Pleroma.User.Search do
|> Enum.join(" | ")
end
# Trigram-based rank with bonus for close Levenshtein distance b/w query and nickname
defp trigram_levenshtein_rank(query, query_string) do
from(
u in subquery(query),
select_merge: %{
search_rank:
fragment(
"similarity(?, trim(? || ' ' || coalesce(?, ''))) + \
(CASE WHEN ? = 0 THEN 1.0 \
WHEN ? = 1 AND length(?) > 1 THEN 0.5
WHEN ? = 2 AND length(?) > 3 THEN 0.2
ELSE 0 END)",
^query_string,
u.nickname,
u.name,
u.levenshtein_distance,
u.levenshtein_distance,
^query_string,
u.levenshtein_distance,
^query_string
)
}
)
end
defp levenshtein_distance(query, query_string) do
from(
u in query,
select_merge: %{
levenshtein_distance:
fragment(
"levenshtein(?, regexp_replace(?, '@.+', ''))",
^query_string,
u.nickname
)
}
)
end
defp trigram_rank(query, query_string) do
from(
u in query,
@ -185,4 +266,8 @@ defmodule Pleroma.User.Search do
end
defp boost_search_rank(query, _for_user), do: query
defp filter_by_search_rank(query) do
from(u in subquery(query), where: u.search_rank > @search_rank_threshold)
end
end

View File

@ -0,0 +1,23 @@
defmodule Pleroma.Repo.Migrations.AddFuzzystrmatchPostgresExtension do
use Ecto.Migration
require Logger
def up do
Logger.warn("ATTENTION ATTENTION ATTENTION\n")
Logger.warn(
"This will try to create the pg_trgm extension on your database. If your database user does NOT have the necessary rights, you will have to do it manually and re-run the migrations.\nYou can probably do this by running the following:\n"
)
Logger.warn(
"sudo -u postgres psql pleroma_dev -c \"create extension if not exists fuzzystrmatch\"\n"
)
execute("create extension if not exists fuzzystrmatch")
end
def down do
execute("drop extension if exists fuzzystrmatch")
end
end

View File

@ -436,11 +436,14 @@ defmodule Mix.Tasks.Pleroma.UserTest do
{:ok, user} = User.follow(user, kawen)
assert [moon.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
# One "typo" in nickname makes `moot` score better than `kawen` despite of name match
assert [moon.id, moot.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
res = User.search("moo") |> Enum.map(& &1.id)
assert moon.id in res
assert moot.id in res
assert kawen.id in res
assert [moon.id, kawen.id] == User.Search.search("moon fediverse") |> Enum.map(& &1.id)
assert [kawen.id, moon.id] ==

View File

@ -37,6 +37,10 @@ defmodule Pleroma.UserSearchTest do
assert length(User.search("john", limit: 3, offset: 3)) == 2
end
defp clear_virtual_fields(user) do
Map.merge(user, %{search_rank: nil, search_type: nil, levenshtein_distance: nil})
end
test "finds a user by full or partial nickname" do
user = insert(:user, %{nickname: "john"})
@ -44,8 +48,7 @@ defmodule Pleroma.UserSearchTest do
assert user ==
User.search(query)
|> List.first()
|> Map.put(:search_rank, nil)
|> Map.put(:search_type, nil)
|> clear_virtual_fields()
end)
end
@ -56,8 +59,8 @@ defmodule Pleroma.UserSearchTest do
assert user ==
User.search(query)
|> List.first()
|> Map.put(:search_rank, nil)
|> Map.put(:search_type, nil)
|> Map.merge(%{search_rank: nil, search_type: nil, levenshtein_distance: nil})
|> clear_virtual_fields()
end)
end
@ -68,6 +71,23 @@ defmodule Pleroma.UserSearchTest do
assert [u2.id, u1.id] == Enum.map(User.search("bar word"), & &1.id)
end
test "considers Levenshtein distance between query and nickname for short queries" do
clear_config([:instance, :limit_to_local_content], false)
user = insert(:user, %{nickname: "hj@shigusegubu.club"})
insert(:user, %{nickname: "xyz@sample.com"})
insert(:user, %{nickname: "zyx@hj.com"})
# Note: "h.j." and "hhhj" are matched since 4+ char queries allow for 2 typos
for query <- ["hj", "hhj", "h j", "lj", "hi", "jj", "h.j.", "hhhj"] do
assert [user.id] == Enum.map(User.search(query), & &1.id)
end
for query <- ["ajay", "gg"] do
assert [] == User.search(query)
end
end
test "finds users, boosting ranks of friends and followers" do
u1 = insert(:user)
u2 = insert(:user, %{name: "Doe"})