[#1570] Experimental feature to improve user search using Levenshtein distance calculation.
Improves results for short queries, especially those containing typos.
This commit is contained in:
parent
7aa6c82937
commit
659e15896e
lib/pleroma
priv/repo/migrations
test
@ -92,6 +92,7 @@ defmodule Pleroma.User do
|
|||||||
field(:local, :boolean, default: true)
|
field(:local, :boolean, default: true)
|
||||||
field(:follower_address, :string)
|
field(:follower_address, :string)
|
||||||
field(:following_address, :string)
|
field(:following_address, :string)
|
||||||
|
field(:levenshtein_distance, :integer, virtual: true)
|
||||||
field(:search_rank, :float, virtual: true)
|
field(:search_rank, :float, virtual: true)
|
||||||
field(:search_type, :integer, virtual: true)
|
field(:search_type, :integer, virtual: true)
|
||||||
field(:tags, {:array, :string}, default: [])
|
field(:tags, {:array, :string}, default: [])
|
||||||
|
@ -8,6 +8,8 @@ defmodule Pleroma.User.Search do
|
|||||||
import Ecto.Query
|
import Ecto.Query
|
||||||
|
|
||||||
@limit 20
|
@limit 20
|
||||||
|
# Queries of at most this many characters (as counted by String.length/1) take the
# Levenshtein-assisted search path; longer queries use the plain FTS + trigram path.
@levenshtein_max_query_length 5
|
||||||
|
# Only users whose computed search_rank is strictly greater than this value are kept.
@search_rank_threshold 0
|
||||||
|
|
||||||
def search(query_string, opts \\ []) do
|
def search(query_string, opts \\ []) do
|
||||||
resolve = Keyword.get(opts, :resolve, false)
|
resolve = Keyword.get(opts, :resolve, false)
|
||||||
@ -31,7 +33,10 @@ defmodule Pleroma.User.Search do
|
|||||||
|
|
||||||
defp format_query(query_string) do
|
defp format_query(query_string) do
|
||||||
# Strip the beginning @ off if there is a query
|
# Strip the beginning @ off if there is a query
|
||||||
query_string = String.trim_leading(query_string, "@")
|
query_string =
|
||||||
|
query_string
|
||||||
|
|> String.trim()
|
||||||
|
|> String.trim_leading("@")
|
||||||
|
|
||||||
with [name, domain] <- String.split(query_string, "@") do
|
with [name, domain] <- String.split(query_string, "@") do
|
||||||
encoded_domain =
|
encoded_domain =
|
||||||
@ -47,15 +52,33 @@ defmodule Pleroma.User.Search do
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Returns true when the query is short enough (at most
# @levenshtein_max_query_length characters) for Levenshtein matching to be used.
defp levenshtein_applicable?(query_string) do
  query_length = String.length(query_string)
  query_length <= @levenshtein_max_query_length
end
|
||||||
|
|
||||||
defp search_query(query_string, for_user, following) do
|
defp search_query(query_string, for_user, following) do
|
||||||
for_user
|
query =
|
||||||
|> base_query(following)
|
for_user
|
||||||
|> filter_blocked_user(for_user)
|
|> base_query(following)
|
||||||
|> filter_invisible_users()
|
|> filter_blocked_user(for_user)
|
||||||
|> filter_blocked_domains(for_user)
|
|> filter_invisible_users()
|
||||||
|> fts_search(query_string)
|
|> filter_blocked_domains(for_user)
|
||||||
|> trigram_rank(query_string)
|
|
||||||
|
query =
|
||||||
|
if levenshtein_applicable?(query_string) do
|
||||||
|
query
|
||||||
|
|> levenshtein_distance(query_string)
|
||||||
|
|> fts_levenshtein_search(query_string)
|
||||||
|
|> trigram_levenshtein_rank(query_string)
|
||||||
|
else
|
||||||
|
query
|
||||||
|
|> fts_search(query_string)
|
||||||
|
|> trigram_rank(query_string)
|
||||||
|
end
|
||||||
|
|
||||||
|
query
|
||||||
|> boost_search_rank(for_user)
|
|> boost_search_rank(for_user)
|
||||||
|
|> filter_by_search_rank()
|
||||||
|> subquery()
|
|> subquery()
|
||||||
|> order_by(desc: :search_rank)
|
|> order_by(desc: :search_rank)
|
||||||
|> maybe_restrict_local(for_user)
|
|> maybe_restrict_local(for_user)
|
||||||
@ -78,6 +101,25 @@ defmodule Pleroma.User.Search do
|
|||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Wraps `query` in a subquery and keeps only the rows that either have a
# levenshtein_distance of at most 2 or whose name/nickname match the full-text
# query built from `query_string` by to_tsquery/1.
defp fts_levenshtein_search(query, query_string) do
  ts_query = to_tsquery(query_string)

  query
  |> subquery()
  |> where(
    [u],
    fragment(
      """
      ? <= 2 OR
      (to_tsvector('simple', ?) || to_tsvector('simple', ?)) @@ to_tsquery('simple', ?)
      """,
      u.levenshtein_distance,
      u.name,
      u.nickname,
      ^ts_query
    )
  )
end
|
||||||
|
|
||||||
defp to_tsquery(query_string) do
|
defp to_tsquery(query_string) do
|
||||||
String.trim_trailing(query_string, "@" <> local_domain())
|
String.trim_trailing(query_string, "@" <> local_domain())
|
||||||
|> String.replace(~r/[!-\/|@|[-`|{-~|:-?]+/, " ")
|
|> String.replace(~r/[!-\/|@|[-`|{-~|:-?]+/, " ")
|
||||||
@ -87,6 +129,45 @@ defmodule Pleroma.User.Search do
|
|||||||
|> Enum.join(" | ")
|
|> Enum.join(" | ")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Trigram-based rank with a bonus for a close Levenshtein distance between the query
# and the nickname.
#
# The base score is similarity(query, trim(nickname || ' ' || coalesce(name, ''))).
# The bonus added on top of it is:
#   * 1.0 when the distance is 0;
#   * 0.5 when the distance is 1 and the query is longer than 1 character;
#   * 0.2 when the distance is 2 and the query is longer than 3 characters;
#   * 0 otherwise.
#
# Reads u.levenshtein_distance from the wrapped subquery, so the incoming query must
# already select that field (search_query pipes levenshtein_distance/2 in first).
|
||||||
|
defp trigram_levenshtein_rank(query, query_string) do
|
||||||
|
from(
|
||||||
|
u in subquery(query),
|
||||||
|
select_merge: %{
|
||||||
|
search_rank:
|
||||||
|
fragment(
|
||||||
|
"similarity(?, trim(? || ' ' || coalesce(?, ''))) + \
|
||||||
|
(CASE WHEN ? = 0 THEN 1.0 \
|
||||||
|
WHEN ? = 1 AND length(?) > 1 THEN 0.5
|
||||||
|
WHEN ? = 2 AND length(?) > 3 THEN 0.2
|
||||||
|
ELSE 0 END)",
|
||||||
|
^query_string,
|
||||||
|
u.nickname,
|
||||||
|
u.name,
|
||||||
|
u.levenshtein_distance,
|
||||||
|
u.levenshtein_distance,
|
||||||
|
^query_string,
|
||||||
|
u.levenshtein_distance,
|
||||||
|
^query_string
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Adds the virtual `levenshtein_distance` field to the selection: the Levenshtein
# distance between `query_string` and the user's nickname with everything from the
# first "@" onwards removed.
defp levenshtein_distance(query, query_string) do
  select_merge(query, [u], %{
    levenshtein_distance:
      fragment(
        "levenshtein(?, regexp_replace(?, '@.+', ''))",
        ^query_string,
        u.nickname
      )
  })
end
|
||||||
|
|
||||||
defp trigram_rank(query, query_string) do
|
defp trigram_rank(query, query_string) do
|
||||||
from(
|
from(
|
||||||
u in query,
|
u in query,
|
||||||
@ -185,4 +266,8 @@ defmodule Pleroma.User.Search do
|
|||||||
end
|
end
|
||||||
|
|
||||||
defp boost_search_rank(query, _for_user), do: query
|
defp boost_search_rank(query, _for_user), do: query
|
||||||
|
|
||||||
|
# Wraps `query` in a subquery and drops every row whose search_rank does not
# exceed @search_rank_threshold.
defp filter_by_search_rank(query) do
  query
  |> subquery()
  |> where([u], u.search_rank > @search_rank_threshold)
end
|
||||||
end
|
end
|
||||||
|
@ -0,0 +1,23 @@
|
|||||||
|
defmodule Pleroma.Repo.Migrations.AddFuzzystrmatchPostgresExtension do
  use Ecto.Migration

  require Logger

  # Creates the `fuzzystrmatch` PostgreSQL extension (user search calls its
  # `levenshtein` SQL function). Creating an extension may need privileges the
  # application's database user lacks, so the manual command is logged first.
  def up do
    Logger.warn("ATTENTION ATTENTION ATTENTION\n")

    # The message must name the extension this migration actually creates.
    Logger.warn(
      "This will try to create the fuzzystrmatch extension on your database. If your database user does NOT have the necessary rights, you will have to do it manually and re-run the migrations.\nYou can probably do this by running the following:\n"
    )

    Logger.warn(
      "sudo -u postgres psql pleroma_dev -c \"create extension if not exists fuzzystrmatch\"\n"
    )

    execute("create extension if not exists fuzzystrmatch")
  end

  # Removes the extension again; a no-op if it is not installed.
  def down do
    execute("drop extension if exists fuzzystrmatch")
  end
end
|
@ -436,11 +436,14 @@ defmodule Mix.Tasks.Pleroma.UserTest do
|
|||||||
|
|
||||||
{:ok, user} = User.follow(user, kawen)
|
{:ok, user} = User.follow(user, kawen)
|
||||||
|
|
||||||
assert [moon.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
|
# One "typo" in the nickname makes `moot` score better than `kawen` despite the name match
|
||||||
|
assert [moon.id, moot.id, kawen.id] == User.Search.search("moon") |> Enum.map(& &1.id)
|
||||||
|
|
||||||
res = User.search("moo") |> Enum.map(& &1.id)
|
res = User.search("moo") |> Enum.map(& &1.id)
|
||||||
assert moon.id in res
|
assert moon.id in res
|
||||||
assert moot.id in res
|
assert moot.id in res
|
||||||
assert kawen.id in res
|
assert kawen.id in res
|
||||||
|
|
||||||
assert [moon.id, kawen.id] == User.Search.search("moon fediverse") |> Enum.map(& &1.id)
|
assert [moon.id, kawen.id] == User.Search.search("moon fediverse") |> Enum.map(& &1.id)
|
||||||
|
|
||||||
assert [kawen.id, moon.id] ==
|
assert [kawen.id, moon.id] ==
|
||||||
|
@ -37,6 +37,10 @@ defmodule Pleroma.UserSearchTest do
|
|||||||
assert length(User.search("john", limit: 3, offset: 3)) == 2
|
assert length(User.search("john", limit: 3, offset: 3)) == 2
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Resets the virtual, search-only fields to nil so that a search result can be
# compared for equality with the originally inserted user.
# The update syntax (rather than Map.merge/2) raises if any of the keys is
# missing, instead of silently adding it.
defp clear_virtual_fields(user) do
  %{user | search_rank: nil, search_type: nil, levenshtein_distance: nil}
end
|
||||||
|
|
||||||
test "finds a user by full or partial nickname" do
|
test "finds a user by full or partial nickname" do
|
||||||
user = insert(:user, %{nickname: "john"})
|
user = insert(:user, %{nickname: "john"})
|
||||||
|
|
||||||
@ -44,8 +48,7 @@ defmodule Pleroma.UserSearchTest do
|
|||||||
assert user ==
|
assert user ==
|
||||||
User.search(query)
|
User.search(query)
|
||||||
|> List.first()
|
|> List.first()
|
||||||
|> Map.put(:search_rank, nil)
|
|> clear_virtual_fields()
|
||||||
|> Map.put(:search_type, nil)
|
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -56,8 +59,8 @@ defmodule Pleroma.UserSearchTest do
|
|||||||
assert user ==
|
assert user ==
|
||||||
User.search(query)
|
User.search(query)
|
||||||
|> List.first()
|
|> List.first()
|
||||||
|> Map.put(:search_rank, nil)
|
|> Map.merge(%{search_rank: nil, search_type: nil, levenshtein_distance: nil})
|
||||||
|> Map.put(:search_type, nil)
|
|> clear_virtual_fields()
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -68,6 +71,23 @@ defmodule Pleroma.UserSearchTest do
|
|||||||
assert [u2.id, u1.id] == Enum.map(User.search("bar word"), & &1.id)
|
assert [u2.id, u1.id] == Enum.map(User.search("bar word"), & &1.id)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
test "considers Levenshtein distance between query and nickname for short queries" do
  clear_config([:instance, :limit_to_local_content], false)

  %{id: expected_id} = insert(:user, %{nickname: "hj@shigusegubu.club"})
  insert(:user, %{nickname: "xyz@sample.com"})
  insert(:user, %{nickname: "zyx@hj.com"})

  # "h.j." and "hhhj" still match: queries of 4+ characters tolerate 2 typos.
  close_queries = ["hj", "hhj", "h j", "lj", "hi", "jj", "h.j.", "hhhj"]

  Enum.each(close_queries, fn query ->
    assert [expected_id] == Enum.map(User.search(query), & &1.id)
  end)

  # Queries too far from every nickname must return nothing.
  Enum.each(["ajay", "gg"], fn query ->
    assert [] == User.search(query)
  end)
end
|
||||||
|
|
||||||
test "finds users, boosting ranks of friends and followers" do
|
test "finds users, boosting ranks of friends and followers" do
|
||||||
u1 = insert(:user)
|
u1 = insert(:user)
|
||||||
u2 = insert(:user, %{name: "Doe"})
|
u2 = insert(:user, %{name: "Doe"})
|
||||||
|
Loading…
Reference in New Issue
Block a user