diff --git a/config/config.exs b/config/config.exs index 66d394655..efe0cd979 100644 --- a/config/config.exs +++ b/config/config.exs @@ -847,9 +847,14 @@ config :pleroma, Pleroma.User.Backup, config :pleroma, ConcurrentLimiter, [ {Pleroma.Web.RichMedia.Helpers, [max_running: 5, max_waiting: 5]}, - {Pleroma.Web.ActivityPub.MRF.MediaProxyWarmingPolicy, [max_running: 5, max_waiting: 5]} + {Pleroma.Web.ActivityPub.MRF.MediaProxyWarmingPolicy, [max_running: 5, max_waiting: 5]}, + {Pleroma.Search, [max_running: 30, max_waiting: 50]} ] +config :pleroma, Pleroma.Search, module: Pleroma.Activity.Search + +config :pleroma, Pleroma.Search.Meilisearch, url: "http://127.0.0.1:7700/", private_key: nil + # Import environment specific config. This must remain at the bottom # of this file so it overrides the configuration defined above. import_config "#{Mix.env()}.exs" diff --git a/config/test.exs b/config/test.exs index d5c25f65e..c9b2b51ba 100644 --- a/config/test.exs +++ b/config/test.exs @@ -133,6 +133,10 @@ config :pleroma, :side_effects, ap_streamer: Pleroma.Web.ActivityPub.ActivityPubMock, logger: Pleroma.LoggerMock +config :pleroma, Pleroma.Search, module: Pleroma.Activity + +config :pleroma, Pleroma.Search.Meilisearch, url: "http://127.0.0.1:7700/", private_key: nil + # Reduce recompilation time # https://dashbit.co/blog/speeding-up-re-compilation-of-elixir-projects config :phoenix, :plug_init_mode, :runtime diff --git a/docs/configuration/search.md b/docs/configuration/search.md new file mode 100644 index 000000000..e9743f1a4 --- /dev/null +++ b/docs/configuration/search.md @@ -0,0 +1,108 @@ +# Configuring search + +{! backend/administration/CLI_tasks/general_cli_task_info.include !} + +## Built-in search + +To use built-in search that has no external dependencies, set the search module to `Pleroma.Activity`: + +> config :pleroma, Pleroma.Search, module: Pleroma.Activity + +While it has no external dependencies, it has problems with performance and relevancy. 
+ +## Meilisearch + +Note that it's quite a bit more memory hungry than PostgreSQL (around 4-5G for ~1.2 million +posts while idle and up to 7G while indexing initially). The disk usage for this additional index is also +around 4 gigabytes. Like [RUM](./cheatsheet.md#rum-indexing-for-full-text-search) indexes, it offers considerably +higher performance and ordering by timestamp in a reasonable amount of time. +Additionally, the search results seem to be more accurate. + +Due to high memory usage, it may be best to set it up on a different machine, if running pleroma on a low-resource +computer, and use private key authentication to secure the remote search instance. + +To use [meilisearch](https://www.meilisearch.com/), set the search module to `Pleroma.Search.Meilisearch`: + +> config :pleroma, Pleroma.Search, module: Pleroma.Search.Meilisearch + +You then need to set the address of the meilisearch instance, and optionally the private key for authentication. + +> config :pleroma, Pleroma.Search.Meilisearch, +> url: "http://127.0.0.1:7700/", +> private_key: "private key" + +Information about setting up meilisearch can be found in the +[official documentation](https://docs.meilisearch.com/learn/getting_started/installation.html). +You probably want to start it with `MEILI_NO_ANALYTICS=true` and `MEILI_NO_SENTRY=true` environment variables, +to disable analytics and crash reporting. + +### Private key authentication (optional) + +To set the private key, use the `MEILI_MASTER_KEY` environment variable when starting. After setting the _master key_, +you have to get the _private key_, which is actually used for authentication, by passing the master key to the following command: + +=== "OTP" + ```sh + ./bin/pleroma_ctl search.meilisearch show-private-key <your master key here> + ``` + +=== "From Source" + ```sh + mix pleroma.search.meilisearch show-private-key <your master key here> + ``` + +This is the key you actually put into your configuration file. + +### Initial indexing + +After setting up the configuration, you'll want to index all of your already existing posts. 
Only public posts are indexed. You'll only +have to do it one time, but it might take a while, depending on the amount of posts your instance has seen. This is also a fairly RAM +consuming process for `meilisearch`, and it will take a lot of RAM when running if you have a lot of posts (seems to be around 5G for ~1.2 +million posts while idle and up to 7G while indexing initially, but your experience may be different). + +To start the initial indexing, run the `index` command: + +=== "OTP" + ```sh + ./bin/pleroma_ctl search.meilisearch index + ``` + +=== "From Source" + ```sh + mix pleroma.search.meilisearch index + ``` + +This will show you the total amount of posts to index, and then show you the amount of posts indexed currently, until the numbers eventually +become the same. The posts are indexed in big batches and meilisearch will take some time to actually index them, even after you have +inserted all the posts into it. Depending on the amount of posts, this may be as long as several hours. To get information about the status +of indexing and how many posts have actually been indexed, use the `stats` command: + +=== "OTP" + ```sh + ./bin/pleroma_ctl search.meilisearch stats + ``` + +=== "From Source" + ```sh + mix pleroma.search.meilisearch stats + ``` + +### Clearing the index + +In case you need to clear the index (for example, to re-index from scratch, if that needs to happen for some reason), you can +use the `clear` command: + +=== "OTP" + ```sh + ./bin/pleroma_ctl search.meilisearch clear + ``` + +=== "From Source" + ```sh + mix pleroma.search.meilisearch clear + ``` + +This will clear **all** the posts from the search index. Note that deleted posts are also removed from the index by the instance itself, so +there is no need to actually clear the whole index, unless you want **all** of it gone. 
That said, the index does not hold any information +that cannot be re-created from the database, it should also generally be a lot smaller than the size of your database. Still, the size +depends on the amount of text in posts. diff --git a/lib/mix/tasks/pleroma/search/meilisearch.ex b/lib/mix/tasks/pleroma/search/meilisearch.ex new file mode 100644 index 000000000..cdf9ab0bd --- /dev/null +++ b/lib/mix/tasks/pleroma/search/meilisearch.ex @@ -0,0 +1,122 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2021 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Mix.Tasks.Pleroma.Search.Meilisearch do + require Logger + require Pleroma.Constants + + import Mix.Pleroma + import Ecto.Query + + import Pleroma.Search.Meilisearch, only: [meili_post!: 2, meili_delete!: 1, meili_get!: 1] + + def run(["index"]) do + start_pleroma() + + meili_post!( + "/indexes/objects/settings/ranking-rules", + [ + "desc(published)", + "words", + "exactness", + "proximity", + "wordsPosition", + "typo", + "attribute" + ] + ) + + meili_post!( + "/indexes/objects/settings/searchable-attributes", + [ + "content" + ] + ) + + chunk_size = 10_000 + + Pleroma.Repo.transaction( + fn -> + query = + from(Pleroma.Object, + # Only index public posts which are notes and have some text + where: + fragment("data->>'type' = 'Note'") and + fragment("LENGTH(data->>'content') > 0") and + fragment("data->'to' \\? 
?", ^Pleroma.Constants.as_public()), + order_by: [desc: fragment("data->'published'")] + ) + + count = query |> Pleroma.Repo.aggregate(:count, :data) + IO.puts("Entries to index: #{count}") + + Pleroma.Repo.stream( + query, + timeout: :infinity + ) + |> Stream.map(&Pleroma.Search.Meilisearch.object_to_search_data/1) + |> Stream.filter(fn o -> not is_nil(o) end) + |> Stream.chunk_every(chunk_size) + |> Stream.transform(0, fn objects, acc -> + new_acc = acc + Enum.count(objects) + + # Reset to the beginning of the line and rewrite it + IO.write("\r") + IO.write("Indexed #{new_acc} entries") + + {[objects], new_acc} + end) + |> Stream.each(fn objects -> + result = + meili_post!( + "/indexes/objects/documents", + objects + ) + + if not Map.has_key?(result, "updateId") do + IO.puts("Failed to index: #{inspect(result)}") + end + end) + |> Stream.run() + end, + timeout: :infinity + ) + + IO.write("\n") + end + + def run(["clear"]) do + start_pleroma() + + meili_delete!("/indexes/objects/documents") + end + + def run(["show-private-key", master_key]) do + start_pleroma() + + endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url]) + + {:ok, result} = + Pleroma.HTTP.get( + Path.join(endpoint, "/keys"), + [{"X-Meili-API-Key", master_key}] + ) + + decoded = Jason.decode!(result.body) + + if decoded["private"] do + IO.puts(decoded["private"]) + else + IO.puts("Error fetching the key, check the master key is correct: #{inspect(decoded)}") + end + end + + def run(["stats"]) do + start_pleroma() + + result = meili_get!("/indexes/objects/stats") + IO.puts("Number of entries: #{result["numberOfDocuments"]}") + IO.puts("Indexing? 
#{result["isIndexing"]}") + end +end diff --git a/lib/pleroma/activity/search.ex b/lib/pleroma/activity/search.ex index 09671f621..8352ba20a 100644 --- a/lib/pleroma/activity/search.ex +++ b/lib/pleroma/activity/search.ex @@ -45,6 +45,9 @@ defmodule Pleroma.Activity.Search do end end + def add_to_index(_activity), do: nil + def remove_from_index(_object), do: nil + def maybe_restrict_author(query, %User{} = author) do Activity.Queries.by_author(query, author) end @@ -57,7 +60,7 @@ defmodule Pleroma.Activity.Search do def maybe_restrict_blocked(query, _), do: query - defp restrict_public(q) do + def restrict_public(q) do from([a, o] in q, where: fragment("?->>'type' = 'Create'", a.data), where: ^Pleroma.Constants.as_public() in a.recipients @@ -124,7 +127,7 @@ defmodule Pleroma.Activity.Search do ) end - defp maybe_restrict_local(q, user) do + def maybe_restrict_local(q, user) do limit = Pleroma.Config.get([:instance, :limit_to_local_content], :unauthenticated) case {limit, user} do @@ -137,7 +140,7 @@ defmodule Pleroma.Activity.Search do defp restrict_local(q), do: where(q, local: true) - defp maybe_fetch(activities, user, search_query) do + def maybe_fetch(activities, user, search_query) do with true <- Regex.match?(~r/https?:/, search_query), {:ok, object} <- Fetcher.fetch_object_from_id(search_query), %Activity{} = activity <- Activity.get_create_by_object_ap_id(object.data["id"]), diff --git a/lib/pleroma/application.ex b/lib/pleroma/application.ex index 9824e0a4a..2ab43ee8a 100644 --- a/lib/pleroma/application.ex +++ b/lib/pleroma/application.ex @@ -303,7 +303,11 @@ defmodule Pleroma.Application do def limiters_setup do config = Config.get(ConcurrentLimiter, []) - [Pleroma.Web.RichMedia.Helpers, Pleroma.Web.ActivityPub.MRF.MediaProxyWarmingPolicy] + [ + Pleroma.Web.RichMedia.Helpers, + Pleroma.Web.ActivityPub.MRF.MediaProxyWarmingPolicy, + Pleroma.Search + ] |> Enum.each(fn module -> mod_config = Keyword.get(config, module, []) diff --git 
a/lib/pleroma/search/meilisearch.ex b/lib/pleroma/search/meilisearch.ex new file mode 100644 index 000000000..d94ab8b64 --- /dev/null +++ b/lib/pleroma/search/meilisearch.ex @@ -0,0 +1,134 @@ +defmodule Pleroma.Search.Meilisearch do + require Logger + require Pleroma.Constants + + alias Pleroma.Activity + + import Pleroma.Activity.Search + import Ecto.Query + + defp meili_headers() do + private_key = Pleroma.Config.get([Pleroma.Search.Meilisearch, :private_key]) + + [{"Content-Type", "application/json"}] ++ + if is_nil(private_key), do: [], else: [{"X-Meili-API-Key", private_key}] + end + + def meili_get!(path) do + endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url]) + + {:ok, result} = + Pleroma.HTTP.get( + Path.join(endpoint, path), + meili_headers() + ) + + Jason.decode!(result.body) + end + + def meili_post!(path, params) do + endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url]) + + {:ok, result} = + Pleroma.HTTP.post( + Path.join(endpoint, path), + Jason.encode!(params), + meili_headers() + ) + + Jason.decode!(result.body) + end + + def meili_delete!(path) do + endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url]) + + {:ok, _} = + Pleroma.HTTP.request( + :delete, + Path.join(endpoint, path), + "", + meili_headers(), + [] + ) + end + + def search(user, query, options \\ []) do + limit = Enum.min([Keyword.get(options, :limit), 40]) + offset = Keyword.get(options, :offset, 0) + author = Keyword.get(options, :author) + + result = + meili_post!( + "/indexes/objects/search", + %{q: query, offset: offset, limit: limit} + ) + + hits = result["hits"] |> Enum.map(& &1["ap"]) + + try do + hits + |> Activity.create_by_object_ap_id() + |> Activity.with_preloaded_object() + |> Activity.with_preloaded_object() + |> Activity.restrict_deactivated_users() + |> maybe_restrict_local(user) + |> maybe_restrict_author(author) + |> maybe_restrict_blocked(user) + |> maybe_fetch(user, query) + |> order_by([object: obj], desc: 
obj.data["published"]) + |> Pleroma.Repo.all() + rescue + _ -> maybe_fetch([], user, query) + end + end + + def object_to_search_data(object) do + if not is_nil(object) and object.data["type"] == "Note" and + Pleroma.Constants.as_public() in object.data["to"] do + data = object.data + + content_str = + case data["content"] do + [nil | rest] -> to_string(rest) + str -> str + end + + content = + with {:ok, scrubbed} <- FastSanitize.strip_tags(content_str), + trimmed <- String.trim(scrubbed) do + trimmed + end + + if String.length(content) > 1 do + {:ok, published, _} = DateTime.from_iso8601(data["published"]) + + %{ + id: object.id, + content: content, + ap: data["id"], + published: published |> DateTime.to_unix() + } + end + end + end + + def add_to_index(activity) do + maybe_search_data = object_to_search_data(activity.object) + + if activity.data["type"] == "Create" and maybe_search_data do + result = + meili_post!( + "/indexes/objects/documents", + [maybe_search_data] + ) + + if not Map.has_key?(result, "updateId") do + Logger.error("Failed to add activity #{activity.id} to index: #{inspect(result)}") + end + end + end + + def remove_from_index(object) do + meili_delete!("/indexes/objects/documents/#{object.id}") + end +end diff --git a/lib/pleroma/search/search.ex b/lib/pleroma/search/search.ex new file mode 100644 index 000000000..e363abf19 --- /dev/null +++ b/lib/pleroma/search/search.ex @@ -0,0 +1,18 @@ +defmodule Pleroma.Search do + def add_to_index(activity) do + search_module = Pleroma.Config.get([Pleroma.Search, :module]) + + ConcurrentLimiter.limit(Pleroma.Search, fn -> + Task.start(fn -> search_module.add_to_index(activity) end) + end) + end + + def remove_from_index(object) do + # Also delete from search index + search_module = Pleroma.Config.get([Pleroma.Search, :module]) + + ConcurrentLimiter.limit(Pleroma.Search, fn -> + Task.start(fn -> search_module.remove_from_index(object) end) + end) + end +end diff --git 
a/lib/pleroma/web/activity_pub/activity_pub.ex b/lib/pleroma/web/activity_pub/activity_pub.ex index dbaf06e7a..2d6572b78 100644 --- a/lib/pleroma/web/activity_pub/activity_pub.ex +++ b/lib/pleroma/web/activity_pub/activity_pub.ex @@ -136,6 +136,9 @@ defmodule Pleroma.Web.ActivityPub.ActivityPub do Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end) end) + # Add local posts to search index + if local, do: Pleroma.Search.add_to_index(activity) + {:ok, activity} else %Activity{} = activity -> diff --git a/lib/pleroma/web/activity_pub/side_effects.ex b/lib/pleroma/web/activity_pub/side_effects.ex index 701181a14..5859e9a9a 100644 --- a/lib/pleroma/web/activity_pub/side_effects.ex +++ b/lib/pleroma/web/activity_pub/side_effects.ex @@ -193,6 +193,7 @@ defmodule Pleroma.Web.ActivityPub.SideEffects do # - Increase replies count # - Set up ActivityExpiration # - Set up notifications + # - Index incoming posts for search (if needed) @impl true def handle(%{data: %{"type" => "Create"}} = activity, meta) do with {:ok, object, meta} <- handle_object_creation(meta[:object_data], activity, meta), @@ -221,6 +222,8 @@ defmodule Pleroma.Web.ActivityPub.SideEffects do Task.start(fn -> Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity) end) end) + Pleroma.Search.add_to_index(Map.put(activity, :object, object)) + meta = meta |> add_notifications(notifications) @@ -281,6 +284,7 @@ defmodule Pleroma.Web.ActivityPub.SideEffects do # - Reduce the user note count # - Reduce the reply count # - Stream out the activity + # - Removes posts from search index (if needed) @impl true def handle(%{data: %{"type" => "Delete", "object" => deleted_object}} = object, meta) do deleted_object = @@ -320,6 +324,9 @@ defmodule Pleroma.Web.ActivityPub.SideEffects do if result == :ok do Notification.create_notifications(object) + + Pleroma.Search.remove_from_index(deleted_object) + {:ok, object, meta} else {:error, result} diff --git 
a/lib/pleroma/web/mastodon_api/controllers/search_controller.ex b/lib/pleroma/web/mastodon_api/controllers/search_controller.ex index 64b177eb3..0bb0cd3e3 100644 --- a/lib/pleroma/web/mastodon_api/controllers/search_controller.ex +++ b/lib/pleroma/web/mastodon_api/controllers/search_controller.ex @@ -5,7 +5,6 @@ defmodule Pleroma.Web.MastodonAPI.SearchController do use Pleroma.Web, :controller - alias Pleroma.Activity alias Pleroma.Repo alias Pleroma.User alias Pleroma.Web.ControllerHelper @@ -98,7 +97,9 @@ defmodule Pleroma.Web.MastodonAPI.SearchController do end defp resource_search(_, "statuses", query, options) do - statuses = with_fallback(fn -> Activity.search(options[:for_user], query, options) end) + search_module = Pleroma.Config.get([Pleroma.Search, :module], Pleroma.Activity) + + statuses = with_fallback(fn -> search_module.search(options[:for_user], query, options) end) StatusView.render("index.json", activities: statuses, diff --git a/test/pleroma/search/meilisearch_test.exs b/test/pleroma/search/meilisearch_test.exs new file mode 100644 index 000000000..6e13c8edf --- /dev/null +++ b/test/pleroma/search/meilisearch_test.exs @@ -0,0 +1,108 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2021 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Search.MeilisearchTest do + require Pleroma.Constants + + use Pleroma.DataCase + + import Pleroma.Factory + import Tesla.Mock + import Mock + + alias Pleroma.Web.CommonAPI + alias Pleroma.Search.Meilisearch + + setup_all do + Tesla.Mock.mock_global(fn env -> apply(HttpRequestMock, :request, [env]) end) + :ok + end + + describe "meilisearch" do + setup do: clear_config([Pleroma.Search, :module], Meilisearch) + + setup_with_mocks( + [ + {Meilisearch, [:passthrough], + [ + add_to_index: fn a -> passthrough([a]) end, + remove_from_index: fn a -> passthrough([a]) end + ]} + ], + context, + do: {:ok, context} + ) + + test "indexes a local post on creation" do + user = 
insert(:user) + + mock_global(fn + %{method: :post, url: "http://127.0.0.1:7700/indexes/objects/documents", body: body} -> + assert match?( + [%{"content" => "guys i just don't wanna leave the swamp"}], + Jason.decode!(body) + ) + + json(%{updateId: 1}) + end) + + {:ok, activity} = + CommonAPI.post(user, %{ + status: "guys i just don't wanna leave the swamp", + visibility: "public" + }) + + assert_called(Meilisearch.add_to_index(activity)) + end + + test "doesn't index posts that are not public" do + user = insert(:user) + + Enum.each(["unlisted", "private", "direct"], fn visiblity -> + {:ok, _} = + CommonAPI.post(user, %{ + status: "guys i just don't wanna leave the swamp", + visibility: visiblity + }) + end) + + history = call_history(Meilisearch) + assert Enum.count(history) == 3 + + Enum.each(history, fn {_, _, return} -> + assert is_nil(return) + end) + end + + test "deletes posts from index when deleted locally" do + user = insert(:user) + + mock_global(fn + %{method: :post, url: "http://127.0.0.1:7700/indexes/objects/documents", body: body} -> + assert match?( + [%{"content" => "guys i just don't wanna leave the swamp"}], + Jason.decode!(body) + ) + + json(%{updateId: 1}) + + %{method: :delete, url: "http://127.0.0.1:7700/indexes/objects/documents/" <> id} -> + assert String.length(id) > 1 + json(%{updateId: 2}) + end) + + {:ok, activity} = + CommonAPI.post(user, %{ + status: "guys i just don't wanna leave the swamp", + visibility: "public" + }) + + assert_called(Meilisearch.add_to_index(activity)) + + {:ok, _} = CommonAPI.delete(activity.id, user) + + assert_called(Meilisearch.remove_from_index(:_)) + end + end +end