pleroma/test/html_test.exs

# Pleroma: A lightweight social networking server
# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only

defmodule Pleroma.HTMLTest do
  alias Pleroma.HTML
  alias Pleroma.Object
  alias Pleroma.Web.CommonAPI
  use Pleroma.DataCase

  import Pleroma.Factory

  # Input fixtures shared by the scrubber tests below.

  # Mixed markup: bold text, a paragraph, line breaks, one link whose `rel` is
  # just "tag", one link whose `rel` is "tag noallowed", an image and a script.
  @markup_sample """
    <b>this is in bold</b>
    <p>this is a paragraph</p>
    this is a linebreak<br />
    this is a link with allowed "rel" attribute: <a href="http://example.com/" rel="tag">example.com</a>
    this is a link with not allowed "rel" attribute: <a href="http://example.com/" rel="tag noallowed">example.com</a>
    this is an image: <img src="http://example.com/image.jpg"><br />
    <script>alert('hacked')</script>
  """

  # A lone image carrying an inline `onerror` handler.
  @onerror_markup """
  <img src="http://example.com/image.jpg" onerror="alert('hacked')">
  """

  # A span whose class is not one of the microformats classes used further down.
  @span_class_markup """
  <span class="animate-spin">hi</span>
  """

  # An h-card span wrapping a "u-url mention" anchor.
  @microformats_markup """
  <span class="h-card"><a class="u-url mention">@<span>foo</span></a></span>
  """

  # Same as above, but the anchor's class list also contains "animate-spin".
  @invalid_microformats_markup """
  <span class="h-card"><a class="u-url mention animate-spin">@<span>foo</span></a></span>
  """

  describe "StripTags scrubber" do
    test "works as expected" do
      # No tag is expected to survive; quotes and apostrophes come back as entities.
      text_only = """
        this is in bold
        this is a paragraph
        this is a linebreak
        this is a link with allowed &quot;rel&quot; attribute: example.com
        this is a link with not allowed &quot;rel&quot; attribute: example.com
        this is an image: 
        alert(&#39;hacked&#39;)
      """

      assert HTML.strip_tags(@markup_sample) == text_only
    end

    test "does not allow attribute-based XSS" do
      # The fixture is a single <img> line, so only its newline should be left.
      assert HTML.strip_tags(@onerror_markup) == "\n"
    end
  end

  describe "TwitterText scrubber" do
    test "normalizes HTML as expected" do
      # Expected: <b> and <script> tags gone, the "tag noallowed" rel dropped,
      # void elements self-closed.
      normalized = """
        this is in bold
        <p>this is a paragraph</p>
        this is a linebreak<br/>
        this is a link with allowed &quot;rel&quot; attribute: <a href="http://example.com/" rel="tag">example.com</a>
        this is a link with not allowed &quot;rel&quot; attribute: <a href="http://example.com/">example.com</a>
        this is an image: <img src="http://example.com/image.jpg"/><br/>
        alert(&#39;hacked&#39;)
      """

      scrubbed = HTML.filter_tags(@markup_sample, HTML.Scrubber.TwitterText)
      assert scrubbed == normalized
    end

    test "does not allow attribute-based XSS" do
      # The image is expected to stay, without its `onerror` attribute.
      safe_img = """
      <img src="http://example.com/image.jpg"/>
      """

      scrubbed = HTML.filter_tags(@onerror_markup, HTML.Scrubber.TwitterText)
      assert scrubbed == safe_img
    end

    test "does not allow spans with invalid classes" do
      bare_span = """
      <span>hi</span>
      """

      scrubbed = HTML.filter_tags(@span_class_markup, HTML.Scrubber.TwitterText)
      assert scrubbed == bare_span
    end

    test "does allow microformats" do
      # Expected output is the fixture itself, unchanged.
      untouched = """
      <span class="h-card"><a class="u-url mention">@<span>foo</span></a></span>
      """

      scrubbed = HTML.filter_tags(@microformats_markup, HTML.Scrubber.TwitterText)
      assert scrubbed == untouched
    end

    test "filters invalid microformats markup" do
      # The anchor is expected to lose its whole class attribute; the span keeps "h-card".
      cleaned = """
      <span class="h-card"><a>@<span>foo</span></a></span>
      """

      scrubbed = HTML.filter_tags(@invalid_microformats_markup, HTML.Scrubber.TwitterText)
      assert scrubbed == cleaned
    end
  end

  describe "default scrubber" do
    test "normalizes HTML as expected" do
      # Same expectation as for TwitterText, except that <b> is kept here.
      normalized = """
        <b>this is in bold</b>
        <p>this is a paragraph</p>
        this is a linebreak<br/>
        this is a link with allowed &quot;rel&quot; attribute: <a href="http://example.com/" rel="tag">example.com</a>
        this is a link with not allowed &quot;rel&quot; attribute: <a href="http://example.com/">example.com</a>
        this is an image: <img src="http://example.com/image.jpg"/><br/>
        alert(&#39;hacked&#39;)
      """

      scrubbed = HTML.filter_tags(@markup_sample, HTML.Scrubber.Default)
      assert scrubbed == normalized
    end

    test "does not allow attribute-based XSS" do
      # The image is expected to stay, without its `onerror` attribute.
      safe_img = """
      <img src="http://example.com/image.jpg"/>
      """

      scrubbed = HTML.filter_tags(@onerror_markup, HTML.Scrubber.Default)
      assert scrubbed == safe_img
    end

    test "does not allow spans with invalid classes" do
      bare_span = """
      <span>hi</span>
      """

      scrubbed = HTML.filter_tags(@span_class_markup, HTML.Scrubber.Default)
      assert scrubbed == bare_span
    end

    test "does allow microformats" do
      # Expected output is the fixture itself, unchanged.
      untouched = """
      <span class="h-card"><a class="u-url mention">@<span>foo</span></a></span>
      """

      scrubbed = HTML.filter_tags(@microformats_markup, HTML.Scrubber.Default)
      assert scrubbed == untouched
    end

    test "filters invalid microformats markup" do
      # The anchor is expected to lose its whole class attribute; the span keeps "h-card".
      cleaned = """
      <span class="h-card"><a>@<span>foo</span></a></span>
      """

      scrubbed = HTML.filter_tags(@invalid_microformats_markup, HTML.Scrubber.Default)
      assert scrubbed == cleaned
    end
  end

  describe "extract_first_external_url" do
    test "extracts the url" do
      author = insert(:user)

      params = %{
        status:
          "I think I just found the best github repo https://github.com/komeiji-satori/Dress"
      }

      assert {:ok, extracted} = post_and_extract(author, params)
      assert extracted == "https://github.com/komeiji-satori/Dress"
    end

    test "skips mentions" do
      author = insert(:user)
      mentioned = insert(:user)

      params = %{
        status:
          "@#{mentioned.nickname} install misskey! https://github.com/syuilo/misskey/blob/develop/docs/setup.en.md"
      }

      assert {:ok, extracted} = post_and_extract(author, params)
      assert extracted == "https://github.com/syuilo/misskey/blob/develop/docs/setup.en.md"

      # The mentioned user's own ActivityPub id must not be what got picked.
      refute extracted == mentioned.ap_id
    end

    test "skips hashtags" do
      author = insert(:user)

      params = %{
        status: "#cofe https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140"
      }

      assert {:ok, extracted} = post_and_extract(author, params)
      assert extracted == "https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140"
    end

    test "skips microformats hashtags" do
      author = insert(:user)

      # Submitted with content_type "text/html"; the leading anchor carries rel="tag".
      params = %{
        status:
          "<a href=\"https://pleroma.gov/tags/cofe\" rel=\"tag\">#cofe</a> https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140",
        content_type: "text/html"
      }

      assert {:ok, extracted} = post_and_extract(author, params)
      assert extracted == "https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140"
    end

    test "does not crash when there is an HTML entity in a link" do
      author = insert(:user)
      params = %{status: "\"http://cofe.com/?boomer=ok&foo=bar\""}

      assert post_and_extract(author, params) == {:ok, nil}
    end

    test "skips attachment links" do
      author = insert(:user)

      # NOTE(review): unlike the microformats hashtag test, no content_type is
      # passed here even though the status is anchor markup — confirm this still
      # exercises the class="attachment" case as intended.
      params = %{
        status:
          "<a href=\"https://pleroma.gov/media/d24caa3a498e21e0298377a9ca0149a4f4f8b767178aacf837542282e2d94fb1.png?name=image.png\" class=\"attachment\">image.png</a>"
      }

      assert post_and_extract(author, params) == {:ok, nil}
    end
  end

  # Posts `params` as `user` through CommonAPI, normalizes the resulting
  # activity into its object, and returns whatever
  # `HTML.extract_first_external_url/2` yields for that object's "content".
  defp post_and_extract(user, params) do
    {:ok, activity} = CommonAPI.post(user, params)
    object = Object.normalize(activity)

    HTML.extract_first_external_url(object, object.data["content"])
  end
end
tests: add legal boilerplate 2018-12-23 15:11:29 -05:00			`# Pleroma: A lightweight social networking server`
Update Copyrights 2020-03-03 17:44:49 -05:00			`# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>`
tests: add legal boilerplate 2018-12-23 15:11:29 -05:00			`# SPDX-License-Identifier: AGPL-3.0-only`

test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`defmodule Pleroma.HTMLTest do`
			`alias Pleroma.HTML`
Fix hashtags being picked up by rich media parser Closes #989 2019-06-14 07:34:42 -04:00			`alias Pleroma.Object`
			`alias Pleroma.Web.CommonAPI`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`use Pleroma.DataCase`

Fix hashtags being picked up by rich media parser Closes #989 2019-06-14 07:34:42 -04:00			`import Pleroma.Factory`

test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`@html_sample """`
			`<b>this is in bold</b>`
			`<p>this is a paragraph</p>`
			`this is a linebreak<br />`
Allow 'rel' attribute on `<a>` link with specific values (for hashtag recognition). 2019-03-17 09:46:46 -04:00			`this is a link with allowed "rel" attribute: <a href="http://example.com/" rel="tag">example.com</a>`
			`this is a link with not allowed "rel" attribute: <a href="http://example.com/" rel="tag noallowed">example.com</a>`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`this is an image: <img src="http://example.com/image.jpg"><br />`
			`<script>alert('hacked')</script>`
			`"""`

			`@html_onerror_sample """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`<img src="http://example.com/image.jpg" onerror="alert('hacked')">`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`"""`

html: lock down allowed class attributes to only those related to microformats 2019-04-23 18:55:21 -04:00			`@html_span_class_sample """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`<span class="animate-spin">hi</span>`
html: lock down allowed class attributes to only those related to microformats 2019-04-23 18:55:21 -04:00			`"""`

			`@html_span_microformats_sample """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`<span class="h-card"><a class="u-url mention">@<span>foo</span></a></span>`
html: lock down allowed class attributes to only those related to microformats 2019-04-23 18:55:21 -04:00			`"""`

			`@html_span_invalid_microformats_sample """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`<span class="h-card"><a class="u-url mention animate-spin">@<span>foo</span></a></span>`
html: lock down allowed class attributes to only those related to microformats 2019-04-23 18:55:21 -04:00			`"""`

test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`describe "StripTags scrubber" do`
			`test "works as expected" do`
			`expected = """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`this is in bold`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`this is a paragraph`
			`this is a linebreak`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`this is a link with allowed "rel" attribute: example.com`
			`this is a link with not allowed "rel" attribute: example.com`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`this is an image:`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`alert('hacked')`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`"""`

			`assert expected == HTML.strip_tags(@html_sample)`
			`end`

			`test "does not allow attribute-based XSS" do`
			`expected = "\n"`

			`assert expected == HTML.strip_tags(@html_onerror_sample)`
			`end`
			`end`

			`describe "TwitterText scrubber" do`
			`test "normalizes HTML as expected" do`
			`expected = """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`this is in bold`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`<p>this is a paragraph</p>`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`this is a linebreak<br/>`
			`this is a link with allowed "rel" attribute: <a href="http://example.com/" rel="tag">example.com</a>`
			`this is a link with not allowed "rel" attribute: <a href="http://example.com/">example.com</a>`
			`this is an image: <img src="http://example.com/image.jpg"/><br/>`
			`alert('hacked')`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`"""`

			`assert expected == HTML.filter_tags(@html_sample, Pleroma.HTML.Scrubber.TwitterText)`
			`end`

			`test "does not allow attribute-based XSS" do`
			`expected = """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`<img src="http://example.com/image.jpg"/>`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`"""`

			`assert expected == HTML.filter_tags(@html_onerror_sample, Pleroma.HTML.Scrubber.TwitterText)`
			`end`
html: lock down allowed class attributes to only those related to microformats 2019-04-23 18:55:21 -04:00
			`test "does not allow spans with invalid classes" do`
			`expected = """`
			`<span>hi</span>`
			`"""`

			`assert expected ==`
			`HTML.filter_tags(@html_span_class_sample, Pleroma.HTML.Scrubber.TwitterText)`
			`end`

			`test "does allow microformats" do`
			`expected = """`
			`<span class="h-card"><a class="u-url mention">@<span>foo</span></a></span>`
			`"""`

			`assert expected ==`
			`HTML.filter_tags(@html_span_microformats_sample, Pleroma.HTML.Scrubber.TwitterText)`
			`end`

			`test "filters invalid microformats markup" do`
			`expected = """`
			`<span class="h-card"><a>@<span>foo</span></a></span>`
			`"""`

			`assert expected ==`
			`HTML.filter_tags(`
			`@html_span_invalid_microformats_sample,`
			`Pleroma.HTML.Scrubber.TwitterText`
			`)`
			`end`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`end`

			`describe "default scrubber" do`
			`test "normalizes HTML as expected" do`
			`expected = """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`<b>this is in bold</b>`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`<p>this is a paragraph</p>`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`this is a linebreak<br/>`
			`this is a link with allowed "rel" attribute: <a href="http://example.com/" rel="tag">example.com</a>`
			`this is a link with not allowed "rel" attribute: <a href="http://example.com/">example.com</a>`
			`this is an image: <img src="http://example.com/image.jpg"/><br/>`
			`alert('hacked')`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`"""`

			`assert expected == HTML.filter_tags(@html_sample, Pleroma.HTML.Scrubber.Default)`
			`end`

			`test "does not allow attribute-based XSS" do`
			`expected = """`
Switch from HtmlSanitizeEx to FastSanitize 2019-10-28 18:18:08 -04:00			`<img src="http://example.com/image.jpg"/>`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`"""`

			`assert expected == HTML.filter_tags(@html_onerror_sample, Pleroma.HTML.Scrubber.Default)`
			`end`
html: lock down allowed class attributes to only those related to microformats 2019-04-23 18:55:21 -04:00
			`test "does not allow spans with invalid classes" do`
			`expected = """`
			`<span>hi</span>`
			`"""`

			`assert expected == HTML.filter_tags(@html_span_class_sample, Pleroma.HTML.Scrubber.Default)`
			`end`

			`test "does allow microformats" do`
			`expected = """`
			`<span class="h-card"><a class="u-url mention">@<span>foo</span></a></span>`
			`"""`

			`assert expected ==`
			`HTML.filter_tags(@html_span_microformats_sample, Pleroma.HTML.Scrubber.Default)`
			`end`

			`test "filters invalid microformats markup" do`
			`expected = """`
			`<span class="h-card"><a>@<span>foo</span></a></span>`
			`"""`

			`assert expected ==`
			`HTML.filter_tags(`
			`@html_span_invalid_microformats_sample,`
			`Pleroma.HTML.Scrubber.Default`
			`)`
			`end`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`end`
Fix hashtags being picked up by rich media parser Closes #989 2019-06-14 07:34:42 -04:00
			`describe "extract_first_external_url" do`
			`test "extracts the url" do`
			`user = insert(:user)`

			`{:ok, activity} =`
			`CommonAPI.post(user, %{`
Add OpenAPI spec for StatusController 2020-05-12 15:59:26 -04:00			`status:`
Fix hashtags being picked up by rich media parser Closes #989 2019-06-14 07:34:42 -04:00			`"I think I just found the best github repo https://github.com/komeiji-satori/Dress"`
			`})`

			`object = Object.normalize(activity)`
			`{:ok, url} = HTML.extract_first_external_url(object, object.data["content"])`
			`assert url == "https://github.com/komeiji-satori/Dress"`
			`end`

			`test "skips mentions" do`
			`user = insert(:user)`
			`other_user = insert(:user)`

			`{:ok, activity} =`
			`CommonAPI.post(user, %{`
Add OpenAPI spec for StatusController 2020-05-12 15:59:26 -04:00			`status:`
Fix hashtags being picked up by rich media parser Closes #989 2019-06-14 07:34:42 -04:00			`"@#{other_user.nickname} install misskey! https://github.com/syuilo/misskey/blob/develop/docs/setup.en.md"`
			`})`

			`object = Object.normalize(activity)`
			`{:ok, url} = HTML.extract_first_external_url(object, object.data["content"])`

			`assert url == "https://github.com/syuilo/misskey/blob/develop/docs/setup.en.md"`

			`refute url == other_user.ap_id`
			`end`

			`test "skips hashtags" do`
			`user = insert(:user)`

			`{:ok, activity} =`
			`CommonAPI.post(user, %{`
Add OpenAPI spec for StatusController 2020-05-12 15:59:26 -04:00			`status: "#cofe https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140"`
Fix hashtags being picked up by rich media parser Closes #989 2019-06-14 07:34:42 -04:00			`})`

			`object = Object.normalize(activity)`
			`{:ok, url} = HTML.extract_first_external_url(object, object.data["content"])`

			`assert url == "https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140"`
			`end`
Rich Media: Skip Microformats hashtags When fixing this problem I incorrectly assumed a.hashtag is the proper way for detecting hashtags, but it is just something Pleroma and Mastodon add. Per microformats it should be detected by the presense of rel=tag. This MR adds a check for rel=tag, but I still left a.hashtag just in case 2019-06-18 17:31:30 -04:00
			`test "skips microformats hashtags" do`
			`user = insert(:user)`

			`{:ok, activity} =`
			`CommonAPI.post(user, %{`
Add OpenAPI spec for StatusController 2020-05-12 15:59:26 -04:00			`status:`
Rich Media: Skip Microformats hashtags When fixing this problem I incorrectly assumed a.hashtag is the proper way for detecting hashtags, but it is just something Pleroma and Mastodon add. Per microformats it should be detected by the presense of rel=tag. This MR adds a check for rel=tag, but I still left a.hashtag just in case 2019-06-18 17:31:30 -04:00			`"<a href=\"https://pleroma.gov/tags/cofe\" rel=\"tag\">#cofe</a> https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140",`
Add OpenAPI spec for StatusController 2020-05-12 15:59:26 -04:00			`content_type: "text/html"`
Rich Media: Skip Microformats hashtags When fixing this problem I incorrectly assumed a.hashtag is the proper way for detecting hashtags, but it is just something Pleroma and Mastodon add. Per microformats it should be detected by the presense of rel=tag. This MR adds a check for rel=tag, but I still left a.hashtag just in case 2019-06-18 17:31:30 -04:00			`})`

			`object = Object.normalize(activity)`
			`{:ok, url} = HTML.extract_first_external_url(object, object.data["content"])`

			`assert url == "https://www.pixiv.net/member_illust.php?mode=medium&illust_id=72255140"`
			`end`
Fix Pleroma.HTML.extract_first_external_url/2 2019-11-29 03:49:35 -05:00
			`test "does not crash when there is an HTML entity in a link" do`
			`user = insert(:user)`

Add OpenAPI spec for StatusController 2020-05-12 15:59:26 -04:00			`{:ok, activity} = CommonAPI.post(user, %{status: "\"http://cofe.com/?boomer=ok&foo=bar\""})`
Fix Pleroma.HTML.extract_first_external_url/2 2019-11-29 03:49:35 -05:00
			`object = Object.normalize(activity)`

			`assert {:ok, nil} = HTML.extract_first_external_url(object, object.data["content"])`
			`end`
excluding attachment links from RichMedia 2020-06-29 08:25:57 -04:00
			`test "skips attachment links" do`
			`user = insert(:user)`

			`{:ok, activity} =`
			`CommonAPI.post(user, %{`
			`status:`
			`"<a href=\"https://pleroma.gov/media/d24caa3a498e21e0298377a9ca0149a4f4f8b767178aacf837542282e2d94fb1.png?name=image.png\" class=\"attachment\">image.png</a>"`
			`})`

			`object = Object.normalize(activity)`

			`assert {:ok, nil} = HTML.extract_first_external_url(object, object.data["content"])`
			`end`
Fix hashtags being picked up by rich media parser Closes #989 2019-06-14 07:34:42 -04:00			`end`
test: add smoketests for the scrubbing policies 2018-09-21 23:44:19 -04:00			`end`