commit 79efdc1125f170900d1dfcd7178c4fc981928e0e
Author: Anonymous <>
Date:   Wed Sep 28 21:04:02 2022 +0000

    Initial commit.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0fa9f99
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+data/
+tags.*.txt
+histogram.txt
+*.7z
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a7e5565
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# Giantessbooru Tag Scraper
+A simple and efficient tag scraper for [Giantessbooru](https://giantessbooru.com). Run with `python scrape.py` or `./scrape.py`; edit `cookies.txt` to configure. You can download the complete dataset as of 2022-09-28 [here](https://anonfiles.com/b0Q3o9A6y1/data_7z).
diff --git a/cookies.txt b/cookies.txt
new file mode 100644
index 0000000..70c1625
--- /dev/null
+++ b/cookies.txt
@@ -0,0 +1,4 @@
+agreed=true
+ShowFurryContent=true
+ShowLQContent=true
+ShowMaleContent=true
diff --git a/histogram.sh b/histogram.sh
new file mode 100755
index 0000000..f56c073
--- /dev/null
+++ b/histogram.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+find ./data -type f -exec cat {} \; | tr ' ' '\n' | sort | uniq -c | sort -nr -o histogram.txt
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..508a5f4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+playwright
diff --git a/scrape.py b/scrape.py
new file mode 100755
index 0000000..46324fa
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+from playwright.sync_api import sync_playwright
+
+root = 'https://giantessbooru.com/'
+# Read cookies.txt ("name=value" per line) into Playwright cookie dicts.
+cookies = []
+with open('cookies.txt') as f:
+    lines = [line.rstrip().split("=", 1) for line in f]
+    cookies = [{'name': k, 'value': v, 'url': root} for k, v in lines]
+
+with sync_playwright() as p:
+    browser = p.chromium.launch()\
+        .new_context(**p.devices['Desktop Chrome'])
+    browser.add_cookies(cookies)
+    page = browser.new_page()
+
+    # The second-to-last paginator link holds the total page count.
+    page.goto(root + "post/list")
+    pages = int(page.locator('#paginator a').nth(-2).inner_text())
+    for i in range(pages + 1):
+        page.goto(root + f"post/list/{i}")
+        print(i, page.title())
+        # Tags are the space-separated tokens before '//' in each thumbnail's
+        # title attribute; the md5 comes from the thumbnail's src path.
+        thumbs = page.locator('#mai > .thumb > a > img')
+        for j in range(thumbs.count()):
+            thumb = thumbs.nth(j)
+            tags = thumb.get_attribute('title').split('//')[0].rstrip().split(' ')
+            md5 = thumb.get_attribute('src').split('/')[2]
+            with open(f"data/{md5}.tags.txt", 'w', encoding="utf-8") as f:
+                f.write('\n'.join(tags) + '\n')
+        page.wait_for_timeout(1000)
+
+    browser.close()
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000..f9d6df9
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,9 @@
+let
+  mach-nix = import (builtins.fetchGit {
+    url = "https://github.com/DavHau/mach-nix/";
+    ref = "refs/tags/3.5.0";
+  }) {};
+in
+mach-nix.mkPythonShell {
+  requirements = builtins.readFile ./requirements.txt;
+}
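
For reference, a pure-Python equivalent of histogram.sh could look like the sketch below. It is not part of the commit; it only assumes the layout scrape.py produces (one tag per line in data/<md5>.tags.txt) and writes the same count-descending histogram.txt.

#!/usr/bin/env python3
# Hypothetical companion script (not in the commit): tally tag frequencies
# across the data/ directory that scrape.py populates, mirroring histogram.sh.
from collections import Counter
from pathlib import Path

counts = Counter()
for path in Path("data").glob("*.tags.txt"):
    with path.open(encoding="utf-8") as f:
        counts.update(line.strip() for line in f if line.strip())

with open("histogram.txt", "w", encoding="utf-8") as out:
    for tag, n in counts.most_common():  # most frequent tag first
        out.write(f"{n:7d} {tag}\n")     # roughly the `uniq -c | sort -nr` layout

For the one-tag-per-line files scrape.py writes, the counts should match the shell pipeline's; only the ordering of equal-count tags may differ.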