commit 79efdc1125f170900d1dfcd7178c4fc981928e0e
Author: Anonymous <>
Date:   Wed Sep 28 21:04:02 2022 +0000

    Initial commit.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0fa9f99
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+data/
+tags.*.txt
+histogram.txt
+*.7z
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a7e5565
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# Giantessbooru Tag Scraper
+A simple and efficient tag scraper for [Giantessbooru](https://giantessbooru.com). Run with `python scrape.py` or `./scrape.py`; edit `cookies.txt` to configure. You can download the complete dataset as of 2022-09-28 [here](https://anonfiles.com/b0Q3o9A6y1/data_7z).
diff --git a/cookies.txt b/cookies.txt
new file mode 100644
index 0000000..70c1625
--- /dev/null
+++ b/cookies.txt
@@ -0,0 +1,4 @@
+agreed=true
+ShowFurryContent=true
+ShowLQContent=true
+ShowMaleContent=true
diff --git a/histogram.sh b/histogram.sh
new file mode 100755
index 0000000..f56c073
--- /dev/null
+++ b/histogram.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+find ./data -type f -exec cat {} \; | tr ' ' '\n' | sort | uniq -c | sort -nr -o histogram.txt
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..508a5f4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+playwright
diff --git a/scrape.py b/scrape.py
new file mode 100755
index 0000000..46324fa
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+from playwright.sync_api import sync_playwright
+
+root = 'https://giantessbooru.com/'
+# Read cookies.txt ("name=value" per line) into Playwright cookie dicts.
+cookies = []
+with open('cookies.txt') as f:
+    lines = [line.rstrip().split("=", 1) for line in f]
+    cookies = [{'name': k, 'value': v, 'url': root} for k, v in lines]
+
+with sync_playwright() as p:
+    browser = p.chromium.launch()\
+        .new_context(**p.devices['Desktop Chrome'])
+    browser.add_cookies(cookies)
+    page = browser.new_page()
+
+    # The second-to-last paginator link holds the total page count.
+    page.goto(root + "post/list")
+    pages = int(page.locator('#paginator a').nth(-2).inner_text())
+    for i in range(pages + 1):
+        page.goto(root + f"post/list/{i}")
+        print(i, page.title())
+        # Tags are the space-separated tokens before '//' in each thumbnail's
+        # title attribute; the md5 comes from the thumbnail's src path.
+        thumbs = page.locator('#mai > .thumb > a > img')
+        for j in range(thumbs.count()):
+            thumb = thumbs.nth(j)
+            tags = thumb.get_attribute('title').split('//')[0].rstrip().split(' ')
+            md5 = thumb.get_attribute('src').split('/')[2]
+            with open(f"data/{md5}.tags.txt", 'w', encoding="utf-8") as f:
+                f.write('\n'.join(tags) + '\n')
+        page.wait_for_timeout(1000)
+
+    browser.close()
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000..f9d6df9
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,9 @@
+let
+  mach-nix = import (builtins.fetchGit {
+    url = "https://github.com/DavHau/mach-nix/";
+    ref = "refs/tags/3.5.0";
+  }) {};
+in
+mach-nix.mkPythonShell {
+  requirements = builtins.readFile ./requirements.txt;
+}
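
For reference, a pure-Python equivalent of histogram.sh could look like the sketch below. It is not part of the commit; it only assumes the layout scrape.py produces (one tag per line in data/<md5>.tags.txt) and writes the same count-descending histogram.txt.

#!/usr/bin/env python3
# Hypothetical companion script (not in the commit): tally tag frequencies
# across the data/ directory that scrape.py populates, mirroring histogram.sh.
from collections import Counter
from pathlib import Path

counts = Counter()
for path in Path("data").glob("*.tags.txt"):
    with path.open(encoding="utf-8") as f:
        counts.update(line.strip() for line in f if line.strip())

with open("histogram.txt", "w", encoding="utf-8") as out:
    for tag, n in counts.most_common():  # most frequent tag first
        out.write(f"{n:7d} {tag}\n")     # roughly the `uniq -c | sort -nr` layout

For the one-tag-per-line files scrape.py writes, the counts should match the shell pipeline's; only the ordering of equal-count tags may differ.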