Browse Source

Initial commit.

master
Anonymous 1 year ago
commit
79efdc1125
7 changed files with 52 additions and 0 deletions
  1. +4
    -0
      .gitignore
  2. +2
    -0
      README.md
  3. +4
    -0
      cookies.txt
  4. +2
    -0
      histogram.sh
  5. +1
    -0
      requirements.txt
  6. +30
    -0
      scrape.py
  7. +9
    -0
      shell.nix

+ 4
- 0
.gitignore View File

@@ -0,0 +1,4 @@
data/
tags.*.txt
histogram.txt
*.7z

+ 2
- 0
README.md View File

@@ -0,0 +1,2 @@
# Giantessbooru Tag Scraper
A simple and efficient tag scraper for [Giantessbooru](https://giantessbooru.com). Run with `python scrape.py` or `./scrape.py` and edit `cookies.txt` to configure. You can download the complete dataset as of 2022-09-28 [here](https://anonfiles.com/b0Q3o9A6y1/data_7z).

+ 4
- 0
cookies.txt View File

@@ -0,0 +1,4 @@
agreed=true
ShowFurryContent=true
ShowLQContent=true
ShowMaleContent=true

+ 2
- 0
histogram.sh View File

@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Build a token-frequency histogram from every regular file under ./data.
#
# Pipeline: concatenate all files, turn spaces into newlines so each token
# sits on its own line, sort so identical lines are adjacent, count them with
# `uniq -c`, then sort by count (numeric, descending) into histogram.txt.
#
# `-exec cat {} +` passes many files to each cat invocation instead of forking
# one process per file as `\;` does; the concatenated output is the same.
find ./data -type f -exec cat {} + | tr ' ' '\n' | sort | uniq -c | sort -nr -o histogram.txt

+ 1
- 0
requirements.txt View File

@@ -0,0 +1 @@
playwright

+ 30
- 0
scrape.py View File

@@ -0,0 +1,30 @@
#!/usr/bin/env python3
import os

from playwright.sync_api import sync_playwright

# Base URL of the site; also used as the URL scope of every cookie below.
root = 'https://giantessbooru.com/'

# Read cookies.txt, one `name=value` pair per line, into the dict shape that
# BrowserContext.add_cookies() accepts. Blank lines and lines without '=' are
# skipped so a stray empty line does not abort the run with an unpacking error.
cookies = []
with open('cookies.txt') as f:
    for line in f:
        name, sep, value = line.rstrip().partition('=')
        if not sep:
            continue
        cookies.append({'name': name, 'value': value, 'url': root})

# data/ is git-ignored, so it does not exist on a fresh checkout; create it up
# front instead of failing on the first write.
os.makedirs('data', exist_ok=True)

with sync_playwright() as p:
    browser = p.chromium.launch()
    try:
        context = browser.new_context(**p.devices['Desktop Chrome'])
        context.add_cookies(cookies)
        page = context.new_page()

        # The text of the paginator link at index -2 is parsed as the page
        # count (presumably the last numbered link, before a trailing
        # "next"-style link -- confirm against the site markup).
        page.goto(root + "post/list")
        pages = int(page.locator('#paginator a').nth(-2).inner_text())

        # NOTE(review): the range starts at 0, so post/list/0 is requested in
        # addition to pages 1..pages. Output files are keyed by md5, so any
        # duplicate listing only rewrites the same files.
        for page_no in range(pages + 1):
            page.goto(root + f"post/list/{page_no}")
            print(page_no, page.title())

            # NOTE(review): selector kept verbatim from the original; '#mai'
            # may be a typo for another id -- confirm against the live page.
            thumbs = page.locator('#mai > .thumb > a > img')
            for idx in range(thumbs.count()):
                thumb = thumbs.nth(idx)
                title = thumb.get_attribute('title')
                src = thumb.get_attribute('src')
                # get_attribute() returns None when the attribute is absent;
                # skip such thumbnails rather than crash the whole scrape.
                if title is None or src is None:
                    continue

                # Tags are the space-separated text before the first '//' in
                # the title attribute.
                tags = title.split('//')[0].rstrip().split(' ')
                # The third '/'-separated component of src names the output
                # file (assumes src contains the image hash there -- TODO
                # confirm against the site's thumbnail URLs).
                md5 = src.split('/')[2]
                with open(f"data/{md5}.tags.txt", 'w', encoding="utf-8") as f:
                    f.write('\n'.join(tags) + '\n')

            # Pause one second between listing pages.
            page.wait_for_timeout(1000)
    finally:
        # Closing the browser also closes the context and page, including
        # when an error interrupts the scrape.
        browser.close()

+ 9
- 0
shell.nix View File

@@ -0,0 +1,9 @@
# Development shell: a Python environment that mach-nix builds from the
# contents of requirements.txt.
let
  # mach-nix sources fetched from the 3.5.0 tag.
  machNixSrc = builtins.fetchGit {
    url = "https://github.com/DavHau/mach-nix/";
    ref = "refs/tags/3.5.0";
  };
  machNix = import machNixSrc { };
in
machNix.mkPythonShell {
  requirements = builtins.readFile ./requirements.txt;
}

Loading…
Cancel
Save