Initial commit.
This commit is contained in:
commit
79efdc1125
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
data/
|
||||
tags.*.txt
|
||||
histogram.txt
|
||||
*.7z
|
2
README.md
Normal file
2
README.md
Normal file
@ -0,0 +1,2 @@
|
||||
# Giantessbooru Tag Scraper
|
||||
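A simple and efficient tag scraper for [Giantessbooru](https://giantessbooru.com). Install the dependency with `pip install -r requirements.txt` (Playwright also needs a browser, e.g. `playwright install chromium`), edit `cookies.txt` to configure the site cookies (one `name=value` pair per line), then run `python scrape.py` or `./scrape.py`. You can download the complete dataset as of 2022-09-28 [here](https://anonfiles.com/b0Q3o9A6y1/data_7z).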
|
4
cookies.txt
Normal file
4
cookies.txt
Normal file
@ -0,0 +1,4 @@
|
||||
agreed=true
|
||||
ShowFurryContent=true
|
||||
ShowLQContent=true
|
||||
ShowMaleContent=true
|
2
histogram.sh
Executable file
2
histogram.sh
Executable file
@ -0,0 +1,2 @@
|
||||
#!/usr/bin/env bash
# Build a tag-frequency histogram from every file under ./data.
#
# Pipeline: concatenate all files, put each space-separated tag on its own
# line, count identical lines, and write the counts to histogram.txt sorted
# by descending frequency.
#
# `-exec cat {} +` passes many files to each `cat` invocation instead of
# spawning one process per file; `tr ' ' '\n'` quotes both operands so the
# space and the newline are unambiguously two separate arguments.
find ./data -type f -exec cat {} + | tr ' ' '\n' | sort | uniq -c | sort -nr -o histogram.txt
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
playwright
|
30
scrape.py
Executable file
30
scrape.py
Executable file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env python3
|
||||
from pathlib import Path

from playwright.sync_api import sync_playwright
|
||||
|
||||
root = 'https://giantessbooru.com/'
DATA_DIR = Path('data')


def load_cookies(path='cookies.txt', url=root):
    """Read ``name=value`` lines from *path* and return Playwright cookie dicts.

    Each returned dict has the keys ``name``, ``value`` and ``url``. Only the
    first ``=`` splits a line, so values may themselves contain ``=``.
    Blank lines and lines without ``=`` (or with an empty name) are skipped
    instead of aborting the whole run with an unpacking error.
    """
    cookies = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            name, sep, value = line.strip().partition('=')
            if not sep or not name:
                continue
            cookies.append({'name': name, 'value': value, 'url': url})
    return cookies


def parse_thumb(title, src):
    """Extract ``(md5, tags)`` from a thumbnail's ``title`` and ``src``.

    *title* is split on ``//`` and the first segment is split on single
    spaces to give the tag list. The identifier is the third ``/``-separated
    component of *src* (the script names it ``md5``; presumably the image
    hash -- confirm against the site's thumbnail URLs).

    Returns ``None`` when either attribute is missing/empty or *src* does not
    have enough path components, so the caller can skip that thumbnail.
    """
    if not title or not src:
        return None
    parts = src.split('/')
    if len(parts) < 3 or not parts[2]:
        return None
    tags = title.split('//')[0].rstrip().split(' ')
    return parts[2], tags


def main():
    """Walk every post-list page and write one tag file per thumbnail.

    For each thumbnail a file ``data/<md5>.tags.txt`` is written with one tag
    per line. The ``data`` directory is created if it does not exist.
    """
    cookies = load_cookies()
    # data/ is git-ignored, so it will not exist on a fresh checkout.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            context = browser.new_context(**p.devices['Desktop Chrome'])
            context.add_cookies(cookies)
            page = context.new_page()

            page.goto(root + "post/list")
            # The second-to-last paginator link's text is read as the page count.
            pages = int(page.locator('#paginator a').nth(-2).inner_text())

            for page_no in range(pages + 1):
                page.goto(root + f"post/list/{page_no}")
                print(page_no, page.title())
                # NOTE(review): selector kept verbatim from the original;
                # confirm '#mai' matches the live markup.
                thumbs = page.locator('#mai > .thumb > a > img')
                for idx in range(thumbs.count()):
                    thumb = thumbs.nth(idx)
                    parsed = parse_thumb(
                        thumb.get_attribute('title'),
                        thumb.get_attribute('src'),
                    )
                    if parsed is None:
                        continue
                    md5, tags = parsed
                    out_path = DATA_DIR / f"{md5}.tags.txt"
                    with open(out_path, 'w', encoding="utf-8") as f:
                        f.write('\n'.join(tags) + '\n')
                # Wait 1000 ms before requesting the next listing page
                # (presumably to limit load on the server).
                page.wait_for_timeout(1000)
        finally:
            # Closes the browser itself, which also closes its contexts/pages.
            browser.close()


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user