Tag scraper for giantessbooru.com.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

31 lines
1.1KB

  1. #!/usr/bin/env python3
  2. from playwright.sync_api import sync_playwright
  3. root = 'https://giantessbooru.com/'
  4. cookies = []
  5. with open('cookies.txt') as f:
  6. lines = [line.rstrip().split("=", 1) for line in f]
  7. cookies = [{'name':k, 'value':v, 'url':root} for k,v in lines]
  8. with sync_playwright() as p:
  9. browser = p.chromium.launch()\
  10. .new_context(**p.devices['Desktop Chrome'])
  11. browser.add_cookies(cookies)
  12. page = browser.new_page()
  13. page.goto(root+"post/list")
  14. pages = int(page.locator('#paginator a').nth(-2).inner_text())
  15. for i in range(pages+1):
  16. page.goto(root+f"post/list/{i}")
  17. print(i,page.title())
  18. thumbs = page.locator('#mai > .thumb > a > img')
  19. for i in range(thumbs.count()):
  20. thumb = thumbs.nth(i)
  21. tags = thumb.get_attribute('title').split('//')[0].rstrip().split(' ')
  22. md5 = thumb.get_attribute('src').split('/')[2]
  23. with open(f"data/{md5}.tags.txt",'w',encoding="utf-8") as f:
  24. f.write('\n'.join(tags) + '\n')
  25. page.wait_for_timeout(1000)
  26. browser.close()