From 118c60015e08ed33d80f4d88b99ccc24ca043e55 Mon Sep 17 00:00:00 2001
From: lza_menace
Date: Mon, 3 Apr 2023 21:04:27 -0700
Subject: [PATCH] add mirror to monero infodump site

---
 .gitignore |   1 +
 scrape.py  | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100755 scrape.py

diff --git a/.gitignore b/.gitignore
index d2bd107..24438e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,4 @@ dmypy.json
 
 # nodes
 data
+infodump
\ No newline at end of file
diff --git a/scrape.py b/scrape.py
new file mode 100755
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Mirror the Monero infodump landing page into ./infodump.
+
+Downloads the page and the images it references, points the image and
+i.imgur.com link attributes at the local copies, and writes the result to
+infodump/index.html.  Re-running only fetches files that are still missing.
+"""
+
+import os
+import posixpath
+import sys
+from urllib.parse import quote, unquote, urljoin, urlsplit
+
+import bs4
+import requests
+
+URL = 'https://moneroinfodump.neocities.org/'
+OUT_DIR = 'infodump'
+TIMEOUT = 15  # seconds, applied to every HTTP request
+
+
+def local_target(ref, only_host=None):
+    """Map a page reference to ``(absolute_url, path_below_OUT_DIR)``.
+
+    Returns None for references that must be left untouched: missing,
+    empty or malformed values, inline ``data:`` URIs and other non-HTTP
+    schemes, URLs without a file name, and (when `only_host` is given) URLs
+    served by any other host.
+    """
+    ref = (ref or '').strip()
+    if not ref:
+        return None
+    try:
+        remote = urljoin(URL, ref)
+        parts = urlsplit(remote)
+        absolute = bool(urlsplit(ref).netloc)
+    except ValueError:  # e.g. a malformed IPv6 literal in the host part
+        return None
+    if parts.scheme not in ('http', 'https'):
+        return None
+    if only_host is not None and parts.hostname != only_host:
+        return None
+    if parts.path.endswith('/'):
+        return None
+    # Unquote before normalising so an encoded "../" cannot escape OUT_DIR.
+    path = posixpath.normpath(unquote(parts.path)).lstrip('/')
+    if absolute:
+        # Absolute URLs are flattened to their bare file name.
+        path = posixpath.basename(path)
+    if path in ('', '.'):
+        return None
+    return remote, path
+
+
+def fetch(session, remote, relpath):
+    """Download `remote` to OUT_DIR/relpath; return True if the file exists.
+
+    Existing non-empty files are kept, so interrupted runs can be resumed.
+    A failed download is reported on stderr and writes nothing to disk.
+    """
+    dest = os.path.join(OUT_DIR, *relpath.split('/'))
+    if os.path.isfile(dest) and os.path.getsize(dest) > 0:
+        return True
+    try:
+        response = session.get(remote, timeout=TIMEOUT)
+        response.raise_for_status()
+    except requests.RequestException as err:
+        print(f'skipping {remote}: {err}', file=sys.stderr)
+        return False
+    os.makedirs(os.path.dirname(dest), exist_ok=True)
+    with open(dest, 'wb') as out:
+        out.write(response.content)
+    return True
+
+
+def mirror(session, tag, attr, only_host=None):
+    """Fetch the resource named by ``tag[attr]`` and repoint it locally.
+
+    The attribute is rewritten only once the local copy exists, so a failed
+    download leaves the original remote reference in place.
+    """
+    target = local_target(tag.get(attr), only_host)
+    if target and fetch(session, *target):
+        tag[attr] = quote(target[1])
+
+
+def main():
+    """Mirror the page at URL and its images into OUT_DIR."""
+    os.makedirs(os.path.join(OUT_DIR, 'thumbs'), exist_ok=True)
+    with requests.Session() as session:
+        page = session.get(URL, timeout=TIMEOUT)
+        # Never write an HTTP error page over a previously good mirror.
+        page.raise_for_status()
+        soup = bs4.BeautifulSoup(page.content, 'html.parser')
+
+        for image in soup.find_all('img'):
+            mirror(session, image, 'src')
+        for link in soup.find_all('a'):
+            # Exact host match; a prefix test would also accept look-alikes
+            # such as i.imgur.com.example.org.
+            mirror(session, link, 'href', only_host='i.imgur.com')
+
+    index_path = os.path.join(OUT_DIR, 'index.html')
+    with open(index_path, 'w', encoding='utf-8') as f:
+        f.write(str(soup))
+
+
+if __name__ == '__main__':
+    main()