"""Download every image referenced by a web page.

The script asks for a page URL, fetches the page, and saves each ``<img>``
it finds -- regular HTTP(S) images as well as inline base64 ``data:``
images -- into ``~/Downloads/downloaded_images``.
"""

import base64
import os
import re
from typing import Union
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Seconds to wait on a server before giving up. requests applies no timeout
# unless one is passed, so an unresponsive host would block the script forever.
REQUEST_TIMEOUT = 30

# data:image/<subtype>[;param=value ...];base64,<payload>
# The subtype ends up in a filename, so it is limited to characters that
# cannot form a path. DOTALL lets the payload span line breaks.
_DATA_URL_RE = re.compile(
    r'data:image/(?P<ext>[A-Za-z0-9.+-]+)(?:;[^;,]+)*?;base64,(?P<data>.+)',
    re.DOTALL,
)


def download_image(url: str, folder_path: str) -> None:
    """Download the image at *url* into *folder_path*.

    The file is named after the last segment of the URL path. The folder is
    created if it does not exist. Network and HTTP errors are reported on
    stdout and the function returns without raising.

    Args:
        url: Absolute HTTP(S) address of the image.
        folder_path: Directory the image is written to.
    """
    os.makedirs(folder_path, exist_ok=True)

    # A path ending in "/" (or consisting only of dots) has no usable
    # basename; fall back to a generic name rather than trying to open the
    # folder itself for writing.
    name = os.path.basename(urlparse(url).path)
    if not name.strip('.'):
        name = 'image'
    filename = os.path.join(folder_path, name)

    try:
        # The context manager releases the streamed connection even when
        # the transfer fails part-way through.
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            response.raise_for_status()  # Check that the request succeeded
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
    except requests.RequestException as e:
        print(f"Failed to retrieve image {url}: {e}")
        return

    print(f"Downloaded: {filename}")


def save_base64_image(data_url: str, folder_path: str, count: int) -> None:
    """Decode a base64 ``data:image/...`` URL and write it to *folder_path*.

    The file is named ``image_<count>.<ext>``, where ``<ext>`` comes from the
    MIME subtype of the data URL (``svg+xml`` becomes ``svg``). Malformed
    data URLs are reported on stdout and skipped.

    Args:
        data_url: The complete ``data:`` URL taken from an ``<img src>``.
        folder_path: Directory the image is written to; created if missing.
        count: Sequence number used to build the filename.
    """
    os.makedirs(folder_path, exist_ok=True)

    match = _DATA_URL_RE.match(data_url)
    if not match:
        # Only the start is printed: the payload can be megabytes long.
        print(f"Invalid base64 image data: {data_url[:30]}...")
        return

    try:
        img_data = base64.b64decode(match.group('data'))
    except ValueError:
        # binascii.Error (bad padding) is a ValueError subclass; non-ASCII
        # input raises a plain ValueError.
        print(f"Invalid base64 image data: {data_url[:30]}...")
        return

    # Structured subtypes such as "svg+xml" map to the plain "svg" extension.
    ext = match.group('ext').split('+', 1)[0]
    filename = os.path.join(folder_path, f'image_{count}.{ext}')
    with open(filename, 'wb') as file:
        file.write(img_data)
    print(f"Downloaded: {filename}")


def download_all_images(
    html_content: Union[str, bytes], base_url: str, folder_path: str
) -> None:
    """Save every ``<img>`` found in *html_content* into *folder_path*.

    Inline base64 images are decoded with ``save_base64_image``. Every other
    ``src`` is resolved against *base_url*, so relative and protocol-relative
    addresses work, and is downloaded when the result is an HTTP(S) URL.
    Anything else is reported and skipped.

    Args:
        html_content: Markup of the page, as text or raw bytes.
        base_url: Address the page was loaded from, used to resolve
            relative image URLs.
        folder_path: Directory the images are written to.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    count = 0  # numbers the inline images, which carry no filename
    for img in soup.find_all('img'):
        img_url = (img.get('src') or '').strip()
        if not img_url:
            continue

        if img_url.startswith('data:image/'):
            # Print only the start of the data URL
            print(f"Attempting to save base64 image: {img_url[:30]}...")
            count += 1
            save_base64_image(img_url, folder_path, count)
            continue

        resolved = urljoin(base_url, img_url)
        if urlparse(resolved).scheme in ('http', 'https'):
            print(f"Attempting to download image: {resolved}")
            download_image(resolved, folder_path)
        else:
            print(f"Ignoring non-http URL: {img_url}")


def main() -> None:
    """Prompt for a page URL and download all images it references."""
    url = input("Enter the URL of the webpage: ").strip()
    folder_path = os.path.expanduser("~/Downloads/downloaded_images")

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Check that the request succeeded
    except requests.RequestException as e:
        print(f"Failed to retrieve webpage {url}: {e}")
        return

    # response.url is the final address after redirects, which is what
    # relative image paths on the page are relative to.
    download_all_images(response.content, response.url, folder_path)


if __name__ == "__main__":
    main()