# python_test/web/download_images_gpt4o.py

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import base64
import re

def download_image(url, folder_path, timeout=30):
    """Download the image at ``url`` into ``folder_path``.

    The file name is the last path component of the URL. Failures of the
    HTTP request (including errors while streaming the body) are reported
    with ``print`` and are not raised. If the transfer fails midway, a
    partially written file may remain on disk.

    Args:
        url: Absolute http(s) URL of the image.
        folder_path: Destination directory; created if it does not exist.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the script indefinitely.
    """
    os.makedirs(folder_path, exist_ok=True)

    # A URL path ending in "/" (or in "." / "..") has no usable basename;
    # fall back to a fixed name instead of trying to open a directory.
    name = os.path.basename(urlparse(url).path)
    if name in ('', '.', '..'):
        name = 'image'
    filename = os.path.join(folder_path, name)

    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check that the request succeeded
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(8192):
                    file.write(chunk)
    except requests.RequestException as e:
        print(f"Failed to retrieve image {url}: {e}")
        return

    print(f"Downloaded: {filename}")

def save_base64_image(data_url, folder_path, count):
    """Decode a base64 ``data:image/...`` URL and write it to ``folder_path``.

    The file is named ``image_<count>.<ext>``, where ``<ext>`` is derived
    from the MIME subtype of the data URL. Malformed input is reported with
    ``print`` and never raised.

    Args:
        data_url: String of the form ``data:image/<subtype>;base64,<payload>``.
        folder_path: Destination directory; created if it does not exist.
        count: Number used to build the file name.
    """
    os.makedirs(folder_path, exist_ok=True)

    # DOTALL: payloads may be line-wrapped. Without it ``.+`` stops at the
    # first newline and the image would be silently truncated.
    match = re.match(
        r'data:image/(?P<ext>[^;]+);base64,(?P<data>.+)', data_url, re.DOTALL
    )
    if not match:
        print(f"Invalid base64 image data: {data_url}")
        return

    # The subtype comes from untrusted HTML: drop any "+suffix" (svg+xml ->
    # svg) and every character that could alter the path (".", "/", ...).
    subtype = match.group('ext').split('+')[0]
    ext = re.sub(r'[^A-Za-z0-9_-]', '', subtype) or 'bin'

    try:
        img_data = base64.b64decode(match.group('data'))
    except ValueError as e:  # binascii.Error is a ValueError subclass
        print(f"Invalid base64 image data: {data_url[:30]}... ({e})")
        return

    filename = os.path.join(folder_path, f'image_{count}.{ext}')
    with open(filename, 'wb') as file:
        file.write(img_data)
    print(f"Downloaded: {filename}")

def download_all_images(html_content, base_url, folder_path):
    """Save every ``<img>`` referenced in ``html_content`` to ``folder_path``.

    Inline ``data:image/`` sources are decoded with ``save_base64_image``
    and numbered in order of appearance. Every other ``src`` is resolved
    against ``base_url`` and fetched with ``download_image`` when the result
    is an http(s) URL; anything else is reported and skipped.

    Args:
        html_content: HTML markup of the page (str or bytes).
        base_url: URL of the page, used to resolve relative image paths.
        folder_path: Directory that receives the images.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    count = 0
    for img in soup.find_all('img'):
        img_url = (img.get('src') or '').strip()
        if not img_url:
            continue

        if img_url.startswith('data:image/'):
            print(f"Attempting to save base64 image: {img_url[:30]}...")  # Print only the start of the data URL
            count += 1
            save_base64_image(img_url, folder_path, count)
            continue

        # Resolve first, then check the scheme: relative ("img/a.png") and
        # protocol-relative ("//cdn/a.png") sources only become http(s) URLs
        # after being joined with the page URL.
        img_url = urljoin(base_url, img_url)
        if urlparse(img_url).scheme in ('http', 'https'):
            print(f"Attempting to download image: {img_url}")
            download_image(img_url, folder_path)
        else:
            print(f"Ignoring non-http URL: {img_url}")

def main():
    """Prompt for a page URL and download all images found on that page.

    Images are stored in ``~/Downloads/downloaded_images``. If the page
    cannot be fetched, the error is printed and the function returns.
    """
    # Strip stray whitespace/newlines from pasted input.
    url = input("Enter the URL of the webpage: ").strip()
    folder_path = os.path.expanduser("~/Downloads/downloaded_images")

    try:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check that the request succeeded
    except requests.RequestException as e:
        print(f"Failed to retrieve webpage {url}: {e}")
        return

    # Use the final URL (after any redirects) as the base so that relative
    # image paths resolve against the page that was actually served.
    download_all_images(response.content, response.url, folder_path)

# Run the interactive downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()