6 Commits

Author SHA1 Message Date
Joakim Persson 82c7887cf7 Lade till requests 2024-08-13 15:27:36 +02:00
Joakim Persson 5f546a848e Beroenden som behövs till appar i /web 2024-08-13 15:20:10 +02:00
Joakim Persson 7eb889a889 Nedladdning av bilder från websida. Kodbasen från GPT 4 Omni 2024-08-13 15:15:04 +02:00
Joakim Persson 6dab09c861 Fixat så bilderna landar i hämtade filer. Tar bort skalningsinformation m.m. efter filnamnet så att suffix blir korrekt. 2024-08-13 15:13:59 +02:00
Joakim Persson 8288450662 Lade till .DS_Store 2024-08-13 14:31:21 +02:00
Joakim Persson ae67b9c7b3 Test med att hämta alla bilder på hemsida 2024-08-13 14:30:34 +02:00
4 changed files with 142 additions and 1 deletions
+1
View File
@@ -162,3 +162,4 @@ cython_debug/
# Exclude venv from smartassist
smartassist/smartassist_dev_venv
.DS_Store
+76
View File
@@ -0,0 +1,76 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import base64
import re
def download_image(url, folder_path):
    """Download the image at *url* into *folder_path*.

    The file is named after the last path component of the URL (query string
    and fragment are ignored). The target folder is created when missing.
    Failures are reported on stdout and never raised, so one broken image
    does not abort a batch of downloads.

    Args:
        url: Absolute http(s) URL of the image.
        folder_path: Directory the image is written to.
    """
    os.makedirs(folder_path, exist_ok=True)

    # A URL such as "https://host/gallery/" has no file name; opening the
    # bare folder path for writing would raise, so skip it up front.
    name = os.path.basename(urlparse(url).path)
    if not name:
        print(f"Failed to retrieve image {url}: no file name in URL")
        return
    filename = os.path.join(folder_path, name)

    try:
        # The context manager releases the streamed connection; the timeout
        # keeps an unresponsive server from hanging the script forever.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Check that the request succeeded
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
    except requests.RequestException as e:
        print(f"Failed to retrieve image {url}: {e}")
        return
    print(f"Downloaded: {filename}")
def save_base64_image(data_url, folder_path, count):
    """Decode a base64 ``data:image/...`` URL and save it as a file.

    The file is written to *folder_path* as ``image_<count>.<ext>``, where
    the extension is derived from the MIME subtype (``svg+xml`` becomes
    ``svg``). The folder is created when missing. Input that is not a
    base64 image data URL, or whose payload cannot be decoded, is reported
    on stdout and nothing is written.

    Args:
        data_url: The full data URL, e.g. ``data:image/png;base64,iVBOR...``.
        folder_path: Directory the image is written to.
        count: Number used to build a unique file name.
    """
    os.makedirs(folder_path, exist_ok=True)

    match = re.match(r'data:image/(?P<ext>[^;]+);base64,(?P<data>.+)', data_url)
    if not match:
        print(f"Invalid base64 image data: {data_url}")
        return

    try:
        img_data = base64.b64decode(match.group('data'))
    except ValueError:  # binascii.Error (bad padding etc.) is a ValueError
        print(f"Invalid base64 image data: {data_url}")
        return

    # Drop structured-syntax suffixes ("svg+xml" -> "svg") and any character
    # that is not plain alphanumeric, so the subtype can never smuggle path
    # separators into the file name.
    ext = re.sub(r'[^A-Za-z0-9]', '', match.group('ext').split('+', 1)[0]) or 'bin'

    filename = os.path.join(folder_path, f'image_{count}.{ext}')
    with open(filename, 'wb') as file:
        file.write(img_data)
    print(f"Downloaded: {filename}")
def download_all_images(html_content, base_url, folder_path):
    """Save every ``<img>`` found in *html_content* to *folder_path*.

    Inline ``data:image/...`` sources are decoded with ``save_base64_image``.
    All other sources are resolved against *base_url*, so relative and
    protocol-relative paths work too, and fetched with ``download_image``
    when the result is an http(s) URL. Anything else is reported and skipped.

    Args:
        html_content: HTML markup of the page (str or bytes).
        base_url: URL the page was loaded from; used to resolve relative
            image paths.
        folder_path: Directory the images are written to.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    count = 0  # numbers the inline images so their file names stay unique
    for img in soup.find_all('img'):
        src = (img.get('src') or '').strip()
        if not src:
            continue

        if src.startswith('data:image/'):
            # Print only the start of the data URL
            print(f"Attempting to save base64 image: {src[:30]}...")
            count += 1
            save_base64_image(src, folder_path, count)
            continue

        # urljoin leaves absolute URLs untouched and resolves "/a.png",
        # "//cdn/a.png" and "img/a.png" against the page URL.
        img_url = urljoin(base_url, src)
        if urlparse(img_url).scheme in ('http', 'https'):
            print(f"Attempting to download image: {img_url}")
            download_image(img_url, folder_path)
        else:
            print(f"Ignoring non-http URL: {src}")
def main():
    """Ask for a page URL and download all of its images.

    Images are stored in ``~/Downloads/downloaded_images``. A URL typed
    without a scheme is assumed to be ``http://``. If the page itself cannot
    be fetched the error is printed and the function returns.
    """
    url = input("Enter the URL of the webpage: ").strip()
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    folder_path = os.path.expanduser("~/Downloads/downloaded_images")
    try:
        # The timeout keeps an unresponsive server from blocking forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check that the request succeeded
    except requests.RequestException as e:
        print(f"Failed to retrieve webpage {url}: {e}")
        return
    download_all_images(response.content, url, folder_path)


if __name__ == "__main__":
    main()
+62
View File
@@ -0,0 +1,62 @@
import os
import re
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import base64
def _spara_inline_bild(data_url, bild_katalog, index):
    """Decode one base64 ``data:`` URL into *bild_katalog*; return the file name or None."""
    huvud, _, kodad = data_url.partition(';base64,')
    if not kodad:
        return None
    try:
        data = base64.b64decode(kodad)
    except ValueError:  # binascii.Error (bad padding etc.) is a ValueError
        print(f"Kunde inte avkoda inline-bild nummer {index}.")
        return None
    # "data:image/svg+xml" -> "svg"; keep only alphanumerics so the MIME
    # subtype can never inject path separators into the file name.
    subtyp = huvud.rpartition('/')[2].split('+', 1)[0]
    andelse = re.sub(r'[^A-Za-z0-9]', '', subtyp) or 'bin'
    filnamn = f'inline_{index}.{andelse}'
    with open(os.path.join(bild_katalog, filnamn), 'wb') as f:
        f.write(data)
    return filnamn


def _hamta_bild(bild_url, bild_katalog):
    """Fetch one http(s) image into *bild_katalog*; return the file name or None."""
    if not bild_url.startswith(('http://', 'https://')):
        return None
    # Strip fragment and query (scaling parameters etc.) before taking the
    # last path component, so the saved file keeps its real suffix.
    filnamn = os.path.basename(bild_url.split('#', 1)[0].split('?', 1)[0])
    if not filnamn:
        print(f"Hoppar över {bild_url}: inget filnamn i URL:en.")
        return None
    try:
        svar = requests.get(bild_url, timeout=30)
    except requests.RequestException as e:
        # One unreachable image must not abort the remaining downloads.
        print(f"Kunde inte hämta {bild_url}: {e}")
        return None
    if svar.status_code != 200:
        return None
    with open(os.path.join(bild_katalog, filnamn), 'wb') as f:
        f.write(svar.content)
    return filnamn


def ladda_ner_bilder(url):
    """Download every image referenced by the page at *url*.

    Image sources are collected from ``<img src>`` attributes and from
    base64 ``data:image/...`` URLs found anywhere in the markup. Relative
    sources are resolved against *url*. Files are written to
    ``~/Downloads/bilder`` (created when missing); inline images are named
    ``inline_<n>.<ext>`` with the extension taken from their MIME subtype.
    A message is printed for each saved file.

    Args:
        url: Absolute URL of the page to scan.

    Raises:
        requests.RequestException: If the page itself cannot be fetched or
            answers with an HTTP error status.
    """
    # Fetch the HTML page
    svar = requests.get(url, timeout=30)
    svar.raise_for_status()
    soup = BeautifulSoup(svar.text, 'html.parser')

    # Collect all <img> sources
    bilder = [
        img.get('src').strip()
        for img in soup.find_all('img')
        if img.get('src')
    ]

    # Inline base64 images anywhere in the markup. The payload pattern is
    # limited to base64 characters so a match stops at the closing quote
    # instead of swallowing the rest of the line.
    inline_monster = re.compile(r'data:image/[A-Za-z0-9.+-]+;base64,[A-Za-z0-9+/=]+')
    bilder.extend(m.group(0) for m in inline_monster.finditer(svar.text))

    # An inline <img src> is found by both passes; keep the first occurrence.
    bilder = list(dict.fromkeys(bilder))

    # Download the images
    bild_katalog = os.path.expanduser("~/Downloads/bilder")
    os.makedirs(bild_katalog, exist_ok=True)

    inline_nr = 0  # per-image counter so inline files do not overwrite each other
    for kalla in bilder:
        if kalla.startswith('data:'):
            inline_nr += 1
            filnamn = _spara_inline_bild(kalla, bild_katalog, inline_nr)
        else:
            filnamn = _hamta_bild(urljoin(url, kalla), bild_katalog)
        if filnamn:
            print(f"Bilden {filnamn} har laddats ner till {bild_katalog}.")
def main():
    """Ask for a page URL and download its images with ``ladda_ner_bilder``.

    A URL typed without a scheme is assumed to be ``http://``. Any error
    raised during the download is printed instead of propagating, since
    this is the script's top-level boundary.
    """
    url = input("Ange URL till sidan från vilken du vill hämta bilder: ").strip()
    # Test for the full scheme prefix: a bare startswith('http') would treat
    # host names such as "httpbin.org" as already having a scheme.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    try:
        ladda_ner_bilder(url)
    except Exception as e:
        print(f"Fel inträffade: {e}")


if __name__ == "__main__":
    main()
+2
View File
@@ -0,0 +1,2 @@
BeautifulSoup4
requests