Code:
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup
from typing import List


def extract_urls(address: str) -> List[str]:
    """Fetch a web page and return the unique absolute URLs it links to."""
    try:
        response = requests.get(address)
        response.raise_for_status()  # Raise an error for 4xx/5xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        urls = set()
        for a in soup.find_all('a', href=True):
            # Resolve relative links against the final page URL;
            # response.url also accounts for any redirects.
            full_url = urljoin(response.url, a['href'])
            # Keep only http(s) links whose host looks like a real domain.
            if full_url.startswith('http') and '.' in urlparse(full_url).netloc:
                urls.add(full_url)
        return list(urls)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []
# Example usage
url = 'https://www.yotamarker.com/f6-extras'
found_urls = extract_urls(url)
print(found_urls)
The function returns the unique URLs linked from a given web page.
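
If the target server is slow, or you only want links that stay on the same site, a small variant can help. The sketch below is an illustration, not part of the original script: the extract_urls_filtered name and the same_domain_only flag are made up here, and the 10-second timeout is an arbitrary choice.

Code:
from urllib.parse import urlparse, urljoin
from typing import List

import requests
from bs4 import BeautifulSoup


def extract_urls_filtered(address: str, same_domain_only: bool = False) -> List[str]:
    """Like extract_urls, but with a request timeout and an optional
    same-domain filter (both illustrative additions)."""
    try:
        # A timeout keeps a stalled server from hanging the script forever.
        response = requests.get(address, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        start_host = urlparse(address).netloc
        urls = set()
        for a in soup.find_all('a', href=True):
            full_url = urljoin(response.url, a['href'])
            host = urlparse(full_url).netloc
            if not (full_url.startswith('http') and '.' in host):
                continue
            # Optionally drop links that leave the start page's domain.
            if same_domain_only and host != start_host:
                continue
            urls.add(full_url)
        return list(urls)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

# Example usage
print(extract_urls_filtered('https://www.yotamarker.com/f6-extras', same_domain_only=True))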