Spaces:
Sleeping
Sleeping
| import re | |
| from urllib.parse import urlparse, urlunparse | |
| import httpx | |
def extract_urls(text: str):
    """Return every URL-like substring of ``text``, in order of appearance.

    A match is an optional ``http://``, ``https://`` or ``www.`` prefix,
    followed by a host made of letters, digits, dots and hyphens that ends
    in a dot plus at least two letters, followed by an optional path. The
    path starts at ``/`` and runs until whitespace, ``<``, ``>`` or a quote
    character.

    Parameters:
        text (str): Raw text to scan.
    Returns:
        list: The matched substrings (empty if nothing matched).
    """
    matcher = re.compile(
        r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    )
    # The pattern has no capturing groups, so group(0) is the whole match.
    return [hit.group(0) for hit in matcher.finditer(text)]
def extract_domain_from_url(url: str) -> str:
    """
    Resolve a URL through ``resolve_short_url`` and return its netloc.

    The URL is first passed to ``resolve_short_url``; the network-location
    component of whatever that returns is the result. Both steps are
    reported on stdout via ``print``.

    Parameters:
        url (str): The URL to inspect.
    Returns:
        str: The ``netloc`` of the resolved URL as parsed by ``urlparse``.
    """
    redirect_url = resolve_short_url(url)
    print(f"redirect: {url} -> {redirect_url}")

    domain = urlparse(redirect_url).netloc
    print(f"domain: {redirect_url} -> {domain}")
    return domain
def normalize_url(url: str) -> str:
    """Return ``url`` with an explicit scheme and network location.

    The URL is parsed with ``http`` as the default scheme. If that parse
    yields a netloc, the parsed parts are reassembled and returned.
    Otherwise (for example a bare ``example.com``) the URL is re-parsed
    with ``http://`` prepended so the host lands in the netloc.
    """
    parts = urlparse(url, scheme="http")
    if parts.netloc:
        return urlunparse(parts)

    # No netloc: the host was read as a path or scheme, so force a prefix.
    return urlunparse(urlparse("http://" + url))
# Status codes treated as redirects that carry a Location header.
_REDIRECT_STATUSES = frozenset({301, 302, 303, 307, 308})


def resolve_short_url(url: str, max_redirects: int = 10) -> str:
    """Follow HTTP redirects with HEAD requests and return the final URL.

    The URL is normalized with ``normalize_url`` and then requested with
    HEAD, without letting httpx follow redirects itself. Each redirect
    response's ``Location`` header is resolved against the URL that
    produced it, so relative targets work, and the walk continues from
    there.

    The walk stops, returning the last URL reached, when:
      * the response is not a redirect,
      * a redirect response has no ``Location`` header,
      * the target was already visited (redirect loop),
      * ``max_redirects`` hops have been followed, or
      * the request fails with ``httpx.RequestError`` (the error is printed).

    Parameters:
        url (str): The URL to resolve; a missing scheme is filled in.
        max_redirects (int): Upper bound on the number of hops to follow.
    Returns:
        str: The last URL reached.
    """
    # Local import: the file's top-level import only provides
    # urlparse/urlunparse.
    from urllib.parse import urljoin

    current = normalize_url(url)
    seen = {current}
    try:
        # One client for the whole chain instead of a new one per hop.
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            for _ in range(max_redirects):
                response = client.head(current, headers={"User-Agent": "Mozilla/5.0"})
                if response.status_code not in _REDIRECT_STATUSES:
                    break  # No redirect
                location = response.headers.get("location")
                if not location:
                    break  # Redirect status without a target: nothing to follow
                # Location may be relative; resolve it against the current URL.
                target = urljoin(current, location)
                if target in seen:
                    break  # Redirect loop
                seen.add(target)
                current = target
    except httpx.RequestError as e:
        print(f"Error: {e}")
    return current