Spaces:

kokluch
/

phishing-detector-api

Sleeping

phishing-detector-api / url_tools.py

Fix resolve shorten urls to go to last redirection.

c5e53a5 6 months ago

1.58 kB

	import re
	from urllib.parse import urlparse, urlunparse
	import httpx

	def extract_urls(text: str):
	    """Extract URL-like substrings from raw text.

	    A match is an optional ``http://``, ``https://`` or ``www.`` prefix,
	    followed by a dotted host name ending in an alphabetic label of at
	    least two characters, followed by an optional path that runs until
	    the next whitespace, angle bracket or quote character.

	    Parameters:
	    text (str): The text to scan.

	    Returns:
	    list: Every non-overlapping match, in order of appearance, as a
	    string. Empty when nothing matches.
	    """
	    # The prefix alternation must use a bare ``|``. An escaped ``\|`` matches
	    # a literal pipe character, which makes the whole prefix group
	    # unmatchable and strips the scheme from every extracted URL.
	    url_pattern = r"""(?:(?:https?://|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:/[^\s<>"']*)?"""
	    return re.findall(url_pattern, text)

	def extract_domain_from_url(url: str) -> str:
	    """
	    Resolve a URL through ``resolve_short_url`` and return its network location.

	    Two trace lines are printed to stdout: the original URL with the
	    resolved URL, then the resolved URL with the extracted value.

	    Parameters:
	    url (str): The URL to inspect (may be a shortened link).

	    Returns:
	    str: The ``netloc`` component of the resolved URL as parsed by
	    ``urllib.parse.urlparse`` (e.g., 'example.com').
	    """
	    final_url = resolve_short_url(url)
	    print(f"redirect: {url} -> {final_url}")
	    netloc = urlparse(final_url).netloc
	    print(f"domain: {final_url} -> {netloc}")
	    return netloc

	def normalize_url(url: str) -> str:
	    """Return *url* with a scheme, defaulting to ``http`` when none is given.

	    The input is first parsed with ``http`` as the fallback scheme. If that
	    parse yields no network location (as for a bare ``example.com``), the
	    input is re-parsed with ``http://`` prepended so that the host lands in
	    the netloc component. The parsed parts are then reassembled.
	    """
	    components = urlparse(url, scheme="http")
	    if components.netloc:
	        return urlunparse(components)
	    # No netloc: the host was read as a path, so force an explicit scheme.
	    return urlunparse(urlparse("http://" + url))

	def resolve_short_url(url: str, max_redirects: int = 20) -> str:
	    """Follow a chain of HTTP redirects with HEAD requests and return the last URL.

	    The URL is normalized, then requested with automatic redirect handling
	    disabled so that each hop can be inspected. Following stops, and the
	    URL reached so far is returned, when:

	    * the response status is not 301, 302, 303, 307 or 308;
	    * a redirect response carries no ``Location`` header;
	    * the redirect target was already visited (redirect loop);
	    * ``max_redirects`` hops have been followed;
	    * the request fails with ``httpx.RequestError`` (the error is printed).

	    Parameters:
	    url (str): The URL to resolve; a missing scheme defaults to http.
	    max_redirects (int): Upper bound on the number of hops followed.

	    Returns:
	    str: The last URL reached in the redirect chain.
	    """
	    # Function-scope import: the module header only imports urlparse/urlunparse.
	    from urllib.parse import urljoin

	    current = normalize_url(url)
	    visited = {current}
	    try:
	        # One client for the whole chain instead of one per hop.
	        with httpx.Client(follow_redirects=False, timeout=5) as client:
	            for _ in range(max_redirects):
	                response = client.head(current, headers={"User-Agent": "Mozilla/5.0"})
	                if response.status_code not in {301, 302, 303, 307, 308}:
	                    break  # No redirect
	                location = response.headers.get("location")
	                if not location:
	                    break  # Redirect status without a target: nothing to follow
	                # Location may be relative; resolve it against the current URL.
	                target = urljoin(current, location)
	                if target in visited:
	                    break  # Redirect loop
	                visited.add(target)
	                current = target
	    except httpx.RequestError as e:
	        print(f"Error: {e}")
	    return current