Spaces:
Build error
Build error
| import re | |
| from urllib.parse import urlparse | |
| from loguru import logger | |
| from .base import BaseCrawler | |
| from .custom_article import CustomArticleCrawler | |
| from .github import GithubCrawler | |
class CrawlerDispatcher:
    """Route a URL to the crawler class registered for its domain.

    Crawler classes are stored under a regular expression built from the
    domain they were registered with. ``get_crawler`` instantiates the first
    registered class whose pattern matches the URL and falls back to
    ``CustomArticleCrawler`` when none does.
    """

    def __init__(self) -> None:
        # Maps URL regex pattern -> crawler class; iterated in insertion order.
        self._crawlers: dict = {}

    @classmethod
    def build(cls) -> "CrawlerDispatcher":
        """Return a new dispatcher with no crawlers registered."""
        return cls()

    def register_github(self) -> "CrawlerDispatcher":
        """Register ``GithubCrawler`` for github.com and return ``self`` for chaining."""
        self.register("https://github.com", GithubCrawler)
        return self

    def register(self, domain: str, crawler: "type[BaseCrawler]") -> None:
        """Associate ``crawler`` with every https URL hosted on ``domain``.

        Args:
            domain: URL or bare host name (``"https://github.com"`` or
                ``"github.com"``); only its network location is used.
            crawler: Crawler class, called with no arguments by ``get_crawler``.

        Raises:
            ValueError: If no host can be extracted from ``domain``.
        """
        host = urlparse(domain).netloc
        if not host:
            # Without a scheme urlparse puts the host in ``path``; re-parse it
            # as a network location so "github.com" behaves like the full URL.
            host = urlparse(f"//{domain}").netloc
        if not host:
            # An empty host would yield a pattern matching every https URL.
            raise ValueError(f"Cannot extract a host from domain {domain!r}.")

        # The trailing group requires the host to end right there, so
        # look-alike hosts such as "github.community" do not match.
        pattern = r"https://(www\.)?{}(?:[/:?#]|$)".format(re.escape(host))
        self._crawlers[pattern] = crawler

    def get_crawler(self, url: str) -> "BaseCrawler":
        """Return a new crawler instance for ``url``.

        The first registered pattern that matches wins; when none matches a
        warning is logged and a ``CustomArticleCrawler`` is returned.
        """
        for pattern, crawler in self._crawlers.items():
            if re.match(pattern, url):
                return crawler()

        logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")
        return CustomArticleCrawler()