Spaces:
Build error
Build error
| import os | |
| import pathlib | |
| import shutil | |
| import subprocess | |
| import tempfile | |
| from loguru import logger | |
| from llm_engineering.domain.documents import RepositoryDocument | |
| from .base import BaseCrawler | |
class GithubCrawler(BaseCrawler):
    """Crawler that clones a Git repository and stores its text files.

    The repository is cloned into a temporary directory, walked with
    ``os.walk``, and every file whose name matches the ``include`` suffixes
    (and none of the ``ignore`` rules) is read into a ``{relative_path:
    content}`` mapping. That mapping is persisted through ``self.model`` in
    one or more documents, split so that the accumulated content of each
    document stays below a fixed byte budget.
    """

    model = RepositoryDocument

    def __init__(
        self,
        include=(
            ".txt",
            ".md",
            ".rst",
            ".json",
            ".yml",
            ".yaml",
            ".xml",
            ".html",
            ".csv",
            ".py",
            ".sh",
            ".cfg",
            ".conf",
            ".js",
            ".css",
            ".scss",
            ".cpp",
            ".hpp",
            ".h",
            ".cc",
            ".hh",
            ".cmake",
            ".bat",
            ".rb",
            ".bash",
            ".qml",
            ".proto",
            ".properties",
            ".template",
            ".in",
            ".inc",
            ".pyi",
            ".typed",
        ),
        ignore=(
            ".git",
            ".toml",
            ".lock",
            ".png",
            ".gitignore",
            ".ico",
            ".jpg",
            ".jpeg",
            ".webp",
            ".svg",
            ".gif",
            ".stl",
            ".dae",
            ".jar",
            ".pdf",
        ),
    ) -> None:
        """Configure which files are collected.

        Args:
            include: File-name suffixes a file must end with to be collected.
            ignore: Strings used both as directory-path prefixes (a directory
                whose repository-relative path starts with one of them is
                skipped) and as file-name suffixes (a file ending with one of
                them is skipped).
        """
        super().__init__()

        self._ignore = ignore
        self._include = include

    def extract(self, link: str, **kwargs) -> None:
        """Clone the repository at ``link`` and persist its text files.

        Nothing is done when ``self.model.find(link=link)`` already returns a
        document. Otherwise the repository is cloned with ``git clone`` into a
        temporary directory that is always removed afterwards.

        Args:
            link: URL of the repository; its last path segment is used as the
                repository name.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            Exception: Any error raised while cloning or walking the
                repository (e.g. ``subprocess.CalledProcessError`` when
                ``git clone`` fails) is logged and re-raised. Errors while
                reading an individual file are logged and skipped instead.
        """
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Repository already exists in the database: {link}")

            return

        logger.info(f"Starting scrapping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]
        local_temp = tempfile.mkdtemp()
        # Maps each encountered file extension to 1; effectively a set of the
        # extensions that were opened, logged at the end of the run.
        file_types = {}

        # str.startswith / str.endswith need tuples; convert once, not per file.
        ignore = tuple(self._ignore)
        include = tuple(self._include)

        try:
            # Run git inside the temp dir via ``cwd`` instead of os.chdir():
            # chdir would change the working directory of the whole process
            # and leave it pointing at a directory that is deleted below.
            subprocess.run(["git", "clone", link], check=True, cwd=local_temp)

            # The clone is the only entry in the freshly created temp dir.
            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118

            tree = {}
            current_size = 0
            # Byte budget per saved document (content only, keys not counted).
            # NOTE(review): presumably the document store's ~16 MB limit minus
            # some headroom -- confirm against the storage backend.
            max_size = 16793598 - 100000

            for root, _, files in os.walk(repo_path):
                # Path of the current directory relative to the repository
                # root; the root itself is represented as "".
                rel_dir = os.path.relpath(root, repo_path)
                if rel_dir == os.curdir:
                    rel_dir = ""

                if rel_dir.startswith(ignore):
                    continue

                for file in files:
                    if file.startswith(".") or file.endswith(ignore):
                        continue
                    if not file.endswith(include):
                        continue

                    file_path = os.path.join(rel_dir, file)  # noqa: PTH118
                    full_file_path = os.path.join(root, file)  # noqa: PTH118

                    try:
                        # Decode as UTF-8 (undecodable bytes are dropped) so
                        # the text matches the UTF-8 size computed below on
                        # every platform.
                        with open(  # noqa: PTH123
                            full_file_path, "r", encoding="utf-8", errors="ignore"
                        ) as f:
                            file_extension = pathlib.Path(full_file_path).suffix
                            file_types[file_extension] = 1
                            # All space characters are stripped from the
                            # stored content.
                            content = f.read().replace(" ", "")

                        file_size = len(content.encode("utf-8"))

                        # Flush the accumulated files before this one would
                        # push the document over budget. Skipped when nothing
                        # is accumulated yet, so a single oversized file does
                        # not cause an empty document to be saved first.
                        if tree and current_size + file_size > max_size:
                            self.save_tree(tree, repo_name, link)
                            tree.clear()
                            current_size = 0

                        tree[file_path] = content
                        current_size += file_size
                    except Exception as e:
                        # Best effort: one unreadable file must not abort the
                        # whole crawl.
                        logger.error(f"Failed to process file {file_path}: {e}")

            # Save any remaining files in the tree.
            if tree:
                self.save_tree(tree, repo_name, link)

        except Exception as e:
            logger.error(f"Error while processing repository: {e}")
            raise
        finally:
            shutil.rmtree(local_temp, ignore_errors=True)

        logger.info(f"Finished scrapping GitHub repository: {link}")
        logger.info(file_types)

    def save_tree(self, tree: dict, repo_name: str, link: str) -> None:
        """Persist one batch of files as a ``self.model`` document.

        Failures while building or saving the document are logged and
        swallowed so that the caller can continue with the next batch.

        Args:
            tree: Mapping of repository-relative file path to file content.
            repo_name: Name stored on the document.
            link: Repository URL stored on the document.
        """
        try:
            instance = self.model(
                content=tree,
                name=repo_name,
                link=link,
                platform="github",
                author_id="46648381-8bf3-4877-b6b4-d48c9de9d870",
                author_full_name="CS370 Project",
            )
            instance.save()
        except Exception as e:
            logger.error(f"Failed to save tree: {e}")