more github ingesting

This commit is contained in:
mruwnik 2025-12-23 20:02:10 +00:00
parent f729122754
commit 526bfa5f6b
10 changed files with 1924 additions and 9 deletions

View File

@ -206,7 +206,7 @@ services:
<<: *worker-base <<: *worker-base
environment: environment:
<<: *worker-env <<: *worker-env
QUEUES: "backup,email,ebooks,discord,comic,blogs,forums,maintenance,notes,scheduler" QUEUES: "backup,blogs,comic,discord,ebooks,email,forums,github,photo_embed,maintenance,notes,scheduler"
ingest-hub: ingest-hub:
<<: *worker-base <<: *worker-base

View File

@ -44,7 +44,7 @@ RUN git config --global user.email "${GIT_USER_EMAIL}" && \
git config --global user.name "${GIT_USER_NAME}" git config --global user.name "${GIT_USER_NAME}"
# Default queues to process # Default queues to process
ENV QUEUES="backup,ebooks,email,discord,comic,blogs,forums,photo_embed,maintenance" ENV QUEUES="backup,blogs,comic,discord,ebooks,email,forums,github,photo_embed,maintenance"
ENV PYTHONPATH="/app" ENV PYTHONPATH="/app"
ENTRYPOINT ["./entry.sh"] ENTRYPOINT ["./entry.sh"]

View File

@ -351,9 +351,8 @@ class GithubAccountAdmin(ModelView, model=GithubAccount):
"updated_at", "updated_at",
] ]
column_searchable_list = ["name", "id"] column_searchable_list = ["name", "id"]
# Hide sensitive columns from display # Sensitive columns (access_token, private_key) are already excluded from column_list
column_exclude_list = ["access_token", "private_key"] form_excluded_columns = ["repos", "access_token", "private_key"]
form_excluded_columns = ["repos"]
class GithubRepoAdmin(ModelView, model=GithubRepo): class GithubRepoAdmin(ModelView, model=GithubRepo):

View File

@ -1,4 +1,5 @@
from celery import Celery from celery import Celery
from celery.schedules import crontab
from kombu.utils.url import safequote from kombu.utils.url import safequote
from memory.common import settings from memory.common import settings
@ -123,6 +124,12 @@ app.conf.update(
f"{BACKUP_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-backup"}, f"{BACKUP_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-backup"},
f"{GITHUB_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-github"}, f"{GITHUB_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-github"},
}, },
beat_schedule={
"sync-github-repos-hourly": {
"task": SYNC_ALL_GITHUB_REPOS,
"schedule": crontab(minute=0), # Every hour at :00
},
},
) )

View File

@ -58,6 +58,12 @@ ALL_COLLECTIONS: dict[str, Collection] = {
"text": True, "text": True,
"multimodal": True, "multimodal": True,
}, },
"github": {
"dimension": 1024,
"distance": "Cosine",
"text": True,
"multimodal": False,
},
"text": { "text": {
"dimension": 1024, "dimension": 1024,
"distance": "Cosine", "distance": "Cosine",

View File

@ -802,6 +802,23 @@ class MiscDoc(SourceItem):
} }
class GithubItemPayload(SourceItemPayload):
    """Payload fields specific to GitHub items, added on top of ``SourceItemPayload``.

    ``GithubItem.as_payload`` fills each field from the item's attribute of
    the same name. Only ``kind`` and ``repo_path`` are always present; every
    other field may be ``None``.
    """

    kind: Annotated[str, "Type: issue, pr, comment, or project_card"]
    repo_path: Annotated[str, "Repository path (owner/name)"]
    number: Annotated[int | None, "Issue or PR number"]
    state: Annotated[str | None, "State: open, closed, merged"]
    title: Annotated[str | None, "Issue or PR title"]
    author: Annotated[str | None, "Author username"]
    labels: Annotated[list[str] | None, "GitHub labels"]
    assignees: Annotated[list[str] | None, "Assigned users"]
    milestone: Annotated[str | None, "Milestone name"]
    project_status: Annotated[str | None, "GitHub Project status"]
    project_priority: Annotated[str | None, "GitHub Project priority"]
    created_at: Annotated[datetime | None, "Creation date"]
    closed_at: Annotated[datetime | None, "Close date"]
    merged_at: Annotated[datetime | None, "Merge date (PRs only)"]
class GithubItem(SourceItem): class GithubItem(SourceItem):
__tablename__ = "github_item" __tablename__ = "github_item"
@ -854,6 +871,29 @@ class GithubItem(SourceItem):
Index("gh_repo_id_idx", "repo_id"), Index("gh_repo_id_idx", "repo_id"),
) )
@classmethod
def get_collections(cls) -> list[str]:
    """Return the collection names for GitHub items: the single ``github`` collection."""
    collection_names = ["github"]
    return collection_names
def as_payload(self) -> GithubItemPayload:
    """Build this item's payload: the base payload plus the GitHub-specific fields."""
    # Evaluate the parent payload first, exactly as the keyword-unpacking call did.
    base_fields = super().as_payload()
    # Each GitHub field is copied from the attribute of the same name; the
    # casts only narrow column types for the type checker.
    github_fields = {
        "kind": cast(str, self.kind),
        "repo_path": cast(str, self.repo_path),
        "number": cast(int | None, self.number),
        "state": cast(str | None, self.state),
        "title": cast(str | None, self.title),
        "author": cast(str | None, self.author),
        "labels": cast(list[str] | None, self.labels),
        "assignees": cast(list[str] | None, self.assignees),
        "milestone": cast(str | None, self.milestone),
        "project_status": cast(str | None, self.project_status),
        "project_priority": cast(str | None, self.project_priority),
        "created_at": cast(datetime | None, self.created_at),
        "closed_at": cast(datetime | None, self.closed_at),
        "merged_at": cast(datetime | None, self.merged_at),
    }
    return GithubItemPayload(**base_fields, **github_fields)
class NotePayload(SourceItemPayload): class NotePayload(SourceItemPayload):
note_type: Annotated[str | None, "Category of the note"] note_type: Annotated[str | None, "Category of the note"]

View File

@ -10,6 +10,7 @@ from memory.workers.tasks import (
ebook, ebook,
email, email,
forums, forums,
github,
maintenance, maintenance,
notes, notes,
observations, observations,
@ -24,6 +25,7 @@ __all__ = [
"ebook", "ebook",
"discord", "discord",
"forums", "forums",
"github",
"maintenance", "maintenance",
"notes", "notes",
"observations", "observations",

View File

@ -0,0 +1,705 @@
"""Tests for GitHub API client and parser."""
import pytest
from datetime import datetime, timezone
from unittest.mock import Mock, patch, MagicMock
import requests
from memory.parsers.github import (
GithubCredentials,
GithubClient,
GithubIssueData,
GithubComment,
parse_github_date,
compute_content_hash,
)
# =============================================================================
# Tests for utility functions
# =============================================================================
@pytest.mark.parametrize(
    ("date_str", "expected"),
    [
        ("2024-01-15T10:30:00Z", datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc)),
        ("2024-06-20T14:45:30Z", datetime(2024, 6, 20, 14, 45, 30, tzinfo=timezone.utc)),
        (None, None),
        ("", None),
    ],
)
def test_parse_github_date(date_str, expected):
    """GitHub timestamps parse to UTC datetimes; None and "" parse to None."""
    assert parse_github_date(date_str) == expected
def test_compute_content_hash_body_only():
    """With no comments, the hash is stable per body and differs between bodies."""
    body = "This is the body"
    first = compute_content_hash(body, [])
    repeat = compute_content_hash(body, [])
    other = compute_content_hash("Different body", [])

    # Identical input must hash identically.
    assert first == repeat
    # A different body must change the hash.
    assert first != other
def test_compute_content_hash_with_comments():
    """Adding comments to the same body changes the content hash."""
    comment_rows = (
        (1, "user1", "First comment", "2024-01-01T00:00:00Z"),
        (2, "user2", "Second comment", "2024-01-02T00:00:00Z"),
    )
    comments = [
        GithubComment(
            id=comment_id,
            author=login,
            body=text,
            created_at=stamp,
            updated_at=stamp,
        )
        for comment_id, login, text, stamp in comment_rows
    ]

    with_comments = compute_content_hash("Body", comments)
    without_comments = compute_content_hash("Body", [])

    assert with_comments != without_comments
def test_compute_content_hash_empty_body():
    """An empty or None body still yields a full-length SHA256 hex digest."""
    # Compute both digests before asserting, as the checks are independent.
    empty_digest, none_digest = [
        compute_content_hash(body, [])  # type: ignore
        for body in ("", None)
    ]

    # A SHA256 hex digest is 64 characters long.
    assert len(empty_digest) == 64
    assert len(none_digest) == 64
def test_compute_content_hash_comment_order_matters():
    """Swapping the order of two comments produces a different hash."""
    first, second = [
        GithubComment(id=comment_id, author=login, body=text, created_at="", updated_at="")
        for comment_id, login, text in ((1, "a", "First"), (2, "b", "Second"))
    ]

    forward = compute_content_hash("Body", [first, second])
    backward = compute_content_hash("Body", [second, first])

    assert forward != backward
# =============================================================================
# Tests for GithubClient initialization
# =============================================================================
def test_github_client_pat_auth():
    """A PAT-authenticated client sets the bearer token and GitHub API headers."""
    pat_credentials = GithubCredentials(
        auth_type="pat",
        access_token="ghp_test_token",
    )

    # Patch Session.get so constructing the client can never hit the network.
    with patch.object(requests.Session, "get"):
        client = GithubClient(pat_credentials)

    session_headers = client.session.headers
    assert "Bearer ghp_test_token" in session_headers["Authorization"]
    assert session_headers["Accept"] == "application/vnd.github+json"
    assert session_headers["X-GitHub-Api-Version"] == "2022-11-28"
# =============================================================================
# Tests for fetch_issues
# =============================================================================
def test_fetch_issues_basic():
    """One issue with one comment is fetched and mapped to the parsed issue fields."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    def fake_get(url, **kwargs):
        """Serve one issue and one comment on page 1; anything else is empty."""
        reply = Mock()
        reply.headers = {"X-RateLimit-Remaining": "4999"}
        reply.raise_for_status = Mock()

        on_first_page = kwargs.get("params", {}).get("page", 1) == 1
        wants_issues = "/repos/" in url and "/issues" in url and "/comments" not in url
        wants_comments = not wants_issues and "/comments" in url

        payload = []
        if wants_issues and on_first_page:
            # Issues endpoint
            payload = [
                {
                    "number": 1,
                    "title": "Test Issue",
                    "body": "Issue body",
                    "state": "open",
                    "user": {"login": "testuser"},
                    "labels": [{"name": "bug"}],
                    "assignees": [{"login": "dev1"}],
                    "milestone": {"title": "v1.0"},
                    "created_at": "2024-01-01T00:00:00Z",
                    "updated_at": "2024-01-02T00:00:00Z",
                    "closed_at": None,
                    "comments": 2,
                    # Note: Do NOT include "pull_request" key for real issues
                    # The API checks `if "pull_request" in issue` to skip PRs
                }
            ]
        elif wants_comments and on_first_page:
            # Comments endpoint
            payload = [
                {
                    "id": 100,
                    "user": {"login": "commenter"},
                    "body": "A comment",
                    "created_at": "2024-01-01T12:00:00Z",
                    "updated_at": "2024-01-01T12:00:00Z",
                }
            ]
        reply.json.return_value = payload
        return reply

    with patch.object(requests.Session, "get", side_effect=fake_get):
        client = GithubClient(pat_credentials)
        fetched = list(client.fetch_issues("owner", "repo"))

    assert len(fetched) == 1
    only_issue = fetched[0]
    assert only_issue["number"] == 1
    assert only_issue["title"] == "Test Issue"
    assert only_issue["kind"] == "issue"
    assert only_issue["state"] == "open"
    assert only_issue["author"] == "testuser"
    assert only_issue["labels"] == ["bug"]
    assert only_issue["assignees"] == ["dev1"]
    assert only_issue["milestone"] == "v1.0"
    assert len(only_issue["comments"]) == 1
def test_fetch_issues_skips_prs():
    """Entries carrying a "pull_request" key are dropped from the issue listing."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    def listing_entry(number, title, **extra):
        """Build a fresh issue-listing entry; ``extra`` keys are appended last."""
        entry = {
            "number": number,
            "title": title,
            "body": "Body",
            "state": "open",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-01T00:00:00Z",
            "closed_at": None,
            "comments": 0,
        }
        entry.update(extra)
        return entry

    def fake_get(url, **kwargs):
        """Serve both entries on page 1 of the issue listing; anything else is empty."""
        reply = Mock()
        reply.headers = {"X-RateLimit-Remaining": "4999"}
        reply.raise_for_status = Mock()

        page = kwargs.get("params", {}).get("page", 1)
        wants_issues = "/repos/" in url and "/issues" in url and "/comments" not in url
        if wants_issues and page == 1:
            reply.json.return_value = [
                # Real issues don't have "pull_request" key
                listing_entry(1, "Issue"),
                # PRs have this key
                listing_entry(2, "PR posing as issue", pull_request={"url": "https://..."}),
            ]
        else:
            reply.json.return_value = []
        return reply

    with patch.object(requests.Session, "get", side_effect=fake_get):
        client = GithubClient(pat_credentials)
        fetched = list(client.fetch_issues("owner", "repo"))

    assert len(fetched) == 1
    assert fetched[0]["number"] == 1
def test_fetch_issues_with_since_filter():
    """A ``since`` argument is forwarded to the API as a query parameter."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    empty_reply = Mock()
    empty_reply.json.return_value = []
    empty_reply.headers = {"X-RateLimit-Remaining": "4999"}
    empty_reply.raise_for_status = Mock()

    cutoff = datetime(2024, 1, 15, tzinfo=timezone.utc)
    with patch.object(requests.Session, "get", return_value=empty_reply) as mock_get:
        client = GithubClient(pat_credentials)
        list(client.fetch_issues("owner", "repo", since=cutoff))

    # Verify since was passed to API
    sent_params = mock_get.call_args.kwargs.get("params", {})
    assert "since" in sent_params
def test_fetch_issues_with_state_filter():
    """A ``state`` argument is forwarded unchanged as a query parameter."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    empty_reply = Mock()
    empty_reply.json.return_value = []
    empty_reply.headers = {"X-RateLimit-Remaining": "4999"}
    empty_reply.raise_for_status = Mock()

    with patch.object(requests.Session, "get", return_value=empty_reply) as mock_get:
        client = GithubClient(pat_credentials)
        list(client.fetch_issues("owner", "repo", state="closed"))

    sent_params = mock_get.call_args.kwargs.get("params", {})
    assert sent_params.get("state") == "closed"
def test_fetch_issues_with_labels_filter():
    """A list of labels is sent to the API as one comma-joined parameter."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    empty_reply = Mock()
    empty_reply.json.return_value = []
    empty_reply.headers = {"X-RateLimit-Remaining": "4999"}
    empty_reply.raise_for_status = Mock()

    with patch.object(requests.Session, "get", return_value=empty_reply) as mock_get:
        client = GithubClient(pat_credentials)
        list(client.fetch_issues("owner", "repo", labels=["bug", "critical"]))

    sent_params = mock_get.call_args.kwargs.get("params", {})
    assert sent_params.get("labels") == "bug,critical"
# =============================================================================
# Tests for fetch_prs
# =============================================================================
def test_fetch_prs_basic():
    """One open PR is fetched along with a summary taken from its diff."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    def fake_get(url, **kwargs):
        """Serve the PR listing, the raw diff, or an empty list, depending on the URL."""
        reply = Mock()
        reply.headers = {"X-RateLimit-Remaining": "4999"}
        reply.raise_for_status = Mock()
        page = kwargs.get("params", {}).get("page", 1)

        wants_pr_listing = "/pulls" in url and "/comments" not in url
        if wants_pr_listing:
            if page != 1:
                reply.json.return_value = []
            else:
                reply.json.return_value = [
                    {
                        "number": 10,
                        "title": "Add feature",
                        "body": "PR body",
                        "state": "open",
                        "user": {"login": "contributor"},
                        "labels": [{"name": "enhancement"}],
                        "assignees": [{"login": "reviewer"}],
                        "milestone": None,
                        "created_at": "2024-01-05T00:00:00Z",
                        "updated_at": "2024-01-06T00:00:00Z",
                        "closed_at": None,
                        "merged_at": None,
                        "diff_url": "https://github.com/owner/repo/pull/10.diff",
                        "comments": 0,
                    }
                ]
        elif ".diff" in url:
            # Diff responses are read as text, so no JSON payload is configured.
            reply.ok = True
            reply.text = "+100 lines added\n-50 lines removed"
        else:
            # Comment requests and anything unrecognised return nothing.
            reply.json.return_value = []
        return reply

    with patch.object(requests.Session, "get", side_effect=fake_get):
        client = GithubClient(pat_credentials)
        fetched = list(client.fetch_prs("owner", "repo"))

    assert len(fetched) == 1
    only_pr = fetched[0]
    assert only_pr["number"] == 10
    assert only_pr["title"] == "Add feature"
    assert only_pr["kind"] == "pr"
    assert only_pr["diff_summary"] is not None
    assert "100 lines added" in only_pr["diff_summary"]
def test_fetch_prs_merged():
    """Test that a merged PR keeps its closed state and parsed ``merged_at``.

    The mocked session returns one merged PR, then empty responses for the
    follow-up requests made by the client.
    """
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = [
        {
            "number": 20,
            "title": "Merged PR",
            "body": "Body",
            "state": "closed",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-10T00:00:00Z",
            "closed_at": "2024-01-10T00:00:00Z",
            "merged_at": "2024-01-10T00:00:00Z",
            "additions": 10,
            "deletions": 5,
            "comments": 0,
        }
    ]
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    mock_empty = Mock()
    mock_empty.json.return_value = []
    mock_empty.headers = {"X-RateLimit-Remaining": "4998"}
    mock_empty.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [mock_response, mock_empty, mock_empty]
        client = GithubClient(credentials)
        prs = list(client.fetch_prs("owner", "repo"))

    # Check the count before indexing: an empty result then fails as a clear
    # assertion instead of an IndexError, and an extra PR cannot go unnoticed.
    assert len(prs) == 1
    pr = prs[0]
    assert pr["state"] == "closed"
    assert pr["merged_at"] == datetime(2024, 1, 10, 0, 0, 0, tzinfo=timezone.utc)
def test_fetch_prs_stops_at_since():
    """PRs last updated before ``since`` are not returned."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    def pr_entry(number, title, stamp):
        """Build an open PR whose created and updated timestamps are both ``stamp``."""
        return {
            "number": number,
            "title": title,
            "body": "Body",
            "state": "open",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": stamp,
            "updated_at": stamp,
            "closed_at": None,
            "merged_at": None,
            "additions": 1,
            "deletions": 1,
            "comments": 0,
        }

    listing_reply = Mock()
    listing_reply.json.return_value = [
        pr_entry(30, "Recent PR", "2024-01-20T00:00:00Z"),
        # Older than since
        pr_entry(29, "Old PR", "2024-01-01T00:00:00Z"),
    ]
    listing_reply.headers = {"X-RateLimit-Remaining": "4999"}
    listing_reply.raise_for_status = Mock()

    empty_reply = Mock()
    empty_reply.json.return_value = []
    empty_reply.headers = {"X-RateLimit-Remaining": "4998"}
    empty_reply.raise_for_status = Mock()

    cutoff = datetime(2024, 1, 15, tzinfo=timezone.utc)
    with patch.object(
        requests.Session, "get", side_effect=[listing_reply, empty_reply]
    ):
        client = GithubClient(pat_credentials)
        fetched = list(client.fetch_prs("owner", "repo", since=cutoff))

    # Should only get the recent PR, stop at the old one
    assert len(fetched) == 1
    assert fetched[0]["number"] == 30
# =============================================================================
# Tests for fetch_comments
# =============================================================================
def test_fetch_comments_pagination():
    """Comments spread over two pages are collected in order."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    def page_reply(entries, remaining):
        """Build a mocked response whose JSON body is ``entries``."""
        reply = Mock()
        reply.json.return_value = entries
        reply.headers = {"X-RateLimit-Remaining": remaining}
        reply.raise_for_status = Mock()
        return reply

    def comment_entry(comment_id, login, text, stamp):
        """Build one raw comment with matching created/updated timestamps."""
        return {
            "id": comment_id,
            "user": {"login": login},
            "body": text,
            "created_at": stamp,
            "updated_at": stamp,
        }

    replies = [
        # First page of comments
        page_reply([comment_entry(1, "user1", "Comment 1", "2024-01-01T00:00:00Z")], "4999"),
        # Second page of comments
        page_reply([comment_entry(2, "user2", "Comment 2", "2024-01-02T00:00:00Z")], "4998"),
        # Empty page to stop
        page_reply([], "4997"),
    ]

    with patch.object(requests.Session, "get", side_effect=replies):
        client = GithubClient(pat_credentials)
        fetched = client.fetch_comments("owner", "repo", 1)

    assert len(fetched) == 2
    assert fetched[0]["author"] == "user1"
    assert fetched[1]["author"] == "user2"
def test_fetch_comments_handles_ghost_user():
    """A comment whose user is None is attributed to "ghost"."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    def page_reply(entries, remaining):
        """Build a mocked response whose JSON body is ``entries``."""
        reply = Mock()
        reply.json.return_value = entries
        reply.headers = {"X-RateLimit-Remaining": remaining}
        reply.raise_for_status = Mock()
        return reply

    orphaned_comment = {
        "id": 1,
        "user": None,  # Deleted user
        "body": "Comment from ghost",
        "created_at": "2024-01-01T00:00:00Z",
        "updated_at": "2024-01-01T00:00:00Z",
    }
    replies = [page_reply([orphaned_comment], "4999"), page_reply([], "4998")]

    with patch.object(requests.Session, "get", side_effect=replies):
        client = GithubClient(pat_credentials)
        fetched = client.fetch_comments("owner", "repo", 1)

    assert len(fetched) == 1
    assert fetched[0]["author"] == "ghost"
# =============================================================================
# Tests for rate limiting
# =============================================================================
def test_rate_limit_handling():
    """With zero requests remaining, the client sleeps before continuing."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    # The rate-limit window resets one second from now.
    reset_at = int(datetime.now(timezone.utc).timestamp()) + 1
    exhausted_reply = Mock()
    exhausted_reply.json.return_value = []
    exhausted_reply.headers = {
        "X-RateLimit-Remaining": "0",
        "X-RateLimit-Reset": str(reset_at),
    }
    exhausted_reply.raise_for_status = Mock()

    # time.sleep is patched so the test does not actually wait.
    with patch.object(
        requests.Session, "get", return_value=exhausted_reply
    ), patch("time.sleep") as mock_sleep:
        client = GithubClient(pat_credentials)
        list(client.fetch_issues("owner", "repo"))

    # Should have waited due to rate limit
    mock_sleep.assert_called()
# =============================================================================
# Tests for project fields
# =============================================================================
def test_fetch_project_fields():
    """Projects v2 field values come back keyed as "<project title>.<field name>"."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    # Build the GraphQL response from the inside out.
    field_values = [
        {"field": {"name": "Status"}, "name": "In Progress"},
        {"field": {"name": "Priority"}, "text": "High"},
    ]
    project_item = {
        "project": {"title": "Sprint Board"},
        "fieldValues": {"nodes": field_values},
    }
    graphql_body = {
        "data": {
            "repository": {
                "issue": {"projectItems": {"nodes": [project_item]}}
            }
        }
    }

    graphql_reply = Mock()
    graphql_reply.json.return_value = graphql_body
    graphql_reply.headers = {"X-RateLimit-Remaining": "4999"}
    graphql_reply.raise_for_status = Mock()

    with patch.object(requests.Session, "post", return_value=graphql_reply):
        client = GithubClient(pat_credentials)
        fields = client.fetch_project_fields("owner", "repo", 1)

    assert fields is not None
    # Fields are prefixed with project name
    expected_fields = {
        "Sprint Board.Status": "In Progress",
        "Sprint Board.Priority": "High",
    }
    for field_key, field_value in expected_fields.items():
        assert field_key in fields
        assert fields[field_key] == field_value
def test_fetch_project_fields_not_in_project():
    """An issue with no project items yields None."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    no_projects_body = {
        "data": {"repository": {"issue": {"projectItems": {"nodes": []}}}}
    }
    graphql_reply = Mock()
    graphql_reply.json.return_value = no_projects_body
    graphql_reply.headers = {"X-RateLimit-Remaining": "4999"}
    graphql_reply.raise_for_status = Mock()

    with patch.object(requests.Session, "post", return_value=graphql_reply):
        client = GithubClient(pat_credentials)
        fields = client.fetch_project_fields("owner", "repo", 1)

    assert fields is None
def test_fetch_project_fields_graphql_error():
    """A GraphQL error response with null data yields None, not an exception."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    error_body = {
        "errors": [{"message": "Something went wrong"}],
        "data": None,
    }
    graphql_reply = Mock()
    graphql_reply.json.return_value = error_body
    graphql_reply.headers = {"X-RateLimit-Remaining": "4999"}
    graphql_reply.raise_for_status = Mock()

    with patch.object(requests.Session, "post", return_value=graphql_reply):
        client = GithubClient(pat_credentials)
        fields = client.fetch_project_fields("owner", "repo", 1)

    assert fields is None
# =============================================================================
# Tests for error handling
# =============================================================================
def test_fetch_issues_handles_api_error():
    """An HTTP error raised by ``raise_for_status`` propagates out of fetch_issues."""
    pat_credentials = GithubCredentials(auth_type="pat", access_token="token")

    failing_reply = Mock()
    failing_reply.raise_for_status.side_effect = requests.HTTPError("404 Not Found")

    with patch.object(requests.Session, "get", return_value=failing_reply):
        client = GithubClient(pat_credentials)
        # The generator is consumed inside the raises block so the request runs there.
        with pytest.raises(requests.HTTPError):
            list(client.fetch_issues("owner", "nonexistent"))

File diff suppressed because it is too large Load Diff

View File

@ -15,6 +15,8 @@ Usage:
python run_celery_task.py blogs sync-webpage --url "https://example.com" python run_celery_task.py blogs sync-webpage --url "https://example.com"
python run_celery_task.py comic sync-all-comics python run_celery_task.py comic sync-all-comics
python run_celery_task.py forums sync-lesswrong --since-date "2025-01-01" --min-karma 10 --limit 50 --cooldown 0.5 --max-items 1000 python run_celery_task.py forums sync-lesswrong --since-date "2025-01-01" --min-karma 10 --limit 50 --cooldown 0.5 --max-items 1000
python run_celery_task.py github sync-all-repos
python run_celery_task.py github sync-repo --repo-id 1 --force-full
""" """
import json import json
@ -51,8 +53,10 @@ from memory.common.celery_app import (
UPDATE_METADATA_FOR_SOURCE_ITEMS, UPDATE_METADATA_FOR_SOURCE_ITEMS,
SETUP_GIT_NOTES, SETUP_GIT_NOTES,
TRACK_GIT_CHANGES, TRACK_GIT_CHANGES,
BACKUP_TO_S3_DIRECTORY, BACKUP_PATH,
BACKUP_ALL, BACKUP_ALL,
SYNC_GITHUB_REPO,
SYNC_ALL_GITHUB_REPOS,
app, app,
) )
@ -100,9 +104,13 @@ TASK_MAPPINGS = {
"track_git_changes": TRACK_GIT_CHANGES, "track_git_changes": TRACK_GIT_CHANGES,
}, },
"backup": { "backup": {
"backup_to_s3_directory": BACKUP_TO_S3_DIRECTORY, "backup_path": BACKUP_PATH,
"backup_all": BACKUP_ALL, "backup_all": BACKUP_ALL,
}, },
"github": {
"sync_all_repos": SYNC_ALL_GITHUB_REPOS,
"sync_repo": SYNC_GITHUB_REPO,
},
} }
QUEUE_MAPPINGS = { QUEUE_MAPPINGS = {
"email": "email", "email": "email",
@ -200,9 +208,9 @@ def backup_all(ctx):
@backup.command("path") @backup.command("path")
@click.option("--path", required=True, help="Path to backup") @click.option("--path", required=True, help="Path to backup")
@click.pass_context @click.pass_context
def backup_to_s3_directory(ctx, path): def backup_path_cmd(ctx, path):
"""Backup a specific path.""" """Backup a specific path."""
execute_task(ctx, "backup", "backup_to_s3_directory", path=path) execute_task(ctx, "backup", "backup_path", path=path)
@cli.group() @cli.group()
@ -533,5 +541,28 @@ def forums_sync_lesswrong_post(ctx, url):
execute_task(ctx, "forums", "sync_lesswrong_post", url=url) execute_task(ctx, "forums", "sync_lesswrong_post", url=url)
@cli.group()
@click.pass_context
def github(ctx):
    """GitHub-related tasks."""
    # Pure command group: the docstring is the body, subcommands do the work.
@github.command("sync-all-repos")
@click.pass_context
def github_sync_all_repos(ctx):
    """Sync all active GitHub repos."""
    # Category and task name are the keys looked up in TASK_MAPPINGS.
    category, task_name = "github", "sync_all_repos"
    execute_task(ctx, category, task_name)
@github.command("sync-repo")
@click.option("--repo-id", type=int, required=True, help="GitHub repo ID")
@click.option(
    "--force-full",
    is_flag=True,
    help="Force a full sync instead of incremental",
)
@click.pass_context
def github_sync_repo(ctx, repo_id, force_full):
    """Sync a specific GitHub repo."""
    # Both CLI options are passed through as keyword arguments for the task.
    task_kwargs = {"repo_id": repo_id, "force_full": force_full}
    execute_task(ctx, "github", "sync_repo", **task_kwargs)
if __name__ == "__main__": if __name__ == "__main__":
cli() cli()