mirror of https://github.com/mruwnik/memory.git
synced 2026-01-02 09:12:58 +01:00

commit 526bfa5f6b (parent f729122754)

    more github ingesting
@@ -206,7 +206,7 @@ services:
     <<: *worker-base
     environment:
       <<: *worker-env
-      QUEUES: "backup,email,ebooks,discord,comic,blogs,forums,maintenance,notes,scheduler"
+      QUEUES: "backup,blogs,comic,discord,ebooks,email,forums,github,photo_embed,maintenance,notes,scheduler"

   ingest-hub:
     <<: *worker-base
@@ -44,7 +44,7 @@ RUN git config --global user.email "${GIT_USER_EMAIL}" && \
     git config --global user.name "${GIT_USER_NAME}"

 # Default queues to process
-ENV QUEUES="backup,ebooks,email,discord,comic,blogs,forums,photo_embed,maintenance"
+ENV QUEUES="backup,blogs,comic,discord,ebooks,email,forums,github,photo_embed,maintenance"
 ENV PYTHONPATH="/app"

 ENTRYPOINT ["./entry.sh"]
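Both QUEUES lists above now include the new "github" queue. How entry.sh consumes QUEUES is not part of this diff, so the following is only a hedged sketch of a worker start-up that would honour it, reusing the Celery app imported elsewhere in this commit:

# Hypothetical illustration, not the repo's entry.sh: start a worker that
# consumes whatever comma-separated queues the QUEUES env var names.
import os

from memory.common.celery_app import app

queues = os.environ.get("QUEUES", "maintenance")
app.worker_main(["worker", "--loglevel=INFO", "-Q", queues])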
@@ -351,9 +351,8 @@ class GithubAccountAdmin(ModelView, model=GithubAccount):
         "updated_at",
     ]
     column_searchable_list = ["name", "id"]
-    # Hide sensitive columns from display
-    column_exclude_list = ["access_token", "private_key"]
-    form_excluded_columns = ["repos"]
+    # Sensitive columns (access_token, private_key) are already excluded from column_list
+    form_excluded_columns = ["repos", "access_token", "private_key"]


 class GithubRepoAdmin(ModelView, model=GithubRepo):
@@ -1,4 +1,5 @@
 from celery import Celery
+from celery.schedules import crontab
 from kombu.utils.url import safequote
 from memory.common import settings
@@ -123,6 +124,12 @@ app.conf.update(
         f"{BACKUP_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-backup"},
+        f"{GITHUB_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-github"},
     },
+    beat_schedule={
+        "sync-github-repos-hourly": {
+            "task": SYNC_ALL_GITHUB_REPOS,
+            "schedule": crontab(minute=0),  # Every hour at :00
+        },
+    },
 )
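For reference, the schedule above uses plain celery.schedules semantics, nothing repo-specific: crontab(minute=0) pins the minute to zero and leaves every other field as a wildcard, so beat fires the task once per hour, on the hour.

from celery.schedules import crontab

# A minute spec of 0 plus wildcard hours means "at :00 of every hour".
hourly = crontab(minute=0)
assert hourly.minute == {0}
assert hourly.hour == set(range(24))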
@@ -58,6 +58,12 @@ ALL_COLLECTIONS: dict[str, Collection] = {
         "text": True,
         "multimodal": True,
     },
+    "github": {
+        "dimension": 1024,
+        "distance": "Cosine",
+        "text": True,
+        "multimodal": False,
+    },
     "text": {
         "dimension": 1024,
         "distance": "Cosine",
@@ -802,6 +802,23 @@ class MiscDoc(SourceItem):
         }


+class GithubItemPayload(SourceItemPayload):
+    kind: Annotated[str, "Type: issue, pr, comment, or project_card"]
+    repo_path: Annotated[str, "Repository path (owner/name)"]
+    number: Annotated[int | None, "Issue or PR number"]
+    state: Annotated[str | None, "State: open, closed, merged"]
+    title: Annotated[str | None, "Issue or PR title"]
+    author: Annotated[str | None, "Author username"]
+    labels: Annotated[list[str] | None, "GitHub labels"]
+    assignees: Annotated[list[str] | None, "Assigned users"]
+    milestone: Annotated[str | None, "Milestone name"]
+    project_status: Annotated[str | None, "GitHub Project status"]
+    project_priority: Annotated[str | None, "GitHub Project priority"]
+    created_at: Annotated[datetime | None, "Creation date"]
+    closed_at: Annotated[datetime | None, "Close date"]
+    merged_at: Annotated[datetime | None, "Merge date (PRs only)"]
+
+
+class GithubItem(SourceItem):
+    __tablename__ = "github_item"
@@ -854,6 +871,29 @@ class GithubItem(SourceItem):
         Index("gh_repo_id_idx", "repo_id"),
     )

+    @classmethod
+    def get_collections(cls) -> list[str]:
+        return ["github"]
+
+    def as_payload(self) -> GithubItemPayload:
+        return GithubItemPayload(
+            **super().as_payload(),
+            kind=cast(str, self.kind),
+            repo_path=cast(str, self.repo_path),
+            number=cast(int | None, self.number),
+            state=cast(str | None, self.state),
+            title=cast(str | None, self.title),
+            author=cast(str | None, self.author),
+            labels=cast(list[str] | None, self.labels),
+            assignees=cast(list[str] | None, self.assignees),
+            milestone=cast(str | None, self.milestone),
+            project_status=cast(str | None, self.project_status),
+            project_priority=cast(str | None, self.project_priority),
+            created_at=cast(datetime | None, self.created_at),
+            closed_at=cast(datetime | None, self.closed_at),
+            merged_at=cast(datetime | None, self.merged_at),
+        )
+
+
 class NotePayload(SourceItemPayload):
     note_type: Annotated[str | None, "Category of the note"]
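A side note on the pattern above: for **super().as_payload() to be splatted into the GithubItemPayload constructor, the payload classes are presumably TypedDict-style containers whose Annotated metadata documents each field. A standalone sketch of that pattern with purely illustrative names (not taken from the repo):

from datetime import datetime
from typing import Annotated, TypedDict


class ExamplePayload(TypedDict):
    # Annotated's second argument is free-form metadata; here it doubles as
    # per-field documentation, as in the payload classes above.
    kind: Annotated[str, "Type of the item"]
    created_at: Annotated[datetime | None, "Creation date"]


payload = ExamplePayload(kind="issue", created_at=None)  # a plain dict at runtime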
@@ -10,6 +10,7 @@ from memory.workers.tasks import (
     ebook,
     email,
     forums,
+    github,
     maintenance,
     notes,
     observations,
@@ -24,6 +25,7 @@ __all__ = [
     "ebook",
     "discord",
     "forums",
+    "github",
     "maintenance",
     "notes",
     "observations",
tests/memory/parsers/test_github.py (new file, 705 lines)
@@ -0,0 +1,705 @@
"""Tests for GitHub API client and parser."""
|
||||
|
||||
import pytest
|
||||
from datetime import datetime, timezone
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
import requests
|
||||
|
||||
from memory.parsers.github import (
|
||||
GithubCredentials,
|
||||
GithubClient,
|
||||
GithubIssueData,
|
||||
GithubComment,
|
||||
parse_github_date,
|
||||
compute_content_hash,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests for utility functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||


@pytest.mark.parametrize(
    "date_str,expected",
    [
        ("2024-01-15T10:30:00Z", datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc)),
        (
            "2024-06-20T14:45:30Z",
            datetime(2024, 6, 20, 14, 45, 30, tzinfo=timezone.utc),
        ),
        (None, None),
        ("", None),
    ],
)
def test_parse_github_date(date_str, expected):
    """Test parsing GitHub date strings."""
    result = parse_github_date(date_str)
    assert result == expected


def test_compute_content_hash_body_only():
    """Test content hash with body only."""
    hash1 = compute_content_hash("This is the body", [])
    hash2 = compute_content_hash("This is the body", [])
    hash3 = compute_content_hash("Different body", [])

    assert hash1 == hash2  # Same content = same hash
    assert hash1 != hash3  # Different content = different hash


def test_compute_content_hash_with_comments():
    """Test content hash includes comments."""
    comments = [
        GithubComment(
            id=1,
            author="user1",
            body="First comment",
            created_at="2024-01-01T00:00:00Z",
            updated_at="2024-01-01T00:00:00Z",
        ),
        GithubComment(
            id=2,
            author="user2",
            body="Second comment",
            created_at="2024-01-02T00:00:00Z",
            updated_at="2024-01-02T00:00:00Z",
        ),
    ]

    hash_with_comments = compute_content_hash("Body", comments)
    hash_without_comments = compute_content_hash("Body", [])

    assert hash_with_comments != hash_without_comments


def test_compute_content_hash_empty_body():
    """Test content hash with empty/None body."""
    hash1 = compute_content_hash("", [])
    hash2 = compute_content_hash(None, [])  # type: ignore

    # Both should produce valid hashes
    assert len(hash1) == 64  # SHA256 hex
    assert len(hash2) == 64


def test_compute_content_hash_comment_order_matters():
    """Test that comment order affects the hash."""
    comment1 = GithubComment(
        id=1, author="a", body="First", created_at="", updated_at=""
    )
    comment2 = GithubComment(
        id=2, author="b", body="Second", created_at="", updated_at=""
    )

    hash_order1 = compute_content_hash("Body", [comment1, comment2])
    hash_order2 = compute_content_hash("Body", [comment2, comment1])

    assert hash_order1 != hash_order2
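Taken together, the four tests above pin down the observable contract of compute_content_hash: a SHA-256 hex digest over the body plus the comments, order-sensitive, and tolerant of an empty or None body. A minimal sketch consistent with that contract, not necessarily the implementation in memory.parsers.github (comments are treated as plain dicts here):

import hashlib


def compute_content_hash_sketch(body, comments):
    # Join the body with each comment body in order; reordering changes the hash.
    parts = [body or ""] + [comment["body"] for comment in comments]
    return hashlib.sha256("\n".join(parts).encode("utf-8")).hexdigest()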


# =============================================================================
# Tests for GithubClient initialization
# =============================================================================


def test_github_client_pat_auth():
    """Test client initialization with PAT authentication."""
    credentials = GithubCredentials(
        auth_type="pat",
        access_token="ghp_test_token",
    )

    with patch.object(requests.Session, "get"):
        client = GithubClient(credentials)

    assert "Bearer ghp_test_token" in client.session.headers["Authorization"]
    assert client.session.headers["Accept"] == "application/vnd.github+json"
    assert client.session.headers["X-GitHub-Api-Version"] == "2022-11-28"


# =============================================================================
# Tests for fetch_issues
# =============================================================================


def test_fetch_issues_basic():
    """Test fetching issues from repository."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    def mock_get(url, **kwargs):
        """Route mock responses based on URL."""
        response = Mock()
        response.headers = {"X-RateLimit-Remaining": "4999"}
        response.raise_for_status = Mock()

        page = kwargs.get("params", {}).get("page", 1)

        if "/repos/" in url and "/issues" in url and "/comments" not in url:
            # Issues endpoint
            if page == 1:
                response.json.return_value = [
                    {
                        "number": 1,
                        "title": "Test Issue",
                        "body": "Issue body",
                        "state": "open",
                        "user": {"login": "testuser"},
                        "labels": [{"name": "bug"}],
                        "assignees": [{"login": "dev1"}],
                        "milestone": {"title": "v1.0"},
                        "created_at": "2024-01-01T00:00:00Z",
                        "updated_at": "2024-01-02T00:00:00Z",
                        "closed_at": None,
                        "comments": 2,
                        # Note: do NOT include a "pull_request" key for real issues;
                        # the API checks `if "pull_request" in issue` to skip PRs.
                    }
                ]
            else:
                response.json.return_value = []
        elif "/comments" in url:
            # Comments endpoint
            if page == 1:
                response.json.return_value = [
                    {
                        "id": 100,
                        "user": {"login": "commenter"},
                        "body": "A comment",
                        "created_at": "2024-01-01T12:00:00Z",
                        "updated_at": "2024-01-01T12:00:00Z",
                    }
                ]
            else:
                response.json.return_value = []
        else:
            response.json.return_value = []

        return response

    with patch.object(requests.Session, "get", side_effect=mock_get):
        client = GithubClient(credentials)
        issues = list(client.fetch_issues("owner", "repo"))

    assert len(issues) == 1
    issue = issues[0]
    assert issue["number"] == 1
    assert issue["title"] == "Test Issue"
    assert issue["kind"] == "issue"
    assert issue["state"] == "open"
    assert issue["author"] == "testuser"
    assert issue["labels"] == ["bug"]
    assert issue["assignees"] == ["dev1"]
    assert issue["milestone"] == "v1.0"
    assert len(issue["comments"]) == 1


def test_fetch_issues_skips_prs():
    """Test that PRs in the issue list are skipped."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    def mock_get(url, **kwargs):
        """Route mock responses based on URL."""
        response = Mock()
        response.headers = {"X-RateLimit-Remaining": "4999"}
        response.raise_for_status = Mock()

        page = kwargs.get("params", {}).get("page", 1)

        if "/repos/" in url and "/issues" in url and "/comments" not in url:
            if page == 1:
                response.json.return_value = [
                    {
                        "number": 1,
                        "title": "Issue",
                        "body": "Body",
                        "state": "open",
                        "user": {"login": "user"},
                        "labels": [],
                        "assignees": [],
                        "milestone": None,
                        "created_at": "2024-01-01T00:00:00Z",
                        "updated_at": "2024-01-01T00:00:00Z",
                        "closed_at": None,
                        "comments": 0,
                        # Real issues don't have a "pull_request" key
                    },
                    {
                        "number": 2,
                        "title": "PR posing as issue",
                        "body": "Body",
                        "state": "open",
                        "user": {"login": "user"},
                        "labels": [],
                        "assignees": [],
                        "milestone": None,
                        "created_at": "2024-01-01T00:00:00Z",
                        "updated_at": "2024-01-01T00:00:00Z",
                        "closed_at": None,
                        "comments": 0,
                        "pull_request": {"url": "https://..."},  # PRs have this key
                    },
                ]
            else:
                response.json.return_value = []
        elif "/comments" in url:
            response.json.return_value = []
        else:
            response.json.return_value = []

        return response

    with patch.object(requests.Session, "get", side_effect=mock_get):
        client = GithubClient(credentials)
        issues = list(client.fetch_issues("owner", "repo"))

    assert len(issues) == 1
    assert issues[0]["number"] == 1
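The skip logic exercised by the two tests above relies on a documented quirk of GitHub's REST API: GET /repos/{owner}/{repo}/issues also returns pull requests, and only the PR entries carry a "pull_request" key. The filter therefore boils down to something like this sketch:

def drop_pull_requests(items: list[dict]) -> list[dict]:
    # Real issues have no "pull_request" key; PR entries do.
    return [item for item in items if "pull_request" not in item]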


def test_fetch_issues_with_since_filter():
    """Test fetching issues with the since parameter."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = []
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.return_value = mock_response

        client = GithubClient(credentials)
        since = datetime(2024, 1, 15, tzinfo=timezone.utc)
        list(client.fetch_issues("owner", "repo", since=since))

        # Verify since was passed to the API
        call_args = mock_get.call_args
        assert "since" in call_args.kwargs.get("params", {})


def test_fetch_issues_with_state_filter():
    """Test fetching issues with a state filter."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = []
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.return_value = mock_response

        client = GithubClient(credentials)
        list(client.fetch_issues("owner", "repo", state="closed"))

        call_args = mock_get.call_args
        assert call_args.kwargs.get("params", {}).get("state") == "closed"


def test_fetch_issues_with_labels_filter():
    """Test fetching issues with a labels filter."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = []
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.return_value = mock_response

        client = GithubClient(credentials)
        list(client.fetch_issues("owner", "repo", labels=["bug", "critical"]))

        call_args = mock_get.call_args
        assert call_args.kwargs.get("params", {}).get("labels") == "bug,critical"


# =============================================================================
# Tests for fetch_prs
# =============================================================================


def test_fetch_prs_basic():
    """Test fetching PRs from a repository."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    def mock_get(url, **kwargs):
        """Route mock responses based on URL."""
        response = Mock()
        response.headers = {"X-RateLimit-Remaining": "4999"}
        response.raise_for_status = Mock()

        page = kwargs.get("params", {}).get("page", 1)

        if "/pulls" in url and "/comments" not in url:
            if page == 1:
                response.json.return_value = [
                    {
                        "number": 10,
                        "title": "Add feature",
                        "body": "PR body",
                        "state": "open",
                        "user": {"login": "contributor"},
                        "labels": [{"name": "enhancement"}],
                        "assignees": [{"login": "reviewer"}],
                        "milestone": None,
                        "created_at": "2024-01-05T00:00:00Z",
                        "updated_at": "2024-01-06T00:00:00Z",
                        "closed_at": None,
                        "merged_at": None,
                        "diff_url": "https://github.com/owner/repo/pull/10.diff",
                        "comments": 0,
                    }
                ]
            else:
                response.json.return_value = []
        elif ".diff" in url:
            response.ok = True
            response.text = "+100 lines added\n-50 lines removed"
        elif "/comments" in url:
            response.json.return_value = []
        else:
            response.json.return_value = []

        return response

    with patch.object(requests.Session, "get", side_effect=mock_get):
        client = GithubClient(credentials)
        prs = list(client.fetch_prs("owner", "repo"))

    assert len(prs) == 1
    pr = prs[0]
    assert pr["number"] == 10
    assert pr["title"] == "Add feature"
    assert pr["kind"] == "pr"
    assert pr["diff_summary"] is not None
    assert "100 lines added" in pr["diff_summary"]


def test_fetch_prs_merged():
    """Test fetching a merged PR."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = [
        {
            "number": 20,
            "title": "Merged PR",
            "body": "Body",
            "state": "closed",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-10T00:00:00Z",
            "closed_at": "2024-01-10T00:00:00Z",
            "merged_at": "2024-01-10T00:00:00Z",
            "additions": 10,
            "deletions": 5,
            "comments": 0,
        }
    ]
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    mock_empty = Mock()
    mock_empty.json.return_value = []
    mock_empty.headers = {"X-RateLimit-Remaining": "4998"}
    mock_empty.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [mock_response, mock_empty, mock_empty]

        client = GithubClient(credentials)
        prs = list(client.fetch_prs("owner", "repo"))

    pr = prs[0]
    assert pr["state"] == "closed"
    assert pr["merged_at"] == datetime(2024, 1, 10, 0, 0, 0, tzinfo=timezone.utc)


def test_fetch_prs_stops_at_since():
    """Test that PR fetching stops when reaching older items."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = [
        {
            "number": 30,
            "title": "Recent PR",
            "body": "Body",
            "state": "open",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": "2024-01-20T00:00:00Z",
            "updated_at": "2024-01-20T00:00:00Z",
            "closed_at": None,
            "merged_at": None,
            "additions": 1,
            "deletions": 1,
            "comments": 0,
        },
        {
            "number": 29,
            "title": "Old PR",
            "body": "Body",
            "state": "open",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-01T00:00:00Z",  # Older than since
            "closed_at": None,
            "merged_at": None,
            "additions": 1,
            "deletions": 1,
            "comments": 0,
        },
    ]
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    mock_empty = Mock()
    mock_empty.json.return_value = []
    mock_empty.headers = {"X-RateLimit-Remaining": "4998"}
    mock_empty.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [mock_response, mock_empty]

        client = GithubClient(credentials)
        since = datetime(2024, 1, 15, tzinfo=timezone.utc)
        prs = list(client.fetch_prs("owner", "repo", since=since))

    # Should only get the recent PR and stop at the old one
    assert len(prs) == 1
    assert prs[0]["number"] == 30
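test_fetch_prs_stops_at_since implies the client pages PRs newest-first and bails out as soon as it meets an item whose updated_at predates since, instead of filtering after the fact. A generator sketch of that early-exit pattern (an assumption about the behaviour, not code from the repo):

from datetime import datetime
from typing import Iterable, Iterator


def stop_at_since(items: Iterable[dict], since: datetime | None) -> Iterator[dict]:
    # Items are assumed sorted newest-first by updated_at.
    for item in items:
        updated = datetime.fromisoformat(item["updated_at"].replace("Z", "+00:00"))
        if since is not None and updated < since:
            return  # everything after this point is older still
        yield item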


# =============================================================================
# Tests for fetch_comments
# =============================================================================


def test_fetch_comments_pagination():
    """Test comment fetching with pagination."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    # First page of comments
    mock_page1 = Mock()
    mock_page1.json.return_value = [
        {
            "id": 1,
            "user": {"login": "user1"},
            "body": "Comment 1",
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-01T00:00:00Z",
        }
    ]
    mock_page1.headers = {"X-RateLimit-Remaining": "4999"}
    mock_page1.raise_for_status = Mock()

    # Second page of comments
    mock_page2 = Mock()
    mock_page2.json.return_value = [
        {
            "id": 2,
            "user": {"login": "user2"},
            "body": "Comment 2",
            "created_at": "2024-01-02T00:00:00Z",
            "updated_at": "2024-01-02T00:00:00Z",
        }
    ]
    mock_page2.headers = {"X-RateLimit-Remaining": "4998"}
    mock_page2.raise_for_status = Mock()

    # Empty page to stop
    mock_empty = Mock()
    mock_empty.json.return_value = []
    mock_empty.headers = {"X-RateLimit-Remaining": "4997"}
    mock_empty.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [mock_page1, mock_page2, mock_empty]

        client = GithubClient(credentials)
        comments = client.fetch_comments("owner", "repo", 1)

    assert len(comments) == 2
    assert comments[0]["author"] == "user1"
    assert comments[1]["author"] == "user2"


def test_fetch_comments_handles_ghost_user():
    """Test a comment with a deleted/ghost user."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = [
        {
            "id": 1,
            "user": None,  # Deleted user
            "body": "Comment from ghost",
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-01T00:00:00Z",
        }
    ]
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    mock_empty = Mock()
    mock_empty.json.return_value = []
    mock_empty.headers = {"X-RateLimit-Remaining": "4998"}
    mock_empty.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [mock_response, mock_empty]

        client = GithubClient(credentials)
        comments = client.fetch_comments("owner", "repo", 1)

    assert len(comments) == 1
    assert comments[0]["author"] == "ghost"


# =============================================================================
# Tests for rate limiting
# =============================================================================


def test_rate_limit_handling():
    """Test rate limit detection and backoff."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = []
    mock_response.headers = {
        "X-RateLimit-Remaining": "0",
        "X-RateLimit-Reset": str(int(datetime.now(timezone.utc).timestamp()) + 1),
    }
    mock_response.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.return_value = mock_response
        with patch("time.sleep") as mock_sleep:
            client = GithubClient(credentials)
            list(client.fetch_issues("owner", "repo"))

            # Should have waited due to rate limit
            mock_sleep.assert_called()
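test_rate_limit_handling only asserts that the client sleeps once X-RateLimit-Remaining drops to 0, using X-RateLimit-Reset (a Unix timestamp) to size the pause. A hedged sketch of that backoff, not necessarily the code under test:

import time
from datetime import datetime, timezone

import requests


def wait_if_rate_limited(response: requests.Response) -> None:
    # GitHub advertises the remaining quota and the reset time in these headers.
    remaining = int(response.headers.get("X-RateLimit-Remaining", "1"))
    if remaining > 0:
        return
    reset = int(response.headers.get("X-RateLimit-Reset", "0"))
    delay = reset - datetime.now(timezone.utc).timestamp()
    if delay > 0:
        time.sleep(delay)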


# =============================================================================
# Tests for project fields
# =============================================================================


def test_fetch_project_fields():
    """Test fetching GitHub Projects v2 fields."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = {
        "data": {
            "repository": {
                "issue": {
                    "projectItems": {
                        "nodes": [
                            {
                                "project": {"title": "Sprint Board"},
                                "fieldValues": {
                                    "nodes": [
                                        {"field": {"name": "Status"}, "name": "In Progress"},
                                        {"field": {"name": "Priority"}, "text": "High"},
                                    ]
                                },
                            }
                        ]
                    }
                }
            }
        }
    }
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    with patch.object(requests.Session, "post") as mock_post:
        mock_post.return_value = mock_response

        client = GithubClient(credentials)
        fields = client.fetch_project_fields("owner", "repo", 1)

    assert fields is not None
    # Fields are prefixed with the project name
    assert "Sprint Board.Status" in fields
    assert fields["Sprint Board.Status"] == "In Progress"
    assert "Sprint Board.Priority" in fields
    assert fields["Sprint Board.Priority"] == "High"


def test_fetch_project_fields_not_in_project():
    """Test fetching project fields for an issue not in any project."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = {
        "data": {"repository": {"issue": {"projectItems": {"nodes": []}}}}
    }
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    with patch.object(requests.Session, "post") as mock_post:
        mock_post.return_value = mock_response

        client = GithubClient(credentials)
        fields = client.fetch_project_fields("owner", "repo", 1)

    assert fields is None


def test_fetch_project_fields_graphql_error():
    """Test handling GraphQL errors gracefully."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = {
        "errors": [{"message": "Something went wrong"}],
        "data": None,
    }
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    with patch.object(requests.Session, "post") as mock_post:
        mock_post.return_value = mock_response

        client = GithubClient(credentials)
        fields = client.fetch_project_fields("owner", "repo", 1)

    assert fields is None


# =============================================================================
# Tests for error handling
# =============================================================================


def test_fetch_issues_handles_api_error():
    """Test graceful handling of API errors."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.return_value = mock_response

        client = GithubClient(credentials)

        with pytest.raises(requests.HTTPError):
            list(client.fetch_issues("owner", "nonexistent"))
tests/memory/workers/tasks/test_github_tasks.py (new file, 1125 lines)
File diff suppressed because it is too large.
@@ -15,6 +15,8 @@ Usage:
     python run_celery_task.py blogs sync-webpage --url "https://example.com"
     python run_celery_task.py comic sync-all-comics
     python run_celery_task.py forums sync-lesswrong --since-date "2025-01-01" --min-karma 10 --limit 50 --cooldown 0.5 --max-items 1000
+    python run_celery_task.py github sync-all-repos
+    python run_celery_task.py github sync-repo --repo-id 1 --force-full
 """

 import json
@@ -51,8 +53,10 @@ from memory.common.celery_app import (
     UPDATE_METADATA_FOR_SOURCE_ITEMS,
     SETUP_GIT_NOTES,
     TRACK_GIT_CHANGES,
     BACKUP_TO_S3_DIRECTORY,
     BACKUP_PATH,
     BACKUP_ALL,
+    SYNC_GITHUB_REPO,
+    SYNC_ALL_GITHUB_REPOS,
     app,
 )
@@ -100,9 +104,13 @@ TASK_MAPPINGS = {
         "track_git_changes": TRACK_GIT_CHANGES,
     },
     "backup": {
         "backup_to_s3_directory": BACKUP_TO_S3_DIRECTORY,
         "backup_path": BACKUP_PATH,
         "backup_all": BACKUP_ALL,
     },
+    "github": {
+        "sync_all_repos": SYNC_ALL_GITHUB_REPOS,
+        "sync_repo": SYNC_GITHUB_REPO,
+    },
 }
 QUEUE_MAPPINGS = {
     "email": "email",
@@ -200,9 +208,9 @@ def backup_all(ctx):
 @backup.command("path")
 @click.option("--path", required=True, help="Path to backup")
 @click.pass_context
-def backup_to_s3_directory(ctx, path):
+def backup_path_cmd(ctx, path):
     """Backup a specific path."""
-    execute_task(ctx, "backup", "backup_to_s3_directory", path=path)
+    execute_task(ctx, "backup", "backup_path", path=path)


 @cli.group()
@@ -533,5 +541,28 @@ def forums_sync_lesswrong_post(ctx, url):
     execute_task(ctx, "forums", "sync_lesswrong_post", url=url)


+@cli.group()
+@click.pass_context
+def github(ctx):
+    """GitHub-related tasks."""
+    pass
+
+
+@github.command("sync-all-repos")
+@click.pass_context
+def github_sync_all_repos(ctx):
+    """Sync all active GitHub repos."""
+    execute_task(ctx, "github", "sync_all_repos")
+
+
+@github.command("sync-repo")
+@click.option("--repo-id", type=int, required=True, help="GitHub repo ID")
+@click.option("--force-full", is_flag=True, help="Force a full sync instead of incremental")
+@click.pass_context
+def github_sync_repo(ctx, repo_id, force_full):
+    """Sync a specific GitHub repo."""
+    execute_task(ctx, "github", "sync_repo", repo_id=repo_id, force_full=force_full)
+
+
 if __name__ == "__main__":
     cli()