mirror of
https://github.com/mruwnik/memory.git
synced 2026-01-02 09:12:58 +01:00
more github ingesting
This commit is contained in:
parent
f729122754
commit
526bfa5f6b
@ -206,7 +206,7 @@ services:
|
|||||||
<<: *worker-base
|
<<: *worker-base
|
||||||
environment:
|
environment:
|
||||||
<<: *worker-env
|
<<: *worker-env
|
||||||
QUEUES: "backup,email,ebooks,discord,comic,blogs,forums,maintenance,notes,scheduler"
|
QUEUES: "backup,blogs,comic,discord,ebooks,email,forums,github,photo_embed,maintenance,notes,scheduler"
|
||||||
|
|
||||||
ingest-hub:
|
ingest-hub:
|
||||||
<<: *worker-base
|
<<: *worker-base
|
||||||
|
|||||||
@ -44,7 +44,7 @@ RUN git config --global user.email "${GIT_USER_EMAIL}" && \
|
|||||||
git config --global user.name "${GIT_USER_NAME}"
|
git config --global user.name "${GIT_USER_NAME}"
|
||||||
|
|
||||||
# Default queues to process
|
# Default queues to process
|
||||||
ENV QUEUES="backup,ebooks,email,discord,comic,blogs,forums,photo_embed,maintenance"
|
ENV QUEUES="backup,blogs,comic,discord,ebooks,email,forums,github,photo_embed,maintenance"
|
||||||
ENV PYTHONPATH="/app"
|
ENV PYTHONPATH="/app"
|
||||||
|
|
||||||
ENTRYPOINT ["./entry.sh"]
|
ENTRYPOINT ["./entry.sh"]
|
||||||
@ -351,9 +351,8 @@ class GithubAccountAdmin(ModelView, model=GithubAccount):
|
|||||||
"updated_at",
|
"updated_at",
|
||||||
]
|
]
|
||||||
column_searchable_list = ["name", "id"]
|
column_searchable_list = ["name", "id"]
|
||||||
# Hide sensitive columns from display
|
# Sensitive columns (access_token, private_key) are already excluded from column_list
|
||||||
column_exclude_list = ["access_token", "private_key"]
|
form_excluded_columns = ["repos", "access_token", "private_key"]
|
||||||
form_excluded_columns = ["repos"]
|
|
||||||
|
|
||||||
|
|
||||||
class GithubRepoAdmin(ModelView, model=GithubRepo):
|
class GithubRepoAdmin(ModelView, model=GithubRepo):
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
from celery import Celery
|
from celery import Celery
|
||||||
|
from celery.schedules import crontab
|
||||||
from kombu.utils.url import safequote
|
from kombu.utils.url import safequote
|
||||||
from memory.common import settings
|
from memory.common import settings
|
||||||
|
|
||||||
@ -123,6 +124,12 @@ app.conf.update(
|
|||||||
f"{BACKUP_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-backup"},
|
f"{BACKUP_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-backup"},
|
||||||
f"{GITHUB_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-github"},
|
f"{GITHUB_ROOT}.*": {"queue": f"{settings.CELERY_QUEUE_PREFIX}-github"},
|
||||||
},
|
},
|
||||||
|
beat_schedule={
|
||||||
|
"sync-github-repos-hourly": {
|
||||||
|
"task": SYNC_ALL_GITHUB_REPOS,
|
||||||
|
"schedule": crontab(minute=0), # Every hour at :00
|
||||||
|
},
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -58,6 +58,12 @@ ALL_COLLECTIONS: dict[str, Collection] = {
|
|||||||
"text": True,
|
"text": True,
|
||||||
"multimodal": True,
|
"multimodal": True,
|
||||||
},
|
},
|
||||||
|
"github": {
|
||||||
|
"dimension": 1024,
|
||||||
|
"distance": "Cosine",
|
||||||
|
"text": True,
|
||||||
|
"multimodal": False,
|
||||||
|
},
|
||||||
"text": {
|
"text": {
|
||||||
"dimension": 1024,
|
"dimension": 1024,
|
||||||
"distance": "Cosine",
|
"distance": "Cosine",
|
||||||
|
|||||||
@ -802,6 +802,23 @@ class MiscDoc(SourceItem):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class GithubItemPayload(SourceItemPayload):
    """Payload fields that a GitHub item adds on top of the base source-item payload."""

    # What the item is and where it lives
    kind: Annotated[str, "Type: issue, pr, comment, or project_card"]
    repo_path: Annotated[str, "Repository path (owner/name)"]
    number: Annotated[int | None, "Issue or PR number"]

    # Descriptive metadata
    state: Annotated[str | None, "State: open, closed, merged"]
    title: Annotated[str | None, "Issue or PR title"]
    author: Annotated[str | None, "Author username"]
    labels: Annotated[list[str] | None, "GitHub labels"]
    assignees: Annotated[list[str] | None, "Assigned users"]
    milestone: Annotated[str | None, "Milestone name"]

    # GitHub Projects fields
    project_status: Annotated[str | None, "GitHub Project status"]
    project_priority: Annotated[str | None, "GitHub Project priority"]

    # Lifecycle timestamps
    created_at: Annotated[datetime | None, "Creation date"]
    closed_at: Annotated[datetime | None, "Close date"]
    merged_at: Annotated[datetime | None, "Merge date (PRs only)"]
|
||||||
|
|
||||||
|
|
||||||
class GithubItem(SourceItem):
|
class GithubItem(SourceItem):
|
||||||
__tablename__ = "github_item"
|
__tablename__ = "github_item"
|
||||||
|
|
||||||
@ -854,6 +871,29 @@ class GithubItem(SourceItem):
|
|||||||
Index("gh_repo_id_idx", "repo_id"),
|
Index("gh_repo_id_idx", "repo_id"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@classmethod
def get_collections(cls) -> list[str]:
    """Return the collection names used for GitHub items (only "github")."""
    collection_names = ["github"]
    return collection_names
|
||||||
|
|
||||||
|
def as_payload(self) -> GithubItemPayload:
    """Build the payload: the parent class's payload plus the GitHub columns.

    Returns:
        A ``GithubItemPayload`` combining ``super().as_payload()`` with this
        item's GitHub-specific attributes.
    """
    # The casts only inform the type checker; values are passed through as-is.
    github_fields = {
        "kind": cast(str, self.kind),
        "repo_path": cast(str, self.repo_path),
        "number": cast(int | None, self.number),
        "state": cast(str | None, self.state),
        "title": cast(str | None, self.title),
        "author": cast(str | None, self.author),
        "labels": cast(list[str] | None, self.labels),
        "assignees": cast(list[str] | None, self.assignees),
        "milestone": cast(str | None, self.milestone),
        "project_status": cast(str | None, self.project_status),
        "project_priority": cast(str | None, self.project_priority),
        "created_at": cast(datetime | None, self.created_at),
        "closed_at": cast(datetime | None, self.closed_at),
        "merged_at": cast(datetime | None, self.merged_at),
    }
    return GithubItemPayload(**super().as_payload(), **github_fields)
|
||||||
|
|
||||||
|
|
||||||
class NotePayload(SourceItemPayload):
|
class NotePayload(SourceItemPayload):
|
||||||
note_type: Annotated[str | None, "Category of the note"]
|
note_type: Annotated[str | None, "Category of the note"]
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from memory.workers.tasks import (
|
|||||||
ebook,
|
ebook,
|
||||||
email,
|
email,
|
||||||
forums,
|
forums,
|
||||||
|
github,
|
||||||
maintenance,
|
maintenance,
|
||||||
notes,
|
notes,
|
||||||
observations,
|
observations,
|
||||||
@ -24,6 +25,7 @@ __all__ = [
|
|||||||
"ebook",
|
"ebook",
|
||||||
"discord",
|
"discord",
|
||||||
"forums",
|
"forums",
|
||||||
|
"github",
|
||||||
"maintenance",
|
"maintenance",
|
||||||
"notes",
|
"notes",
|
||||||
"observations",
|
"observations",
|
||||||
|
|||||||
705
tests/memory/parsers/test_github.py
Normal file
705
tests/memory/parsers/test_github.py
Normal file
@ -0,0 +1,705 @@
|
|||||||
|
"""Tests for GitHub API client and parser."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from unittest.mock import Mock, patch, MagicMock
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from memory.parsers.github import (
|
||||||
|
GithubCredentials,
|
||||||
|
GithubClient,
|
||||||
|
GithubIssueData,
|
||||||
|
GithubComment,
|
||||||
|
parse_github_date,
|
||||||
|
compute_content_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for utility functions
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "date_str,expected",
    [
        ("2024-01-15T10:30:00Z", datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc)),
        ("2024-06-20T14:45:30Z", datetime(2024, 6, 20, 14, 45, 30, tzinfo=timezone.utc)),
        (None, None),
        ("", None),
    ],
)
def test_parse_github_date(date_str, expected):
    """GitHub "Z" timestamps parse to UTC datetimes; None and "" give None."""
    assert parse_github_date(date_str) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_content_hash_body_only():
    """Equal bodies hash identically; a different body hashes differently."""
    original = compute_content_hash("This is the body", [])
    repeated = compute_content_hash("This is the body", [])
    changed = compute_content_hash("Different body", [])

    assert original == repeated
    assert original != changed
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_content_hash_with_comments():
    """Adding comments to the same body changes the resulting hash."""
    thread = [
        GithubComment(
            id=ident,
            author=login,
            body=text,
            created_at=stamp,
            updated_at=stamp,
        )
        for ident, login, text, stamp in (
            (1, "user1", "First comment", "2024-01-01T00:00:00Z"),
            (2, "user2", "Second comment", "2024-01-02T00:00:00Z"),
        )
    ]

    with_thread = compute_content_hash("Body", thread)
    bare = compute_content_hash("Body", [])

    assert with_thread != bare
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_content_hash_empty_body():
    """An empty-string body and a None body both produce a 64-character hash."""
    digests = [
        compute_content_hash("", []),
        compute_content_hash(None, []),  # type: ignore
    ]

    # 64 characters is the length of a SHA256 hex digest
    for digest in digests:
        assert len(digest) == 64
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_content_hash_comment_order_matters():
    """Swapping the order of two comments yields a different hash."""
    first = GithubComment(id=1, author="a", body="First", created_at="", updated_at="")
    second = GithubComment(id=2, author="b", body="Second", created_at="", updated_at="")

    forward = compute_content_hash("Body", [first, second])
    backward = compute_content_hash("Body", [second, first])

    assert forward != backward
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for GithubClient initialization
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_github_client_pat_auth():
    """A PAT credential populates the Authorization, Accept and version headers."""
    creds = GithubCredentials(auth_type="pat", access_token="ghp_test_token")

    with patch.object(requests.Session, "get"):
        client = GithubClient(creds)

        session_headers = client.session.headers
        assert "Bearer ghp_test_token" in session_headers["Authorization"]
        assert session_headers["Accept"] == "application/vnd.github+json"
        assert session_headers["X-GitHub-Api-Version"] == "2022-11-28"
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for fetch_issues
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_issues_basic():
    """One issue with one comment is fetched and exposed with the expected fields."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    def issue_payload():
        # Deliberately has no "pull_request" key: this entry stands for a real
        # issue, and that key is what marks a listing entry as a PR.
        return {
            "number": 1,
            "title": "Test Issue",
            "body": "Issue body",
            "state": "open",
            "user": {"login": "testuser"},
            "labels": [{"name": "bug"}],
            "assignees": [{"login": "dev1"}],
            "milestone": {"title": "v1.0"},
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-02T00:00:00Z",
            "closed_at": None,
            "comments": 2,
        }

    def comment_payload():
        return {
            "id": 100,
            "user": {"login": "commenter"},
            "body": "A comment",
            "created_at": "2024-01-01T12:00:00Z",
            "updated_at": "2024-01-01T12:00:00Z",
        }

    def mock_get(url, **kwargs):
        """Serve canned JSON chosen by URL; only page 1 carries data."""
        reply = Mock()
        reply.headers = {"X-RateLimit-Remaining": "4999"}
        reply.raise_for_status = Mock()

        on_first_page = kwargs.get("params", {}).get("page", 1) == 1
        is_issue_listing = (
            "/repos/" in url and "/issues" in url and "/comments" not in url
        )

        if is_issue_listing:
            reply.json.return_value = [issue_payload()] if on_first_page else []
        elif "/comments" in url:
            reply.json.return_value = [comment_payload()] if on_first_page else []
        else:
            reply.json.return_value = []
        return reply

    with patch.object(requests.Session, "get", side_effect=mock_get):
        client = GithubClient(creds)
        issues = list(client.fetch_issues("owner", "repo"))

        assert len(issues) == 1
        fetched = issues[0]

        expected_fields = {
            "number": 1,
            "title": "Test Issue",
            "kind": "issue",
            "state": "open",
            "author": "testuser",
            "labels": ["bug"],
            "assignees": ["dev1"],
            "milestone": "v1.0",
        }
        for field, value in expected_fields.items():
            assert fetched[field] == value
        assert len(fetched["comments"]) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_issues_skips_prs():
    """Entries carrying a "pull_request" key are dropped from the issue listing."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    def listing_entry(number, title, **extra):
        """Build one listing entry; ``extra`` appends keys such as "pull_request"."""
        return {
            "number": number,
            "title": title,
            "body": "Body",
            "state": "open",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-01T00:00:00Z",
            "closed_at": None,
            "comments": 0,
            **extra,
        }

    def mock_get(url, **kwargs):
        """Return both entries on page 1 of the issue listing, nothing elsewhere."""
        reply = Mock()
        reply.headers = {"X-RateLimit-Remaining": "4999"}
        reply.raise_for_status = Mock()

        page = kwargs.get("params", {}).get("page", 1)
        is_issue_listing = (
            "/repos/" in url and "/issues" in url and "/comments" not in url
        )

        if is_issue_listing and page == 1:
            reply.json.return_value = [
                # A real issue: no "pull_request" key
                listing_entry(1, "Issue"),
                # A PR as it appears in the issue listing: has the key
                listing_entry(
                    2, "PR posing as issue", pull_request={"url": "https://..."}
                ),
            ]
        else:
            reply.json.return_value = []
        return reply

    with patch.object(requests.Session, "get", side_effect=mock_get):
        client = GithubClient(creds)
        issues = list(client.fetch_issues("owner", "repo"))

        assert len(issues) == 1
        assert issues[0]["number"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_issues_with_since_filter():
    """A ``since`` argument is forwarded as a "since" request parameter."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    empty_page = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    empty_page.json.return_value = []

    with patch.object(requests.Session, "get", return_value=empty_page) as mock_get:
        client = GithubClient(creds)
        cutoff = datetime(2024, 1, 15, tzinfo=timezone.utc)
        list(client.fetch_issues("owner", "repo", since=cutoff))

        # Inspect the params of the most recent GET
        sent_params = mock_get.call_args.kwargs.get("params", {})
        assert "since" in sent_params
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_issues_with_state_filter():
    """A ``state`` argument is forwarded unchanged as the "state" parameter."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    empty_page = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    empty_page.json.return_value = []

    with patch.object(requests.Session, "get", return_value=empty_page) as mock_get:
        client = GithubClient(creds)
        list(client.fetch_issues("owner", "repo", state="closed"))

        sent_params = mock_get.call_args.kwargs.get("params", {})
        assert sent_params.get("state") == "closed"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_issues_with_labels_filter():
    """A list of labels is sent as one comma-joined "labels" parameter."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    empty_page = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    empty_page.json.return_value = []

    with patch.object(requests.Session, "get", return_value=empty_page) as mock_get:
        client = GithubClient(creds)
        list(client.fetch_issues("owner", "repo", labels=["bug", "critical"]))

        sent_params = mock_get.call_args.kwargs.get("params", {})
        assert sent_params.get("labels") == "bug,critical"
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for fetch_prs
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_prs_basic():
    """One open PR is fetched, tagged as a PR, and given a diff summary."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    def pull_payload():
        return {
            "number": 10,
            "title": "Add feature",
            "body": "PR body",
            "state": "open",
            "user": {"login": "contributor"},
            "labels": [{"name": "enhancement"}],
            "assignees": [{"login": "reviewer"}],
            "milestone": None,
            "created_at": "2024-01-05T00:00:00Z",
            "updated_at": "2024-01-06T00:00:00Z",
            "closed_at": None,
            "merged_at": None,
            "diff_url": "https://github.com/owner/repo/pull/10.diff",
            "comments": 0,
        }

    def mock_get(url, **kwargs):
        """Serve the PR listing, the diff text, or an empty list, chosen by URL."""
        reply = Mock()
        reply.headers = {"X-RateLimit-Remaining": "4999"}
        reply.raise_for_status = Mock()

        page = kwargs.get("params", {}).get("page", 1)

        if "/pulls" in url and "/comments" not in url:
            reply.json.return_value = [pull_payload()] if page == 1 else []
        elif ".diff" in url:
            # Diff responses are read as text, so only ok/text are set here
            reply.ok = True
            reply.text = "+100 lines added\n-50 lines removed"
        else:
            # Comment endpoints and anything unrecognised return no items
            reply.json.return_value = []
        return reply

    with patch.object(requests.Session, "get", side_effect=mock_get):
        client = GithubClient(creds)
        prs = list(client.fetch_prs("owner", "repo"))

        assert len(prs) == 1
        fetched = prs[0]
        assert fetched["number"] == 10
        assert fetched["title"] == "Add feature"
        assert fetched["kind"] == "pr"

        summary = fetched["diff_summary"]
        assert summary is not None
        assert "100 lines added" in summary
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_prs_merged():
    """A closed PR with ``merged_at`` set exposes its state and a parsed merge time."""
    credentials = GithubCredentials(auth_type="pat", access_token="token")

    mock_response = Mock()
    mock_response.json.return_value = [
        {
            "number": 20,
            "title": "Merged PR",
            "body": "Body",
            "state": "closed",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-10T00:00:00Z",
            "closed_at": "2024-01-10T00:00:00Z",
            "merged_at": "2024-01-10T00:00:00Z",
            "additions": 10,
            "deletions": 5,
            "comments": 0,
        }
    ]
    mock_response.headers = {"X-RateLimit-Remaining": "4999"}
    mock_response.raise_for_status = Mock()

    mock_empty = Mock()
    mock_empty.json.return_value = []
    mock_empty.headers = {"X-RateLimit-Remaining": "4998"}
    mock_empty.raise_for_status = Mock()

    with patch.object(requests.Session, "get") as mock_get:
        # First GET returns the PR listing; the follow-up GETs return empty pages
        mock_get.side_effect = [mock_response, mock_empty, mock_empty]

        client = GithubClient(credentials)
        prs = list(client.fetch_prs("owner", "repo"))

        # Check the count before indexing, as the sibling PR tests do, so an
        # empty result fails as an assertion rather than an IndexError.
        assert len(prs) == 1
        pr = prs[0]
        assert pr["state"] == "closed"
        assert pr["merged_at"] == datetime(2024, 1, 10, 0, 0, 0, tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_prs_stops_at_since():
    """With ``since`` set, only the PR updated after the cutoff is yielded."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    def pull_entry(number, title, stamp):
        """Build an open PR entry whose created/updated times are both ``stamp``."""
        return {
            "number": number,
            "title": title,
            "body": "Body",
            "state": "open",
            "user": {"login": "user"},
            "labels": [],
            "assignees": [],
            "milestone": None,
            "created_at": stamp,
            "updated_at": stamp,
            "closed_at": None,
            "merged_at": None,
            "additions": 1,
            "deletions": 1,
            "comments": 0,
        }

    listing = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    listing.json.return_value = [
        pull_entry(30, "Recent PR", "2024-01-20T00:00:00Z"),
        # Updated before the cutoff used below
        pull_entry(29, "Old PR", "2024-01-01T00:00:00Z"),
    ]

    nothing = Mock(
        headers={"X-RateLimit-Remaining": "4998"},
        raise_for_status=Mock(),
    )
    nothing.json.return_value = []

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [listing, nothing]

        client = GithubClient(creds)
        cutoff = datetime(2024, 1, 15, tzinfo=timezone.utc)
        prs = list(client.fetch_prs("owner", "repo", since=cutoff))

        # Only the recent PR should come back
        assert len(prs) == 1
        assert prs[0]["number"] == 30
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for fetch_comments
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_comments_pagination():
    """Comments from consecutive pages are concatenated until an empty page."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    def page_of(remaining, *items):
        """Build a mock response whose JSON body is the list of ``items``."""
        reply = Mock(
            headers={"X-RateLimit-Remaining": remaining},
            raise_for_status=Mock(),
        )
        reply.json.return_value = list(items)
        return reply

    first_page = page_of(
        "4999",
        {
            "id": 1,
            "user": {"login": "user1"},
            "body": "Comment 1",
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-01T00:00:00Z",
        },
    )
    second_page = page_of(
        "4998",
        {
            "id": 2,
            "user": {"login": "user2"},
            "body": "Comment 2",
            "created_at": "2024-01-02T00:00:00Z",
            "updated_at": "2024-01-02T00:00:00Z",
        },
    )
    # A page with no items ends the sequence of canned responses
    last_page = page_of("4997")

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [first_page, second_page, last_page]

        client = GithubClient(creds)
        comments = client.fetch_comments("owner", "repo", 1)

        assert len(comments) == 2
        assert comments[0]["author"] == "user1"
        assert comments[1]["author"] == "user2"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_comments_handles_ghost_user():
    """A comment whose "user" is None is attributed to the author "ghost"."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    orphaned = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    orphaned.json.return_value = [
        {
            "id": 1,
            "user": None,  # stands in for a deleted account
            "body": "Comment from ghost",
            "created_at": "2024-01-01T00:00:00Z",
            "updated_at": "2024-01-01T00:00:00Z",
        }
    ]

    nothing = Mock(
        headers={"X-RateLimit-Remaining": "4998"},
        raise_for_status=Mock(),
    )
    nothing.json.return_value = []

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.side_effect = [orphaned, nothing]

        client = GithubClient(creds)
        comments = client.fetch_comments("owner", "repo", 1)

        assert len(comments) == 1
        assert comments[0]["author"] == "ghost"
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for rate limiting
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_rate_limit_handling():
    """A response reporting zero remaining requests makes the client sleep."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    # The reset time is one second past "now"
    reset_epoch = int(datetime.now(timezone.utc).timestamp()) + 1
    exhausted = Mock(
        headers={
            "X-RateLimit-Remaining": "0",
            "X-RateLimit-Reset": str(reset_epoch),
        },
        raise_for_status=Mock(),
    )
    exhausted.json.return_value = []

    with patch.object(requests.Session, "get") as mock_get:
        mock_get.return_value = exhausted
        # time.sleep is patched so the assertion can observe the wait
        with patch("time.sleep") as mock_sleep:
            client = GithubClient(creds)
            list(client.fetch_issues("owner", "repo"))

            mock_sleep.assert_called()
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for project fields
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_project_fields():
    """Project field values come back keyed as "<project title>.<field name>"."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    # One field carries its value under "name", the other under "text"
    field_values = [
        {"field": {"name": "Status"}, "name": "In Progress"},
        {"field": {"name": "Priority"}, "text": "High"},
    ]
    project_item = {
        "project": {"title": "Sprint Board"},
        "fieldValues": {"nodes": field_values},
    }

    graphql_reply = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    graphql_reply.json.return_value = {
        "data": {
            "repository": {"issue": {"projectItems": {"nodes": [project_item]}}}
        }
    }

    with patch.object(requests.Session, "post") as mock_post:
        mock_post.return_value = graphql_reply

        client = GithubClient(creds)
        fields = client.fetch_project_fields("owner", "repo", 1)

        assert fields is not None
        expected = {
            "Sprint Board.Status": "In Progress",
            "Sprint Board.Priority": "High",
        }
        for key, value in expected.items():
            assert key in fields
            assert fields[key] == value
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_project_fields_not_in_project():
    """An issue with no project items yields None."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    no_projects = {"projectItems": {"nodes": []}}
    graphql_reply = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    graphql_reply.json.return_value = {"data": {"repository": {"issue": no_projects}}}

    with patch.object(requests.Session, "post") as mock_post:
        mock_post.return_value = graphql_reply

        client = GithubClient(creds)
        fields = client.fetch_project_fields("owner", "repo", 1)

        assert fields is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_project_fields_graphql_error():
    """A GraphQL ``errors`` payload with null data yields ``None``, not an exception."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    # HTTP-level success (raise_for_status is a no-op) carrying a GraphQL error.
    response = Mock(
        headers={"X-RateLimit-Remaining": "4999"},
        raise_for_status=Mock(),
    )
    response.json.return_value = {
        "errors": [{"message": "Something went wrong"}],
        "data": None,
    }

    with patch.object(requests.Session, "post", return_value=response):
        fields = GithubClient(creds).fetch_project_fields("owner", "repo", 1)

    assert fields is None
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tests for error handling
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_issues_handles_api_error():
    """An HTTP error raised by the response propagates out of fetch_issues."""
    creds = GithubCredentials(auth_type="pat", access_token="token")

    # Response whose raise_for_status() raises, as for a missing repository.
    failing_response = Mock()
    failing_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")

    with patch.object(
        requests.Session, "get", return_value=failing_response
    ), pytest.raises(requests.HTTPError):
        client = GithubClient(creds)
        # fetch_issues is consumed with list() so that any lazy iteration runs.
        list(client.fetch_issues("owner", "nonexistent"))
|
||||||
1125
tests/memory/workers/tasks/test_github_tasks.py
Normal file
1125
tests/memory/workers/tasks/test_github_tasks.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -15,6 +15,8 @@ Usage:
|
|||||||
python run_celery_task.py blogs sync-webpage --url "https://example.com"
|
python run_celery_task.py blogs sync-webpage --url "https://example.com"
|
||||||
python run_celery_task.py comic sync-all-comics
|
python run_celery_task.py comic sync-all-comics
|
||||||
python run_celery_task.py forums sync-lesswrong --since-date "2025-01-01" --min-karma 10 --limit 50 --cooldown 0.5 --max-items 1000
|
python run_celery_task.py forums sync-lesswrong --since-date "2025-01-01" --min-karma 10 --limit 50 --cooldown 0.5 --max-items 1000
|
||||||
|
python run_celery_task.py github sync-all-repos
|
||||||
|
python run_celery_task.py github sync-repo --repo-id 1 --force-full
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@ -51,8 +53,10 @@ from memory.common.celery_app import (
|
|||||||
UPDATE_METADATA_FOR_SOURCE_ITEMS,
|
UPDATE_METADATA_FOR_SOURCE_ITEMS,
|
||||||
SETUP_GIT_NOTES,
|
SETUP_GIT_NOTES,
|
||||||
TRACK_GIT_CHANGES,
|
TRACK_GIT_CHANGES,
|
||||||
BACKUP_TO_S3_DIRECTORY,
|
BACKUP_PATH,
|
||||||
BACKUP_ALL,
|
BACKUP_ALL,
|
||||||
|
SYNC_GITHUB_REPO,
|
||||||
|
SYNC_ALL_GITHUB_REPOS,
|
||||||
app,
|
app,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -100,9 +104,13 @@ TASK_MAPPINGS = {
|
|||||||
"track_git_changes": TRACK_GIT_CHANGES,
|
"track_git_changes": TRACK_GIT_CHANGES,
|
||||||
},
|
},
|
||||||
"backup": {
|
"backup": {
|
||||||
"backup_to_s3_directory": BACKUP_TO_S3_DIRECTORY,
|
"backup_path": BACKUP_PATH,
|
||||||
"backup_all": BACKUP_ALL,
|
"backup_all": BACKUP_ALL,
|
||||||
},
|
},
|
||||||
|
"github": {
|
||||||
|
"sync_all_repos": SYNC_ALL_GITHUB_REPOS,
|
||||||
|
"sync_repo": SYNC_GITHUB_REPO,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
QUEUE_MAPPINGS = {
|
QUEUE_MAPPINGS = {
|
||||||
"email": "email",
|
"email": "email",
|
||||||
@ -200,9 +208,9 @@ def backup_all(ctx):
|
|||||||
@backup.command("path")
|
@backup.command("path")
|
||||||
@click.option("--path", required=True, help="Path to backup")
|
@click.option("--path", required=True, help="Path to backup")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def backup_to_s3_directory(ctx, path):
|
def backup_path_cmd(ctx, path):
|
||||||
"""Backup a specific path."""
|
"""Backup a specific path."""
|
||||||
execute_task(ctx, "backup", "backup_to_s3_directory", path=path)
|
execute_task(ctx, "backup", "backup_path", path=path)
|
||||||
|
|
||||||
|
|
||||||
@cli.group()
|
@cli.group()
|
||||||
@ -533,5 +541,28 @@ def forums_sync_lesswrong_post(ctx, url):
|
|||||||
execute_task(ctx, "forums", "sync_lesswrong_post", url=url)
|
execute_task(ctx, "forums", "sync_lesswrong_post", url=url)
|
||||||
|
|
||||||
|
|
||||||
|
# Command group that namespaces the GitHub sync commands under `github`.
# The docstring doubles as the group's --help text, so it is kept verbatim.
@cli.group()
@click.pass_context
def github(ctx):
    """GitHub-related tasks."""
|
||||||
|
|
||||||
|
|
||||||
|
# Takes no options: hands the "sync_all_repos" task in the "github" category
# straight to execute_task.
@github.command("sync-all-repos")
@click.pass_context
def github_sync_all_repos(ctx):
    """Sync all active GitHub repos."""
    category, task = "github", "sync_all_repos"
    execute_task(ctx, category, task)
|
||||||
|
|
||||||
|
|
||||||
|
# Passes --repo-id and --force-full through to the "sync_repo" task as
# keyword arguments.
@github.command("sync-repo")
@click.option(
    "--repo-id",
    type=int,
    required=True,
    help="GitHub repo ID",
)
@click.option(
    "--force-full",
    is_flag=True,
    help="Force a full sync instead of incremental",
)
@click.pass_context
def github_sync_repo(ctx, repo_id, force_full):
    """Sync a specific GitHub repo."""
    task_kwargs = {"repo_id": repo_id, "force_full": force_full}
    execute_task(ctx, "github", "sync_repo", **task_kwargs)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cli()
|
cli()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user