Mirror of https://github.com/mruwnik/memory.git (synced 2026-01-02 09:12:58 +01:00)
fix google docs bugs

commit 59c45ff1fb
parent c08a10da28
@@ -206,7 +206,7 @@ services:
     <<: *worker-base
     environment:
       <<: *worker-env
-      QUEUES: "backup,blogs,comic,discord,ebooks,email,forums,github,people,photo_embed,maintenance,notes,scheduler"
+      QUEUES: "backup,blogs,comic,discord,ebooks,email,forums,github,google,people,photo_embed,maintenance,notes,scheduler"
 
   ingest-hub:
     <<: *worker-base
@@ -44,7 +44,7 @@ RUN git config --global user.email "${GIT_USER_EMAIL}" && \
     git config --global user.name "${GIT_USER_NAME}"
 
 # Default queues to process
-ENV QUEUES="backup,blogs,comic,discord,ebooks,email,forums,github,people,photo_embed,maintenance"
+ENV QUEUES="backup,blogs,comic,discord,ebooks,email,forums,github,google,people,photo_embed,maintenance,notes,scheduler"
 ENV PYTHONPATH="/app"
 
 ENTRYPOINT ["./entry.sh"]
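
The two QUEUES changes above add the new google queue to both the compose service and the image default. A minimal sketch of how a worker entrypoint might hand that list to Celery (hypothetical: the real entry.sh and Celery app are not part of this diff):

    import os
    from celery import Celery

    # Hypothetical app/broker names; the project's actual Celery config will differ.
    app = Celery("memory", broker=os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379/0"))

    if __name__ == "__main__":
        # Consume only the queues named in QUEUES, e.g. "...,github,google,people,..."
        queues = os.environ.get("QUEUES", "maintenance")
        app.worker_main(argv=["worker", "--loglevel=INFO", "-Q", queues])
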
@@ -85,7 +85,7 @@ ALL_COLLECTIONS: dict[str, Collection] = {
     "doc": {
         "dimension": 1024,
         "distance": "Cosine",
-        "text": False,
+        "text": True,
         "multimodal": True,
     },
     # Observations
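
The "doc" collection is now flagged as text-capable as well as multimodal, at 1024 dimensions with cosine distance. Assuming a Qdrant-style vector store (an assumption: the backing store is not visible in this hunk), wiring one ALL_COLLECTIONS entry to an actual collection could look roughly like this:

    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams

    # Hypothetical helper: create a vector collection from one ALL_COLLECTIONS entry.
    def create_collection_from_entry(client: QdrantClient, name: str, cfg: dict) -> None:
        distance = Distance.COSINE if cfg["distance"] == "Cosine" else Distance.DOT
        client.create_collection(
            collection_name=name,
            vectors_config=VectorParams(size=cfg["dimension"], distance=distance),
        )

    create_collection_from_entry(
        QdrantClient("localhost"),
        "doc",
        {"dimension": 1024, "distance": "Cosine", "text": True, "multimodal": True},
    )
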
@@ -106,6 +106,24 @@ class GoogleDriveClient:
         self._service = build("drive", "v3", credentials=creds)
         return self._service
 
+    def get_file_metadata(self, file_id: str) -> dict:
+        """Get metadata for a single file or folder."""
+        service = self._get_service()
+        return (
+            service.files()
+            .get(
+                fileId=file_id,
+                fields="id, name, mimeType, modifiedTime, createdTime, owners, lastModifyingUser, parents, size",
+                supportsAllDrives=True,
+            )
+            .execute()
+        )
+
+    def is_folder(self, file_id: str) -> bool:
+        """Check if a file ID refers to a folder."""
+        metadata = self.get_file_metadata(file_id)
+        return metadata.get("mimeType") == "application/vnd.google-apps.folder"
+
     def list_files_in_folder(
         self,
         folder_id: str,
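
A short usage sketch of the two new helpers, assuming an already-constructed and authorized GoogleDriveClient (construction and credential handling sit outside this hunk):

    client = GoogleDriveClient()          # assumed to resolve credentials internally
    file_id = "some-drive-file-id"        # hypothetical ID, for illustration only

    meta = client.get_file_metadata(file_id)
    if client.is_folder(file_id):
        # Folders are identified by the special Drive MIME type checked above.
        print(f"{meta['name']} is a folder")
    else:
        print(f"{meta['name']} is a file, last modified {meta.get('modifiedTime')}")
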
@@ -141,6 +159,8 @@ class GoogleDriveClient:
                     fields="nextPageToken, files(id, name, mimeType, modifiedTime, createdTime, owners, lastModifyingUser, parents, size)",
                     pageToken=page_token,
                     pageSize=page_size,
+                    supportsAllDrives=True,
+                    includeItemsFromAllDrives=True,
                 )
                 .execute()
             )
@@ -191,7 +211,11 @@ class GoogleDriveClient:
         while current_id:
             try:
                 file = (
-                    service.files().get(fileId=current_id, fields="name, parents").execute()
+                    service.files().get(
+                        fileId=current_id,
+                        fields="name, parents",
+                        supportsAllDrives=True,
+                    ).execute()
                 )
                 path_parts.insert(0, file["name"])
                 parents = file.get("parents", [])
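
Both files().get and files().list now carry the shared-drive flags, so lookups and listings also cover items stored on shared drives. For reference, a minimal raw Drive v3 call with those flags; `service` and `folder_id` are assumed to exist, with `service` being an authorized build("drive", "v3", ...) handle as in the client above:

    # List one page of a folder's children, including items stored on shared drives.
    response = (
        service.files()
        .list(
            q=f"'{folder_id}' in parents and trashed = false",
            fields="nextPageToken, files(id, name, mimeType, modifiedTime)",
            pageSize=100,
            supportsAllDrives=True,          # allow operating on shared-drive items
            includeItemsFromAllDrives=True,  # include shared-drive items in results
        )
        .execute()
    )
    for item in response.get("files", []):
        print(item["id"], item["name"])
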
@@ -11,6 +11,7 @@ from memory.workers.tasks import (
     email,
     forums,
     github,
+    google_drive,
     maintenance,
     notes,
     observations,
@@ -28,6 +29,7 @@ __all__ = [
     "email",
     "forums",
     "github",
+    "google_drive",
     "maintenance",
     "notes",
     "observations",
@@ -237,13 +237,22 @@ def sync_google_folder(folder_id: int, force_full: bool = False) -> dict[str, An
 
     docs_synced = 0
     task_ids = []
+    is_single_doc = False
 
     try:
-        # Get folder path for context
-        folder_path = client.get_folder_path(cast(str, folder.folder_id))
+        google_id = cast(str, folder.folder_id)
+
+        # Check if this is a single document or a folder
+        file_metadata = client.get_file_metadata(google_id)
+        is_folder = file_metadata.get("mimeType") == "application/vnd.google-apps.folder"
+        is_single_doc = not is_folder
+
+        if is_folder:
+            # It's a folder - list and sync all files inside
+            folder_path = client.get_folder_path(google_id)
 
             for file_meta in client.list_files_in_folder(
-                cast(str, folder.folder_id),
+                google_id,
                 recursive=cast(bool, folder.recursive),
                 since=since,
             ):
@@ -256,6 +265,38 @@ def sync_google_folder(folder_id: int, force_full: bool = False) -> dict[str, An
                 except Exception as e:
                     logger.error(f"Error fetching file {file_meta.get('name')}: {e}")
                     continue
+        else:
+            # It's a single document - sync it directly
+            logger.info(f"Syncing single document: {file_metadata.get('name')}")
+            folder_path = client.get_folder_path(google_id)
+
+            # Check if we need to sync based on modification time
+            if since and file_metadata.get("modifiedTime"):
+                from memory.parsers.google_drive import parse_google_date
+                modified_at = parse_google_date(file_metadata.get("modifiedTime"))
+                if modified_at and modified_at <= since:
+                    logger.info(f"Document not modified since last sync, skipping")
+                    folder.last_sync_at = now
+                    session.commit()
+                    return {
+                        "status": "completed",
+                        "sync_type": "incremental",
+                        "folder_id": folder_id,
+                        "folder_name": folder.folder_name,
+                        "docs_synced": 0,
+                        "task_ids": [],
+                        "is_single_doc": True,
+                    }
+
+            try:
+                file_data = client.fetch_file(file_metadata, folder_path)
+                serialized = _serialize_file_data(file_data)
+                task = sync_google_doc.delay(folder.id, serialized)
+                task_ids.append(task.id)
+                docs_synced = 1
+            except Exception as e:
+                logger.error(f"Error fetching document {file_metadata.get('name')}: {e}")
+                raise
+
         # Update sync timestamps
         folder.last_sync_at = now
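
parse_google_date is imported from memory.parsers.google_drive, whose body is not part of this diff. A plausible sketch, assuming it parses Drive's RFC 3339 timestamps (e.g. "2024-01-15T10:30:00.000Z"); the real helper may differ:

    from datetime import datetime, timezone

    def parse_google_date(value: str | None) -> datetime | None:
        """Parse an RFC 3339 timestamp as returned by the Drive API (hypothetical sketch)."""
        if not value:
            return None
        try:
            # fromisoformat() on Python < 3.11 rejects a trailing "Z".
            return datetime.fromisoformat(value.replace("Z", "+00:00")).astimezone(timezone.utc)
        except ValueError:
            return None
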
@@ -275,6 +316,7 @@ def sync_google_folder(folder_id: int, force_full: bool = False) -> dict[str, An
         "folder_name": folder.folder_name,
         "docs_synced": docs_synced,
         "task_ids": task_ids,
+        "is_single_doc": is_single_doc,
     }
 
 
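
With is_single_doc in the result, callers can tell whether the configured source was a folder or a single document. A hedged example of queueing the task and reading the summary (assuming these are plain Celery tasks, which the .delay() calls above suggest; the folder_id value is hypothetical):

    # folder_id is the database row ID of the configured folder record, not the Drive ID.
    result = sync_google_folder.delay(folder_id=42, force_full=False)

    summary = result.get(timeout=300)
    if summary["is_single_doc"]:
        print(f"Source was a single Google Doc; queued {summary['docs_synced']} sync task")
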