diff --git a/src/greek_lang/database/migrations/versions/20250621_2051_55f95da68641_.py b/src/greek_lang/database/migrations/versions/20250621_2051_55f95da68641_.py
new file mode 100644
index 0000000..a07db20
--- /dev/null
+++ b/src/greek_lang/database/migrations/versions/20250621_2051_55f95da68641_.py
@@ -0,0 +1,32 @@
+"""Drop the TEXT audio_file column from glossary_word.
+
+Revision ID: 55f95da68641
+Revises: 19fc4bee7a9f
+Create Date: 2025-06-21 20:51:15.097769
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '55f95da68641'
+down_revision: Union[str, None] = '19fc4bee7a9f'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('glossary_word', 'audio_file')
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('glossary_word', sa.Column('audio_file', sa.TEXT(), autoincrement=False, nullable=True))
+    # ### end Alembic commands ###
diff --git a/src/greek_lang/database/migrations/versions/20250621_2051_78357f437f61_.py b/src/greek_lang/database/migrations/versions/20250621_2051_78357f437f61_.py
new file mode 100644
index 0000000..76bc0f2
--- /dev/null
+++ b/src/greek_lang/database/migrations/versions/20250621_2051_78357f437f61_.py
@@ -0,0 +1,32 @@
+"""Add audio_file to glossary_word as a LargeBinary column.
+
+Revision ID: 78357f437f61
+Revises: 55f95da68641
+Create Date: 2025-06-21 20:51:29.437692
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '78357f437f61'
+down_revision: Union[str, None] = '55f95da68641'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('glossary_word', sa.Column('audio_file', sa.LargeBinary(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('glossary_word', 'audio_file')
+    # ### end Alembic commands ###
diff --git a/src/greek_lang/glossaries/models.py b/src/greek_lang/glossaries/models.py
index 715e911..7b8a7fd 100644
--- a/src/greek_lang/glossaries/models.py
+++ b/src/greek_lang/glossaries/models.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import datetime
 import enum
 
-from sqlalchemy import BigInteger, Text, DateTime, Enum, func
+from sqlalchemy import BigInteger, Text, DateTime, Enum, func, LargeBinary
 from sqlalchemy.dialects.postgresql import ARRAY
 from sqlalchemy.orm import Mapped, mapped_column
 
@@ -37,6 +37,7 @@ class GlossaryWord(Base):
     term: Mapped[str] = mapped_column(
         Text(),
         nullable=False,
+        unique=True,
     )
     language: Mapped[LanguageEnum] = mapped_column(
         Enum(LanguageEnum, native_enum=False),
@@ -75,8 +76,8 @@ class GlossaryWord(Base):
         Text(),
         nullable=True,
     )
-    audio_file: Mapped[str | None] = mapped_column(
-        Text(),
+    audio_file: Mapped[bytes | None] = mapped_column(
+        LargeBinary(),
         nullable=True,
     )
     created_at: Mapped[datetime.datetime] = mapped_column(
diff --git a/src/greek_lang/openai_manager/manager.py b/src/greek_lang/openai_manager/manager.py
index ab0e5f2..55a8b11 100644
--- a/src/greek_lang/openai_manager/manager.py
+++ b/src/greek_lang/openai_manager/manager.py
@@ -5,39 +5,47 @@ import dataclasses
 import pydantic
 from openai import AsyncOpenAI
 
+from greek_lang.languages import LanguageEnum
+from greek_lang.glossaries.models import LexicalCategoryEnum
+
 
 class WordInfo(pydantic.BaseModel):
+    lemma: str = pydantic.Field(
+        ...,
+        description="lemma (base form) - for verbs, use the 1st person singular in present indicative, "
+        "for nouns and adjectives, use the nominative singular masculine (for adjectives)",
+    )
     transcription: str = pydantic.Field(
         ...,
-        description="phonetic transcription in IPA",
+        description="lemma phonetic transcription in IPA",
     )
     translation: str = pydantic.Field(
         ...,
-        description="translation in {target_language}",
+        description="lemma translation in {target_language}",
     )
     description: str = pydantic.Field(
         ...,
-        description="description in {target_language}",
+        description="lemma description in {target_language}",
     )
     part_of_speech: str = pydantic.Field(
         ...,
-        description="part of speech in {target_language}",
+        description=f"part of speech, one of {[cat.value for cat in LexicalCategoryEnum]}",
     )
     example: str = pydantic.Field(
         ...,
-        description="example",
+        description="lemma example",
     )
     example_transcription: str = pydantic.Field(
         ...,
-        description="phonetic transcription in IPA of an example",
+        description="lemma phonetic transcription in IPA of an example",
     )
     example_translation: str = pydantic.Field(
         ...,
-        description="translation of the example in {target_language}",
+        description="lemma translation of the example in {target_language}",
     )
     category: str = pydantic.Field(
         ...,
         description="semantic category in {target_language}",
     )
     etymology: str = pydantic.Field(
         ...,
@@ -53,8 +61,8 @@ class OpenAiManager:
         self,
         *,
         word: str,
-        source_lang: str,
-        target_lang: str,
+        source_lang: LanguageEnum,
+        target_lang: LanguageEnum,
         model: str = "gpt-4o",
     ) -> WordInfo:
         system_message = {
@@ -63,7 +71,7 @@ class OpenAiManager:
         }
         user_message = {
             "role": "user",
-            "content": f'Provide detailed information about the word "{word}" in language {source_lang}, set {{target_language}} = {target_lang}.',
+            "content": f'Provide detailed information about the word "{word}" in language {source_lang!s}, set {{target_language}} = {target_lang!s}.',
         }
         response = await self.client.beta.chat.completions.parse(
             model=model,
diff --git a/src/greek_lang/translator.py b/src/greek_lang/translator.py
new file mode 100644
index 0000000..d5994b7
--- /dev/null
+++ b/src/greek_lang/translator.py
@@ -0,0 +1,56 @@
+from dependency_injector.wiring import inject, Provide
+from sqlalchemy.ext.asyncio import async_sessionmaker, AsyncSession
+
+from greek_lang.audio.manager import get_pronunciation
+from greek_lang.database.container import DatabaseContainer
+from greek_lang.languages import LanguageEnum
+from greek_lang.openai_manager.container import OpenAiContainer
+from greek_lang.openai_manager.manager import OpenAiManager
+from greek_lang.glossaries.models import GlossaryWord, LexicalCategoryEnum
+
+
+@inject
+async def translate(
+    word: str,
+    source_lang: LanguageEnum,
+    target_lang: LanguageEnum = LanguageEnum.ru,
+    note: str | None = None,
+    tags: tuple[str, ...] = tuple(),
+    open_ai_manager: OpenAiManager = Provide[OpenAiContainer.ai_manager],
+    db_session_maker: async_sessionmaker[AsyncSession] = Provide[
+        DatabaseContainer.async_session_maker,
+    ],
+) -> GlossaryWord:
+    """Look up a word via OpenAI and store it as a glossary entry.
+
+    Asks ``open_ai_manager`` for details about ``word``, fetches the
+    pronunciation audio for the returned lemma, and adds a new
+    ``GlossaryWord`` inside a transaction opened from ``db_session_maker``.
+    The entry is keyed by the lemma, so the audio is generated for the
+    lemma as well rather than for the raw input word.
+    """
+    word_response = await open_ai_manager.get_gpt_response(
+        word=word,
+        source_lang=source_lang,
+        target_lang=target_lang,
+    )
+    # Pronounce the lemma that is stored as ``term``, not a fixed sample word.
+    pronon = await get_pronunciation(text=word_response.lemma, source_lang=source_lang)
+
+    async with db_session_maker() as db_session, db_session.begin():
+        glossary_word = GlossaryWord(
+            term=word_response.lemma,
+            language=source_lang.value,
+            transcription=word_response.transcription,
+            translation=word_response.translation,
+            description=word_response.description,
+            lexical_category=LexicalCategoryEnum(word_response.part_of_speech),
+            meaning_category=word_response.category,
+            example=f"{word_response.example}({word_response.example_translation})",
+            etymology=word_response.etymology,
+            note=note,
+            tags=list(tags),
+            audio_file=pronon.getvalue(),
+        )
+        db_session.add(glossary_word)
+    return glossary_word