add translator

This commit is contained in:
ruslangilfanov 2025-06-22 14:37:27 +03:00
parent 709adbf507
commit c9911c8abc
No known key found for this signature in database
5 changed files with 134 additions and 14 deletions

View File

@ -0,0 +1,32 @@
"""empty message
Revision ID: 55f95da68641
Revises: 19fc4bee7a9f
Create Date: 2025-06-21 20:51:15.097769
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = '55f95da68641'
down_revision: Union[str, None] = '19fc4bee7a9f'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Apply this revision: remove ``audio_file`` from ``glossary_word``."""
    # Dropping the column discards whatever values it currently holds.
    table_name, column_name = 'glossary_word', 'audio_file'
    op.drop_column(table_name, column_name)
def downgrade() -> None:
    """Revert this revision: add ``audio_file`` back as a nullable TEXT column."""
    # Only the column definition is recreated here; no data is written to it.
    audio_file_column = sa.Column(
        'audio_file',
        sa.TEXT(),
        autoincrement=False,
        nullable=True,
    )
    op.add_column('glossary_word', audio_file_column)

View File

@ -0,0 +1,32 @@
"""empty message
Revision ID: 78357f437f61
Revises: 55f95da68641
Create Date: 2025-06-21 20:51:29.437692
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = '78357f437f61'
down_revision: Union[str, None] = '55f95da68641'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Apply this revision: add a nullable binary ``audio_file`` column."""
    # The column is nullable, so existing ``glossary_word`` rows need no value.
    audio_file_column = sa.Column(
        'audio_file',
        sa.LargeBinary(),
        nullable=True,
    )
    op.add_column('glossary_word', audio_file_column)
def downgrade() -> None:
    """Revert this revision: remove ``audio_file`` from ``glossary_word``."""
    # Dropping the column discards whatever binary values it currently holds.
    table_name, column_name = 'glossary_word', 'audio_file'
    op.drop_column(table_name, column_name)

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import datetime import datetime
import enum import enum
from sqlalchemy import BigInteger, Text, DateTime, Enum, func from sqlalchemy import BigInteger, Text, DateTime, Enum, func, LargeBinary
from sqlalchemy.dialects.postgresql import ARRAY from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy.orm import Mapped, mapped_column
@ -37,6 +37,7 @@ class GlossaryWord(Base):
term: Mapped[str] = mapped_column( term: Mapped[str] = mapped_column(
Text(), Text(),
nullable=False, nullable=False,
unique=True,
) )
language: Mapped[LanguageEnum] = mapped_column( language: Mapped[LanguageEnum] = mapped_column(
Enum(LanguageEnum, native_enum=False), Enum(LanguageEnum, native_enum=False),
@ -75,8 +76,8 @@ class GlossaryWord(Base):
Text(), Text(),
nullable=True, nullable=True,
) )
audio_file: Mapped[str | None] = mapped_column( audio_file: Mapped[bytes | None] = mapped_column(
Text(), LargeBinary(),
nullable=True, nullable=True,
) )
created_at: Mapped[datetime.datetime] = mapped_column( created_at: Mapped[datetime.datetime] = mapped_column(

View File

@ -5,39 +5,47 @@ import dataclasses
import pydantic import pydantic
from openai import AsyncOpenAI from openai import AsyncOpenAI
from greek_lang.languages import LanguageEnum
from greek_lang.glossaries.models import LexicalCategoryEnum
class WordInfo(pydantic.BaseModel): class WordInfo(pydantic.BaseModel):
lemma: str = pydantic.Field(
...,
description="lemma (base form) - for verbs, use the 1st person singular in present indicative, "
"for nouns and adjectives, use the nominative singular masculine (for adjectives)",
)
transcription: str = pydantic.Field( transcription: str = pydantic.Field(
..., ...,
description="phonetic transcription in IPA", description="lemma phonetic transcription in IPA",
) )
translation: str = pydantic.Field( translation: str = pydantic.Field(
..., ...,
description="translation in {target_language}", description="lemma translation in {target_language}",
) )
description: str = pydantic.Field( description: str = pydantic.Field(
..., ...,
description="description in {target_language}", description="lemma description in {target_language}",
) )
part_of_speech: str = pydantic.Field( part_of_speech: str = pydantic.Field(
..., ...,
description="part of speech in {target_language}", description=f"part of speech, one of {[cat.value for cat in LexicalCategoryEnum]}",
) )
example: str = pydantic.Field( example: str = pydantic.Field(
..., ...,
description="example", description="lemma example",
) )
example_transcription: str = pydantic.Field( example_transcription: str = pydantic.Field(
..., ...,
description="phonetic transcription in IPA of an example", description="lemma phonetic transcription in IPA of an example",
) )
example_translation: str = pydantic.Field( example_translation: str = pydantic.Field(
..., ...,
description="translation of the example in {target_language}", description="lemma translation of the example in {target_language}",
) )
category: str = pydantic.Field( category: str = pydantic.Field(
..., ...,
description="semantic category in {target_language}", description=f"semantic category in {{target_language}}",
) )
etymology: str = pydantic.Field( etymology: str = pydantic.Field(
..., ...,
@ -53,8 +61,8 @@ class OpenAiManager:
self, self,
*, *,
word: str, word: str,
source_lang: str, source_lang: LanguageEnum,
target_lang: str, target_lang: LanguageEnum,
model: str = "gpt-4o", model: str = "gpt-4o",
) -> WordInfo: ) -> WordInfo:
system_message = { system_message = {
@ -63,7 +71,7 @@ class OpenAiManager:
} }
user_message = { user_message = {
"role": "user", "role": "user",
"content": f'Provide detailed information about the word "{word}" in language {source_lang}, set {{target_language}} = {target_lang}.', "content": f'Provide detailed information about the word "{word}" in language {source_lang!s}, set {{target_language}} = {target_lang!s}.',
} }
response = await self.client.beta.chat.completions.parse( response = await self.client.beta.chat.completions.parse(
model=model, model=model,

View File

@ -0,0 +1,47 @@
from dependency_injector.wiring import inject, Provide
from sqlalchemy.ext.asyncio import async_sessionmaker, AsyncSession
from greek_lang.audio.manager import get_pronunciation
from greek_lang.database.container import DatabaseContainer
from greek_lang.languages import LanguageEnum
from greek_lang.openai_manager.container import OpenAiContainer
from greek_lang.openai_manager.manager import OpenAiManager
from greek_lang.glossaries.models import GlossaryWord, LexicalCategoryEnum
@inject
async def translate(
    word: str,
    source_lang: LanguageEnum,
    target_lang: LanguageEnum = LanguageEnum.ru,
    note: str | None = None,
    tags: tuple[str, ...] = tuple(),
    open_ai_manager: OpenAiManager = Provide[OpenAiContainer.ai_manager],
    db_session_maker: async_sessionmaker[AsyncSession] = Provide[
        DatabaseContainer.async_session_maker,
    ],
) -> GlossaryWord:
    """Look up *word*, fetch its pronunciation and store it as a glossary entry.

    The word is described via ``open_ai_manager.get_gpt_response``, audio for
    the returned lemma is requested from ``get_pronunciation``, and a new
    ``GlossaryWord`` is added to a session inside a ``begin()`` transaction.

    Args:
        word: The word to look up, in ``source_lang``.
        source_lang: Language of ``word``; also used for the pronunciation.
        target_lang: Language requested for translation and description.
        note: Optional free-form note stored with the entry.
        tags: Tags stored with the entry (saved as a list).
        open_ai_manager: Injected OpenAI manager.
        db_session_maker: Injected factory for async database sessions.

    Returns:
        The ``GlossaryWord`` instance that was added to the session.

    Raises:
        ValueError: Presumably, if the reported part of speech is not a valid
            ``LexicalCategoryEnum`` value (standard ``Enum`` lookup) — confirm.
    """
    word_response = await open_ai_manager.get_gpt_response(
        word=word,
        source_lang=source_lang,
        target_lang=target_lang,
    )

    # Pronounce the lemma that is stored as ``term`` so the audio matches the
    # entry; previously a hard-coded sample word was sent for every request.
    pronunciation = await get_pronunciation(
        text=word_response.lemma,
        source_lang=source_lang,
    )

    # NOTE(review): ``getvalue()`` suggests an in-memory buffer such as
    # ``io.BytesIO`` — confirm against ``get_pronunciation``.
    audio_bytes = pronunciation.getvalue()

    glossary_word = GlossaryWord(
        term=word_response.lemma,
        language=source_lang.value,
        transcription=word_response.transcription,
        translation=word_response.translation,
        description=word_response.description,
        lexical_category=LexicalCategoryEnum(word_response.part_of_speech),
        meaning_category=word_response.category,
        example=f"{word_response.example}({word_response.example_translation})",
        etymology=word_response.etymology,
        note=note,
        tags=list(tags),
        audio_file=audio_bytes,
    )

    # ``begin()`` commits when the block exits normally and rolls back on error.
    async with db_session_maker() as db_session, db_session.begin():
        db_session.add(glossary_word)

    # NOTE(review): the instance is returned after the session has closed; if
    # the session maker keeps ``expire_on_commit=True`` its attributes may be
    # expired for the caller — confirm the session maker configuration.
    return glossary_word