Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support Tika for document text extraction #3582

Merged
merged 2 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Added HTML and TypeScript UI components to support configuration of text extraction engine.

Updated RAG /config and /config/update endpoints to support UI updates.

Fixed .dockerignore to prevent Python venv from being copied into Docker image.
  • Loading branch information
nickovs committed Jul 1, 2024
commit 7aa35a37573c1d0af136176756a16ba73b74f74b
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ vite.config.js.timestamp-*
vite.config.ts.timestamp-*
__pycache__
.idea
venv
_old
uploads
.ipynb_checkpoints
Expand Down
30 changes: 25 additions & 5 deletions backend/apps/rag/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
SRC_LOG_LEVELS,
UPLOAD_DIR,
DOCS_DIR,
DOCUMENT_USE_TIKA,
TEXT_EXTRACTION_ENGINE,
TIKA_SERVER_URL,
RAG_TOP_K,
RAG_RELEVANCE_THRESHOLD,
Expand Down Expand Up @@ -150,6 +150,9 @@
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
)

app.state.config.TEXT_EXTRACTION_ENGINE = TEXT_EXTRACTION_ENGINE
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL

app.state.config.CHUNK_SIZE = CHUNK_SIZE
app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP

Expand Down Expand Up @@ -390,6 +393,10 @@ async def get_rag_config(user=Depends(get_admin_user)):
return {
"status": True,
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
"text_extraction": {
"engine": app.state.config.TEXT_EXTRACTION_ENGINE,
"tika_server_url": app.state.config.TIKA_SERVER_URL,
},
"chunk": {
"chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
Expand Down Expand Up @@ -419,6 +426,11 @@ async def get_rag_config(user=Depends(get_admin_user)):
}


class TextExtractionConfig(BaseModel):
engine: str = ""
tika_server_url: Optional[str] = None


class ChunkParamUpdateForm(BaseModel):
chunk_size: int
chunk_overlap: int
Expand Down Expand Up @@ -452,6 +464,7 @@ class WebConfig(BaseModel):

class ConfigUpdateForm(BaseModel):
pdf_extract_images: Optional[bool] = None
text_extraction: Optional[TextExtractionConfig] = None
chunk: Optional[ChunkParamUpdateForm] = None
youtube: Optional[YoutubeLoaderConfig] = None
web: Optional[WebConfig] = None
Expand All @@ -465,6 +478,11 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
else app.state.config.PDF_EXTRACT_IMAGES
)

if form_data.text_extraction is not None:
log.info(f"Updating text settings: {form_data.text_extraction}")
app.state.config.TEXT_EXTRACTION_ENGINE = form_data.text_extraction.engine
app.state.config.TIKA_SERVER_URL = form_data.text_extraction.tika_server_url

if form_data.chunk is not None:
app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
Expand Down Expand Up @@ -501,6 +519,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
return {
"status": True,
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
"text_extraction": {
"engine": app.state.config.TEXT_EXTRACTION_ENGINE,
"tika_server_url": app.state.config.TIKA_SERVER_URL,
},
"chunk": {
"chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
Expand Down Expand Up @@ -1001,7 +1023,7 @@ def load(self) -> List[Document]:
else:
headers = {}

endpoint = str(TIKA_SERVER_URL)
endpoint = app.state.config.TIKA_SERVER_URL
if not endpoint.endswith("/"):
endpoint += "/"
endpoint += "tika/text"
Expand Down Expand Up @@ -1072,9 +1094,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
"msg",
]

log.warning("Use tika: %s, server URL: %s", DOCUMENT_USE_TIKA, TIKA_SERVER_URL)

if DOCUMENT_USE_TIKA and TIKA_SERVER_URL:
if app.state.config.TEXT_EXTRACTION_ENGINE == "tika" and app.state.config.TIKA_SERVER_URL:
if file_ext in known_source_ext or (
file_content_type and file_content_type.find("text/") >= 0
):
Expand Down
8 changes: 4 additions & 4 deletions backend/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,10 +882,10 @@ class BannerModel(BaseModel):
# RAG document text extraction
####################################

DOCUMENT_USE_TIKA = PersistentConfig(
"DOCUMENT_USE_TIKA",
"rag.document_use_tika",
os.environ.get("DOCUMENT_USE_TIKA", "false").lower() == "true"
TEXT_EXTRACTION_ENGINE = PersistentConfig(
"TEXT_EXTRACTION_ENGINE",
"rag.text_extraction_engine",
os.environ.get("TEXT_EXTRACTION_ENGINE", "").lower()
)

TIKA_SERVER_URL = PersistentConfig(
Expand Down
6 changes: 6 additions & 0 deletions src/lib/apis/rag/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ type ChunkConfigForm = {
chunk_overlap: number;
};

type TextExtractConfigForm = {
engine: string;
tika_server_url: string | null;
};

type YoutubeConfigForm = {
language: string[];
translation?: string | null;
Expand All @@ -40,6 +45,7 @@ type YoutubeConfigForm = {
type RAGConfigForm = {
pdf_extract_images?: boolean;
chunk?: ChunkConfigForm;
text_extraction?: TextExtractConfigForm;
web_loader_ssl_verification?: boolean;
youtube?: YoutubeConfigForm;
};
Expand Down
50 changes: 50 additions & 0 deletions src/lib/components/admin/Settings/Documents.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@
let embeddingModel = '';
let rerankingModel = '';

let textExtractionEngine = 'default';
let tikaServerUrl = '';
let showTikaServerUrl = false;

let chunkSize = 0;
let chunkOverlap = 0;
let pdfExtractImages = true;
Expand Down Expand Up @@ -163,11 +167,20 @@
rerankingModelUpdateHandler();
}

if (textExtractionEngine === 'tika' && tikaServerUrl === '') {
toast.error($i18n.t('Tika Server URL required.'));
return;
}

const res = await updateRAGConfig(localStorage.token, {
pdf_extract_images: pdfExtractImages,
chunk: {
chunk_overlap: chunkOverlap,
chunk_size: chunkSize
},
text_extraction: {
engine: textExtractionEngine,
tika_server_url: tikaServerUrl
}
});

Expand Down Expand Up @@ -213,6 +226,10 @@

chunkSize = res.chunk.chunk_size;
chunkOverlap = res.chunk.chunk_overlap;

textExtractionEngine = res.text_extraction.engine;
tikaServerUrl = res.text_extraction.tika_server_url;
showTikaServerUrl = textExtractionEngine === 'tika';
}
});
</script>
Expand Down Expand Up @@ -388,6 +405,39 @@
</div>
</div>

<hr class="dark:border-gray-850" />

<div class="">
<div class="text-sm font-medium">{$i18n.t('Text Extraction')}</div>

<div class="flex w-full justify-between mt-2">
<div class="self-center text-xs font-medium">{$i18n.t('Engine')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={textExtractionEngine}
on:change={(e) => {
showTikaServerUrl = (e.target.value === 'tika');
}}
>
<option value="default">{$i18n.t('Default')}</option>
<option value="tika">{$i18n.t('Tika')}</option>
</select>
</div>
</div>

{#if showTikaServerUrl}
<div class="flex w-full mt-2">
<div class="flex-1 mr-2">
<input
class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
placeholder={$i18n.t('Enter Tika Server URL')}
bind:value={tikaServerUrl}
/>
</div>
</div>
{/if}
</div>
<hr class=" dark:border-gray-850 my-1" />

<div class="space-y-2" />
Expand Down