Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support Tika for document text extraction #3582

Merged
merged 2 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 10,8 @@ node_modules
vite.config.js.timestamp-*
vite.config.ts.timestamp-*
__pycache__
.env
.idea
venv
_old
uploads
.ipynb_checkpoints
Expand Down
147 changes: 107 additions & 40 deletions backend/apps/rag/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 93,8 @@
SRC_LOG_LEVELS,
UPLOAD_DIR,
DOCS_DIR,
TEXT_EXTRACTION_ENGINE,
TIKA_SERVER_URL,
RAG_TOP_K,
RAG_RELEVANCE_THRESHOLD,
RAG_EMBEDDING_ENGINE,
Expand Down Expand Up @@ -148,6 150,9 @@
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
)

app.state.config.TEXT_EXTRACTION_ENGINE = TEXT_EXTRACTION_ENGINE
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL

app.state.config.CHUNK_SIZE = CHUNK_SIZE
app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP

Expand Down Expand Up @@ -388,6 393,10 @@ async def get_rag_config(user=Depends(get_admin_user)):
return {
"status": True,
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
"text_extraction": {
"engine": app.state.config.TEXT_EXTRACTION_ENGINE,
"tika_server_url": app.state.config.TIKA_SERVER_URL,
},
"chunk": {
"chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
Expand Down Expand Up @@ -417,6 426,11 @@ async def get_rag_config(user=Depends(get_admin_user)):
}


class TextExtractionConfig(BaseModel):
    """Text-extraction section of the RAG configuration update payload."""

    # Name of the extraction engine; defaults to the empty string.
    engine: str = ""
    # URL of the Tika server; may be omitted or null.
    tika_server_url: Optional[str] = None


class ChunkParamUpdateForm(BaseModel):
chunk_size: int
chunk_overlap: int
Expand Down Expand Up @@ -450,6 464,7 @@ class WebConfig(BaseModel):

class ConfigUpdateForm(BaseModel):
pdf_extract_images: Optional[bool] = None
text_extraction: Optional[TextExtractionConfig] = None
chunk: Optional[ChunkParamUpdateForm] = None
youtube: Optional[YoutubeLoaderConfig] = None
web: Optional[WebConfig] = None
Expand All @@ -463,6 478,11 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
else app.state.config.PDF_EXTRACT_IMAGES
)

if form_data.text_extraction is not None:
log.info(f"Updating text settings: {form_data.text_extraction}")
app.state.config.TEXT_EXTRACTION_ENGINE = form_data.text_extraction.engine
app.state.config.TIKA_SERVER_URL = form_data.text_extraction.tika_server_url

if form_data.chunk is not None:
app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
Expand Down Expand Up @@ -499,6 519,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
return {
"status": True,
"pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
"text_extraction": {
"engine": app.state.config.TEXT_EXTRACTION_ENGINE,
"tika_server_url": app.state.config.TIKA_SERVER_URL,
},
"chunk": {
"chunk_size": app.state.config.CHUNK_SIZE,
"chunk_overlap": app.state.config.CHUNK_OVERLAP,
Expand Down Expand Up @@ -985,6 1009,41 @@ def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> b
return False


class TikaLoader:
    """Document loader that delegates text extraction to a Tika server.

    The file is uploaded with an HTTP PUT to ``<TIKA_SERVER_URL>/tika/text``
    and the JSON response is converted into a single ``Document``.
    """

    def __init__(self, file_path, mime_type=None):
        """Remember which file to send and, optionally, its MIME type.

        Args:
            file_path: Path of the file whose text should be extracted.
            mime_type: Optional MIME type, sent as the request
                ``Content-Type`` header when not ``None``.
        """
        self.file_path = file_path
        self.mime_type = mime_type

    def load(self) -> List[Document]:
        """Send the file to Tika and return the extracted text.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the ``X-TIKA:content`` value of the response (or a placeholder
            string when that key is absent) and whose metadata carries the
            ``Content-Type``, if one is known.

        Raises:
            Exception: If the Tika server answers with a non-OK status.
        """
        with open(self.file_path, "rb") as f:
            data = f.read()

        if self.mime_type is not None:
            headers = {"Content-Type": self.mime_type}
        else:
            headers = {}

        # Build "<base>/tika/text" by appending to the configured base URL,
        # inserting a single "/" separator when the base lacks one.
        endpoint = app.state.config.TIKA_SERVER_URL
        if not endpoint.endswith("/"):
            endpoint += "/"
        endpoint += "tika/text"

        r = requests.put(endpoint, data=data, headers=headers)

        if not r.ok:
            raise Exception(f"Error calling Tika: {r.reason}")

        raw_metadata = r.json()
        text = raw_metadata.get("X-TIKA:content", "<No text content found>")

        # Prefer the content type reported in the response over the one we
        # sent with the request.
        if "Content-Type" in raw_metadata:
            headers["Content-Type"] = raw_metadata["Content-Type"]

        # The full document body can be large and may be sensitive, so keep
        # it out of INFO-level logs.
        log.debug("Tika extracted text: %s", text)

        return [Document(page_content=text, metadata=headers)]


def get_loader(filename: str, file_content_type: str, file_path: str):
file_ext = filename.split(".")[-1].lower()
known_type = True
Expand Down Expand Up @@ -1035,47 1094,55 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
"msg",
]

if file_ext == "pdf":
loader = PyPDFLoader(
file_path, extract_images=app.state.config.PDF_EXTRACT_IMAGES
)
elif file_ext == "csv":
loader = CSVLoader(file_path)
elif file_ext == "rst":
loader = UnstructuredRSTLoader(file_path, mode="elements")
elif file_ext == "xml":
loader = UnstructuredXMLLoader(file_path)
elif file_ext in ["htm", "html"]:
loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
elif file_ext == "md":
loader = UnstructuredMarkdownLoader(file_path)
elif file_content_type == "application/epub zip":
loader = UnstructuredEPubLoader(file_path)
elif (
file_content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
or file_ext in ["doc", "docx"]
):
loader = Docx2txtLoader(file_path)
elif file_content_type in [
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
] or file_ext in ["xls", "xlsx"]:
loader = UnstructuredExcelLoader(file_path)
elif file_content_type in [
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
] or file_ext in ["ppt", "pptx"]:
loader = UnstructuredPowerPointLoader(file_path)
elif file_ext == "msg":
loader = OutlookMessageLoader(file_path)
elif file_ext in known_source_ext or (
file_content_type and file_content_type.find("text/") >= 0
):
loader = TextLoader(file_path, autodetect_encoding=True)
if app.state.config.TEXT_EXTRACTION_ENGINE == "tika" and app.state.config.TIKA_SERVER_URL:
if file_ext in known_source_ext or (
file_content_type and file_content_type.find("text/") >= 0
):
loader = TextLoader(file_path, autodetect_encoding=True)
else:
loader = TikaLoader(file_path, file_content_type)
else:
loader = TextLoader(file_path, autodetect_encoding=True)
known_type = False
if file_ext == "pdf":
loader = PyPDFLoader(
file_path, extract_images=app.state.config.PDF_EXTRACT_IMAGES
)
elif file_ext == "csv":
loader = CSVLoader(file_path)
elif file_ext == "rst":
loader = UnstructuredRSTLoader(file_path, mode="elements")
elif file_ext == "xml":
loader = UnstructuredXMLLoader(file_path)
elif file_ext in ["htm", "html"]:
loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
elif file_ext == "md":
loader = UnstructuredMarkdownLoader(file_path)
elif file_content_type == "application/epub zip":
loader = UnstructuredEPubLoader(file_path)
elif (
file_content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
or file_ext in ["doc", "docx"]
):
loader = Docx2txtLoader(file_path)
elif file_content_type in [
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
] or file_ext in ["xls", "xlsx"]:
loader = UnstructuredExcelLoader(file_path)
elif file_content_type in [
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
] or file_ext in ["ppt", "pptx"]:
loader = UnstructuredPowerPointLoader(file_path)
elif file_ext == "msg":
loader = OutlookMessageLoader(file_path)
elif file_ext in known_source_ext or (
file_content_type and file_content_type.find("text/") >= 0
):
loader = TextLoader(file_path, autodetect_encoding=True)
else:
loader = TextLoader(file_path, autodetect_encoding=True)
known_type = False

return loader, known_type

Expand Down
16 changes: 16 additions & 0 deletions backend/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 878,22 @@ class BannerModel(BaseModel):
if WEBUI_AUTH and WEBUI_SECRET_KEY == "":
raise ValueError(ERROR_MESSAGES.ENV_VAR_NOT_FOUND)

####################################
# RAG document text extraction
####################################

# Engine used for document text extraction; read from the environment and
# lower-cased, falling back to the empty string when the variable is unset.
TEXT_EXTRACTION_ENGINE = PersistentConfig(
    "TEXT_EXTRACTION_ENGINE",
    "rag.text_extraction_engine",
    os.getenv("TEXT_EXTRACTION_ENGINE", "").lower(),
)

# Base URL of the Tika server; the fallback value targets a sidecar
# deployment.
TIKA_SERVER_URL = PersistentConfig(
    "TIKA_SERVER_URL",
    "rag.tika_server_url",
    os.getenv("TIKA_SERVER_URL", "http://tika:9998"),
)

####################################
# RAG
####################################
Expand Down
6 changes: 6 additions & 0 deletions src/lib/apis/rag/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 32,11 @@ type ChunkConfigForm = {
chunk_overlap: number;
};

// Text-extraction settings sent as part of the RAG configuration form.
type TextExtractConfigForm = {
// Name of the extraction engine.
engine: string;
// URL of the Tika server, or null when none is set.
tika_server_url: string | null;
};

type YoutubeConfigForm = {
language: string[];
translation?: string | null;
Expand All @@ -40,6 45,7 @@ type YoutubeConfigForm = {
// Shape of a RAG configuration update; every section is optional.
type RAGConfigForm = {
pdf_extract_images?: boolean;
chunk?: ChunkConfigForm;
// Text-extraction engine settings.
text_extraction?: TextExtractConfigForm;
web_loader_ssl_verification?: boolean;
youtube?: YoutubeConfigForm;
};
Expand Down
50 changes: 50 additions & 0 deletions src/lib/components/admin/Settings/Documents.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 37,10 @@
let embeddingModel = '';
let rerankingModel = '';

let textExtractionEngine = 'default';
let tikaServerUrl = '';
let showTikaServerUrl = false;

let chunkSize = 0;
let chunkOverlap = 0;
let pdfExtractImages = true;
Expand Down Expand Up @@ -163,11 167,20 @@
rerankingModelUpdateHandler();
}

if (textExtractionEngine === 'tika' && tikaServerUrl === '') {
toast.error($i18n.t('Tika Server URL required.'));
return;
}

const res = await updateRAGConfig(localStorage.token, {
pdf_extract_images: pdfExtractImages,
chunk: {
chunk_overlap: chunkOverlap,
chunk_size: chunkSize
},
text_extraction: {
engine: textExtractionEngine,
tika_server_url: tikaServerUrl
}
});

Expand Down Expand Up @@ -213,6 226,10 @@

chunkSize = res.chunk.chunk_size;
chunkOverlap = res.chunk.chunk_overlap;

textExtractionEngine = res.text_extraction.engine;
tikaServerUrl = res.text_extraction.tika_server_url;
showTikaServerUrl = textExtractionEngine === 'tika';
}
});
</script>
Expand Down Expand Up @@ -388,6 405,39 @@
</div>
</div>

<hr class="dark:border-gray-850" />

<div class="">
<div class="text-sm font-medium">{$i18n.t('Text Extraction')}</div>

<div class="flex w-full justify-between mt-2">
<div class="self-center text-xs font-medium">{$i18n.t('Engine')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={textExtractionEngine}
on:change={(e) => {
showTikaServerUrl = (e.target.value === 'tika');
}}
>
<option value="default">{$i18n.t('Default')}</option>
<option value="tika">{$i18n.t('Tika')}</option>
</select>
</div>
</div>

{#if showTikaServerUrl}
<div class="flex w-full mt-2">
<div class="flex-1 mr-2">
<input
class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
placeholder={$i18n.t('Enter Tika Server URL')}
bind:value={tikaServerUrl}
/>
</div>
</div>
{/if}
</div>
<hr class=" dark:border-gray-850 my-1" />

<div class="space-y-2" />
Expand Down