feat: update/add pageSeparator params to LlamaParseReader (#1051)

run-llama · Jul 25, 2024 · fb6db45 · fb6db45
1 parent e4d4e0d
commit fb6db45
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 5 deletions.
diff --git a/.changeset/weak-news-train.md b/.changeset/weak-news-train.md
@@ -0,0 1,5 @@
 ---
 "llamaindex": patch
 ---
 
 feat: add pageSeparator params to LlamaParseReader
diff --git a/apps/docs/docs/modules/data_loaders/discord.mdx b/apps/docs/docs/modules/data_loaders/discord.mdx
@@ -20,7 20,7 @@ Copy the URL in your browser and select the server you want your bot to join.
 #### DiscordReader()
 
 - `discordToken?`: The Discord bot token.
-- `makeRequest?`: Optionally provide a custom request function for edge environments, e.g. `fetch`. See discord.js for more info.
 - `requestHandler?`: Optionally provide a custom request function for edge environments, e.g. `fetch`. See discord.js for more info.
 
 #### DiscordReader.loadData
 

diff --git a/apps/docs/docs/modules/data_loaders/index.mdx b/apps/docs/docs/modules/data_loaders/index.mdx
@@ -16,7 16,15 @@ It is a simple reader that reads all files from a directory and its subdirectori
 
 <CodeBlock language="ts">{CodeSource}</CodeBlock>
 
-Currently, it supports reading `.txt`, `.pdf`, `.csv`, `.md`, `.docx`, `.htm`, `.html`, `.jpg`, `.jpeg`, `.png` and `.gif` files, but support for other file types is planned.
 Currently, the following readers are mapped to specific file types:
 
 - [TextFileReader](../../api/classes/TextFileReader.md): `.txt`
 - [PDFReader](../../api/classes/PDFReader.md): `.pdf`
 - [PapaCSVReader](../../api/classes/PapaCSVReader.md): `.csv`
 - [MarkdownReader](../../api/classes/MarkdownReader.md): `.md`
 - [DocxReader](../../api/classes/DocxReader.md): `.docx`
 - [HTMLReader](../../api/classes/HTMLReader.md): `.htm`, `.html`
 - [ImageReader](../../api/classes/ImageReader.md): `.jpg`, `.jpeg`, `.png`, `.gif`
 
 You can modify the reader three different ways:
 

diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
@@ -41,7 41,9 @@ They can be divided into two groups.
 - `doNotCache?` Optional. Set to true to not cache the document.
 - `fastMode?` Optional. Set to true to use the fast mode. This mode will skip OCR of images, and table/heading reconstruction. Note: Non-compatible with `gpt4oMode`.
 - `doNotUnrollColumns?` Optional. Set to true to keep the text according to document layout. Reduces reconstruction accuracy and LLM/embedding performance in most cases.
-- `pageSeparator?` Optional. The page separator to use. Defaults is `\\n---\\n`.
 - `pageSeparator?` Optional. A templated page separator to use to split the text. If the results contain `{page_number}` (e.g. JSON mode), it will be replaced by the next page number. If not set, the default separator `\\n---\\n` will be used.
 - `pagePrefix?` Optional. A templated prefix to add to the beginning of each page. If the results contain `{page_number}`, it will be replaced by the page number.
 - `pageSuffix?` Optional. A templated suffix to add to the end of each page. If the results contain `{page_number}`, it will be replaced by the page number.
 - `gpt4oMode` Deprecated. Use vendorMultimodal params. Set to true to use GPT-4o to extract content. Default is `false`.
 - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
 - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.

diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts
@@ -127,8 127,12 @@ export class LlamaParseReader extends FileReader {
  fastMode?: boolean;
  // Whether to keep columns in the text according to document layout. Reduces reconstruction accuracy and LLM/embedding performance in most cases.
  doNotUnrollColumns?: boolean;
- // The page separator to use to split the text. Default is None, which means the parser will use the default separator '\\n---\\n'.
  // A templated page separator to use to split the text. If the results contain `{page_number}` (e.g. JSON mode), it will be replaced by the next page number. If not set, the default separator '\\n---\\n' will be used.
  pageSeparator?: string;
  // A templated prefix to add to the beginning of each page. If the results contain `{page_number}`, it will be replaced by the page number.
  pagePrefix?: string;
  // A templated suffix to add to the end of each page. If the results contain `{page_number}`, it will be replaced by the page number.
  pageSuffix?: string;
  // Deprecated. Use vendorMultimodal params. Whether to use gpt-4o to extract text from documents.
  gpt4oMode: boolean = false;
  // Deprecated. Use vendorMultimodal params. The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY.
@@ -198,6 202,8 @@ export class LlamaParseReader extends FileReader {
  fast_mode: this.fastMode?.toString(),
  do_not_unroll_columns: this.doNotUnrollColumns?.toString(),
  page_separator: this.pageSeparator,
  page_prefix: this.pagePrefix,
  page_suffix: this.pageSuffix,
  gpt4o_mode: this.gpt4oMode?.toString(),
  gpt4o_api_key: this.gpt4oApiKey,
  bounding_box: this.boundingBox,
@@ -207,8 213,17 @@ export class LlamaParseReader extends FileReader {
  vendor_multimodal_api_key: this.vendorMultimodalApiKey,
  };
 
  // Filter out params with invalid values that would cause issues on the backend.
  const filteredParams = this.filterSpecificParams(LlamaParseBodyParams, [
  "page_separator",
  "page_prefix",
  "page_suffix",
  "bounding_box",
  "target_pages",
  ]);
 
  // Appends body with any defined LlamaParseBodyParams
- Object.entries(LlamaParseBodyParams).forEach(([key, value]) => {
  Object.entries(filteredParams).forEach(([key, value]) => {
  if (value !== undefined) {
  body.append(key, value);
  }
@@ -452,6 467,24 @@ export class LlamaParseReader extends FileReader {
  await fs.writeFile(imagePath, buffer);
  }
 
  // Returns a copy of `params` in which every key listed in `keysToCheck` is
  // dropped when its value is null, undefined, or an empty string. Keys that are
  // not listed are copied over unchanged, whatever their value.
  private filterSpecificParams(
    params: Record<string, any>,
    keysToCheck: string[],
  ): Record<string, any> {
    // A value counts as blank when it is null, undefined, or the empty string.
    const isBlank = (candidate: any): boolean =>
      candidate === null || candidate === undefined || candidate === "";

    const result: Record<string, any> = {};
    for (const [name, current] of Object.entries(params)) {
      // Only the listed keys are validated; skip them when they hold no value.
      if (keysToCheck.includes(name) && isBlank(current)) {
        continue;
      }
      result[name] = current;
    }
    return result;
  }
 
  static async getMimeType(
  data: Uint8Array,
  ): Promise<{ mime: string; extension: string }> {