merterbak committed on
Commit
fab12fa
·
verified ·
1 Parent(s): 0d625e0

add pdf support

Browse files
Files changed (1) hide show
  1. app.py +41 -31
app.py CHANGED
@@ -171,36 +171,25 @@ def process_image(image, mode, task, custom_prompt):
171
 
172
  return cleaned, markdown, result, img_out, crops
173
 
174
- @spaces.GPU(duration=300)
175
- def process_pdf(path, mode, task, custom_prompt):
176
  doc = fitz.open(path)
177
- texts, markdowns, raws, all_crops = [], [], [], []
178
-
179
- for i in range(len(doc)):
180
- page = doc.load_page(i)
181
- pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
182
- img = Image.open(BytesIO(pix.tobytes("png")))
183
-
184
- text, md, raw, _, crops = process_image(img, mode, task, custom_prompt)
185
-
186
- if text and text != "No text":
187
- texts.append(f"### Page {i + 1}\n\n{text}")
188
- markdowns.append(f"### Page {i + 1}\n\n{md}")
189
- raws.append(f"=== Page {i + 1} ===\n{raw}")
190
- all_crops.extend(crops)
191
-
192
  doc.close()
193
 
194
- return ("\n\n---\n\n".join(texts) if texts else "No text in PDF",
195
- "\n\n---\n\n".join(markdowns) if markdowns else "No text in PDF",
196
- "\n\n".join(raws), None, all_crops)
197
 
198
- def process_file(path, mode, task, custom_prompt):
199
  if not path:
200
  return "Error Upload file", "", "", None, []
201
-
202
  if path.lower().endswith('.pdf'):
203
- return process_pdf(path, mode, task, custom_prompt)
204
  else:
205
  return process_image(Image.open(path), mode, task, custom_prompt)
206
 
@@ -211,12 +200,21 @@ def toggle_prompt(task):
211
  return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
212
  return gr.update(visible=False)
213
 
214
- def load_image(file_path):
 
 
 
 
 
 
 
 
215
  if not file_path:
216
  return None
217
  if file_path.lower().endswith('.pdf'):
218
  doc = fitz.open(file_path)
219
- page = doc.load_page(0)
 
220
  pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
221
  img = Image.open(BytesIO(pix.tobytes("png")))
222
  doc.close()
@@ -224,10 +222,19 @@ def load_image(file_path):
224
  else:
225
  return Image.open(file_path)
226
 
 
 
 
 
 
 
 
 
 
227
  with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
228
  gr.Markdown("""
229
  # 🚀 DeepSeek-OCR Demo
230
- **Convert documents to markdown, extract raw text, and locate specific content with bounding boxes. It takes 20~ sec for markdown and 3~ sec for locate task. Check the info at the bottom of the page for more information.**
231
 
232
  **Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
233
  """)
@@ -236,6 +243,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
236
  with gr.Column(scale=1):
237
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
238
  input_img = gr.Image(label="Input Image", type="pil", height=300)
 
239
  mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
240
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
241
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
@@ -280,17 +288,19 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
280
  - **Custom**: Your own prompt (add `<|grounding|>` for boxes)
281
  """)
282
 
283
- file_in.change(load_image, [file_in], [input_img])
 
 
284
  task.change(toggle_prompt, [task], [prompt])
285
 
286
- def run(image, file_path, mode, task, custom_prompt):
 
 
287
  if image is not None:
288
  return process_image(image, mode, task, custom_prompt)
289
- if file_path:
290
- return process_file(file_path, mode, task, custom_prompt)
291
  return "Error uploading file or image", "", "", None, []
292
 
293
- btn.click(run, [input_img, file_in, mode, task, prompt],
294
  [text_out, md_out, raw_out, img_out, gallery])
295
 
296
  if __name__ == "__main__":
 
171
 
172
  return cleaned, markdown, result, img_out, crops
173
 
174
+ @spaces.GPU(duration=60)
175
+ def process_pdf(path, mode, task, custom_prompt, page_num):
176
  doc = fitz.open(path)
177
+ total_pages = len(doc)
178
+ if page_num < 1 or page_num > total_pages:
179
+ doc.close()
180
+ return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
181
+ page = doc.load_page(page_num - 1)
182
+ pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
183
+ img = Image.open(BytesIO(pix.tobytes("png")))
 
 
 
 
 
 
 
 
184
  doc.close()
185
 
186
+ return process_image(img, mode, task, custom_prompt)
 
 
187
 
188
+ def process_file(path, mode, task, custom_prompt, page_num):
189
  if not path:
190
  return "Error Upload file", "", "", None, []
 
191
  if path.lower().endswith('.pdf'):
192
+ return process_pdf(path, mode, task, custom_prompt, page_num)
193
  else:
194
  return process_image(Image.open(path), mode, task, custom_prompt)
195
 
 
200
  return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
201
  return gr.update(visible=False)
202
 
203
+ def get_pdf_page_count(file_path):
204
+ if not file_path or not file_path.lower().endswith('.pdf'):
205
+ return 1
206
+ doc = fitz.open(file_path)
207
+ count = len(doc)
208
+ doc.close()
209
+ return count
210
+
211
+ def load_image(file_path, page_num=1):
212
  if not file_path:
213
  return None
214
  if file_path.lower().endswith('.pdf'):
215
  doc = fitz.open(file_path)
216
+ page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
217
+ page = doc.load_page(page_idx)
218
  pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
219
  img = Image.open(BytesIO(pix.tobytes("png")))
220
  doc.close()
 
222
  else:
223
  return Image.open(file_path)
224
 
225
+ def update_page_selector(file_path):
226
+ if not file_path:
227
+ return gr.update(visible=False)
228
+ if file_path.lower().endswith('.pdf'):
229
+ page_count = get_pdf_page_count(file_path)
230
+ return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
231
+ label=f"Select Page (1-{page_count})")
232
+ return gr.update(visible=False)
233
+
234
  with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
235
  gr.Markdown("""
236
  # 🚀 DeepSeek-OCR Demo
237
+ **Convert documents to markdown, extract raw text, and locate specific content with bounding boxes. It takes 20~ sec for markdown and 3~ sec for locate task examples. Check the info at the bottom of the page for more information.**
238
 
239
  **Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
240
  """)
 
243
  with gr.Column(scale=1):
244
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
245
  input_img = gr.Image(label="Input Image", type="pil", height=300)
246
+ page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
247
  mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
248
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
249
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
 
288
  - **Custom**: Your own prompt (add `<|grounding|>` for boxes)
289
  """)
290
 
291
+ file_in.change(load_image, [file_in, page_selector], [input_img])
292
+ file_in.change(update_page_selector, [file_in], [page_selector])
293
+ page_selector.change(load_image, [file_in, page_selector], [input_img])
294
  task.change(toggle_prompt, [task], [prompt])
295
 
296
+ def run(image, file_path, mode, task, custom_prompt, page_num):
297
+ if file_path:
298
+ return process_file(file_path, mode, task, custom_prompt, int(page_num))
299
  if image is not None:
300
  return process_image(image, mode, task, custom_prompt)
 
 
301
  return "Error uploading file or image", "", "", None, []
302
 
303
+ btn.click(run, [input_img, file_in, mode, task, prompt, page_selector],
304
  [text_out, md_out, raw_out, img_out, gallery])
305
 
306
  if __name__ == "__main__":