Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Sleeping

App Files Files Community

Soumen committed on Sep 8, 2023

Commit

9d5dc1c

1 Parent(s): b6fbd57

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -11

app.py CHANGED Viewed

@@ -40,16 +40,26 @@ headers1 = {"Authorization": "Bearer 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"}
 API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
 headers2 = {"Authorization": "Bearer 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"}
-def read_pdf(file):
-#     images=pdf2image.convert_from_path(file)
-#     # print(type(images))
-    pdfReader = PdfFileReader(file)
-    count = pdfReader.numPages
-    all_page_text = " "
-    for i in range(count):
-        page = pdfReader.getPage(i)
-        all_page_text += page.extractText()+" "
-    return all_page_text
 def engsum(output):
     def query(payload):
@@ -90,7 +100,7 @@ def main():
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
-            tet = read_pdf(uploaded_photo)
             #tet = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
             text = tet[values[0]*7*10:values[1]*7*100] if values[0]!=len(tet)//(7*100) else tet[len(tet)//(7*100):]

 API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
 headers2 = {"Authorization": "Bearer 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"}
+# def read_pdf(file):
+# #     images=pdf2image.convert_from_path(file)
+# #     # print(type(images))
+#     pdfReader = PdfFileReader(file)
+#     count = pdfReader.numPages
+#     all_page_text = " "
+#     for i in range(count):
+#         page = pdfReader.getPage(i)
+#         all_page_text += page.extractText()+" "
+#     return all_page_text
+def read_pdf_with_pdfplumber(file):
+    """Return the text of all pages of a PDF, separated by single spaces.
+
+    Args:
+        file: Whatever ``pdfplumber.open`` accepts; the caller passes the
+            uploaded file object.
+
+    Returns:
+        The text of every page that produced any, joined with a space.
+        An empty string when no page yields text.
+    """
+    # Open the uploaded PDF file with pdfplumber
+    with pdfplumber.open(file) as pdf:
+        # extract_text() may return None (older pdfplumber releases) or ""
+        # for a page without extractable text; coerce so joining never
+        # fails with a TypeError.
+        page_texts = [page.extract_text() or "" for page in pdf.pages]
+    # Join with an explicit separator so the last word of one page is not
+    # fused with the first word of the next; skip pages with no text.
+    return " ".join(text for text in page_texts if text)
 def engsum(output):
     def query(payload):
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
+            tet = read_pdf_with_pdfplumber(uploaded_photo)
             #tet = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
             text = tet[values[0]*7*10:values[1]*7*100] if values[0]!=len(tet)//(7*100) else tet[len(tet)//(7*100):]