Source code for resumeanalyser.text_reading

# Imports
import os
from docx import Document
from pypdf import PdfReader

def docx_to_text(filepath):
    """
    Basic function to extract text from a Word document, given a specified file path.

    The text of every entry in the document's ``paragraphs`` collection is
    joined into one string, separated by single spaces.

    Parameters:
        filepath (str): A string containing the filepath. It must end with
            '.docx' (the check is case-insensitive). The path is handed to
            ``open()`` as given, so a leading '~' is not expanded.

    Returns:
        text (str): A string containing the extracted text. If the file cannot
            be opened or parsed, 'File reading error.' is printed and None is
            returned instead.

    Raises:
        ValueError: If filepath does not end with '.docx'.

    Example:
        >>> docx_to_text('alphabet.docx')
        'abcdefghijklmnopqrstuvwxyz'
    """
    # Check that file path ends with docx
    if not filepath.lower().endswith('.docx'):
        raise ValueError("Please provide a .docx file. Consider using pdf_to_text if you are using a PDF.")

    try:
        with open(filepath, 'rb') as f:
            # Parse from the handle we already opened instead of reopening the
            # path; Document accepts either a path or a file-like object.
            document = Document(f)
            return ' '.join(paragraph.text for paragraph in document.paragraphs)
    except Exception:
        # Best-effort behaviour kept from the original: report and return None
        # rather than propagating the error to the caller.
        print('File reading error.')
        return None
def pdf_to_text(filepath):
    """
    Basic function to extract text from a PDF file, given a specified file path.

    The text extracted from each page is joined into one string, separated by
    single spaces, and leading/trailing whitespace is stripped.

    Parameters:
        filepath (str): A string containing the filepath. It must end with
            '.pdf' (the check is case-insensitive). The path is handed to
            ``open()`` as given, so a leading '~' is not expanded.

    Returns:
        text (str): A string containing the extracted text. If the file cannot
            be opened or parsed, 'File reading error.' is printed and None is
            returned instead.

    Raises:
        ValueError: If filepath does not end with '.pdf'.

    Example:
        >>> pdf_to_text('alphabet.pdf')
        'abcdefghijklmnopqrstuvwxyz'
    """
    # Check that file path ends with pdf
    if not filepath.lower().endswith('.pdf'):
        raise ValueError("Please provide a .pdf file. Consider using docx_to_text if you are using a docx document.")

    try:
        with open(filepath, 'rb') as f:
            reader = PdfReader(f)
            # Extraction must happen while the file is still open. Joining the
            # page texts avoids rebuilding the string on every page; after
            # strip() the result matches appending "<page text> " per page.
            text = " ".join(page.extract_text() for page in reader.pages)
        return text.strip()
    except Exception:
        # Best-effort behaviour kept from the original: report and return None
        # rather than propagating the error to the caller.
        print('File reading error.')
        return None
# Website text extraction has not been implemented yet
# def website_to_text(url):
#     """
#     Basic function to extract text from a website, given a URL.
#     Parameters:
#         url (str): A string containing the URL.
#     Returns:
#         text (str): A string containing the extracted text.
#     Example:
#         >>> website_to_text('www.alphabet.com')
#         'abcdefghijklmnopqrstuvwxyz'
#     """
#     return text