# Source: legilo/text_from_ebook.py at main · christianrosdahl/legilo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import ebooklib
import re
import unicodedata

from bs4 import BeautifulSoup
from ebooklib import epub
from pypdf import PdfReader
from PyQt5.QtWidgets import QFileDialog


def get_text_from_epub_or_pdf(parent_window):
    """Let the user pick an EPUB or PDF file and return its cleaned text.

    A file-open dialog is shown with ``parent_window`` as its parent.

    Returns ``None`` when the dialog yields no path or when the selected path
    does not end in ``.epub`` or ``.pdf`` (compared case-insensitively).
    Otherwise returns the extracted text after it has been passed through
    ``clean_text``; a falsy extraction result is returned unchanged.
    """
    file_path, _ = QFileDialog.getOpenFileName(
        parent_window, "Open EPUB or PDF File", ".", "EPUB and PDF Files (*.epub *.pdf)"
    )
    if not file_path:
        return None

    # Lower-case the extension so that e.g. "Book.EPUB" or "scan.PDF" is
    # accepted instead of being silently ignored.
    file_ending = file_path.rsplit(".", 1)[-1].lower()
    if file_ending == "epub":
        text = epub_to_text(file_path)
    elif file_ending == "pdf":
        text = pdf_to_text(file_path)
    else:
        return None

    if text:
        text = clean_text(text)
    return text


def epub_to_text(epub_path):
    """Return the text of all document items in the EPUB at ``epub_path``.

    Every item whose type is ``ebooklib.ITEM_DOCUMENT`` is parsed with
    BeautifulSoup's ``html.parser`` and reduced to its text, using a newline
    as separator. The per-item texts are concatenated, without any extra
    separator, in the order the book yields its items.
    """
    document_texts = [
        BeautifulSoup(entry.get_content(), "html.parser").get_text(separator="\n")
        for entry in epub.read_epub(epub_path).get_items()
        if entry.get_type() == ebooklib.ITEM_DOCUMENT
    ]
    return "".join(document_texts)


def pdf_to_text(pdf_path):
    """Return the text of the PDF at ``pdf_path``, page after page.

    Tab characters in each page's extracted text are replaced by a single
    space, and the resulting page texts are joined with a newline.
    """
    tab_free_pages = [
        re.sub(r"\t", " ", pdf_page.extract_text())
        for pdf_page in PdfReader(pdf_path).pages
    ]
    return "\n".join(tab_free_pages)


def clean_text(text):
    """Apply the module's clean-up steps to ``text`` and return the result.

    The steps run in a fixed order: ligature replacement, re-joining of words
    split across lines, and removal of superfluous newlines.
    """
    cleanup_steps = (
        replace_ligatures,
        merge_split_words,
        remove_superfluous_newlines,
    )
    for cleanup_step in cleanup_steps:
        text = cleanup_step(text)
    return text


def replace_ligatures(text):
    """Expand Latin ligatures in ``text`` into their component letters.

    Each character whose Unicode name contains "LATIN" is replaced by its
    compatibility normalization, so e.g. the single-character ligature
    U+FB01 becomes the two letters "fi". All other characters are kept as is.

    NFKC (compatibility decomposition followed by recomposition) is used
    rather than NFKD: NFKD would also split every precomposed accented letter
    into a base letter plus a combining mark (U+00E9 -> "e" + U+0301), which
    changes the code points of ordinary words that contain no ligature.
    """
    return "".join(
        (
            unicodedata.normalize("NFKC", char)
            # Characters without a Unicode name fall back to "" and are kept.
            if "LATIN" in unicodedata.name(char, "")
            else char
        )
        for char in text
    )


def merge_split_words(text):
    """Re-join words that were hyphenated across a line break.

    For every line ending in "-" that is followed by a line containing at
    least one word, the trailing hyphen is dropped, the first word of the
    following line is appended to the current line, and that word (with the
    whitespace around it) is removed from the start of the following line.
    Lines followed by an empty or whitespace-only line are left untouched.

    Returns the text with the adjusted lines joined by newlines again.
    """
    lines = text.split("\n")
    # Index loop on purpose: a shortened lines[i + 1] is itself examined in
    # the next iteration.
    for i in range(len(lines) - 1):
        if not lines[i].endswith("-"):
            continue
        # split(maxsplit=1) skips leading whitespace and yields the first word
        # plus the untouched remainder. This removes the word as an exact
        # prefix (str.lstrip(word) would strip a *set of characters*) and
        # yields [] for a whitespace-only line instead of failing on [0].
        parts = lines[i + 1].split(maxsplit=1)
        if not parts:
            continue
        lines[i] = lines[i][:-1] + parts[0]
        lines[i + 1] = parts[1] if len(parts) > 1 else ""
    return "\n".join(lines)


def remove_superfluous_newlines(text):
    """Return ``text`` with excess newlines removed.

    Runs of three or more newlines are collapsed to exactly two, after which
    any newlines at the very start and end of the text are stripped.
    """
    collapsed = re.sub(r"\n{3,}", "\n\n", text)
    return collapsed.strip("\n")