-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
97 lines (88 loc) · 3.21 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
from flask import Flask, request, render_template
import fitz # PyMuPDF
import numpy as np
import glob
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First sequence (for example a ``str``); must support ``len``
            and iteration, with elements comparable by ``==``.
        seq2: Second sequence, same requirements as ``seq1``.

    Returns:
        The edit distance as a float (whole-valued), matching the numeric
        type callers already divide and round.
    """
    # The distance is symmetric, so keep the shorter sequence along the row
    # to make the two working rows as small as possible.
    if len(seq1) < len(seq2):
        seq1, seq2 = seq2, seq1

    # Only the previous row of the dynamic-programming table is ever read,
    # so two rows replace the full (len1 + 1) x (len2 + 1) matrix.
    previous = list(range(len(seq2) + 1))
    for x, item1 in enumerate(seq1, start=1):
        current = [x]
        for y, item2 in enumerate(seq2, start=1):
            substitution_cost = 0 if item1 == item2 else 1
            current.append(
                min(
                    previous[y] + 1,  # deletion
                    current[y - 1] + 1,  # insertion
                    previous[y - 1] + substitution_cost,  # match / substitution
                )
            )
        previous = current

    return float(previous[-1])
# Directory that uploaded files are written to; also registered in the
# Flask config under the same key.
UPLOAD_FOLDER = 'uploads'

app = Flask(__name__)
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)
@app.route('/')
def index():
    """Render the ``index.html`` template for the root URL."""
    page = render_template('index.html')
    return page
@app.route('/upload', methods=['POST'])
def upload_files():
    """Save uploaded PDFs and report document pairs above a similarity threshold.

    Reads the uploads from the ``file`` form field and an integer threshold
    (compared against a 0-100 similarity score) from the ``num`` form field.
    Every upload whose name ends in ``.pdf`` is saved under
    ``app.config['UPLOAD_FOLDER']`` and its text is extracted with spaces
    removed. Each unordered pair of documents is then scored as
    ``100 - levenshtein(a, b) / max(len(a), len(b)) * 100`` (rounded to two
    decimals) and reported when the score is strictly above the threshold.

    Returns:
        The rendered ``index.html`` template with ``plagfiles`` set to a
        list containing one list of message fragments per flagged pair, or
        the single string ``"No documents are plagiarised"`` when no pair is
        flagged. A plain-text message is returned instead when the request
        has no ``file`` part, or (with status 400) when ``num`` is missing
        or not an integer.
    """
    if 'file' not in request.files:
        return 'No file part'

    plagarism = request.form.get('num', type=int)
    if plagarism is None:
        # Without this guard a missing/non-integer threshold only fails
        # later, as a TypeError in the ``plagarism < n`` comparison.
        return 'Invalid or missing plagiarism threshold', 400

    upload_root = os.path.abspath(app.config['UPLOAD_FOLDER'])
    # Trailing separator so that a sibling such as "uploads_evil" does not
    # pass the prefix check below.
    upload_prefix = os.path.join(upload_root, '')

    pdf_dict = {}
    for file in request.files.getlist('file'):
        filename = file.filename or ''
        if not filename.lower().endswith('.pdf'):
            continue
        # The filename is client-controlled: names containing ".." or an
        # absolute path could otherwise be written outside the upload folder.
        # Sub-directories inside the upload folder remain allowed.
        file_path = os.path.abspath(os.path.join(upload_root, filename))
        if not file_path.startswith(upload_prefix):
            continue
        # Create subdirectories if they do not exist
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        file.save(file_path)
        # Strip spaces once per document instead of once per comparison.
        pdf_dict[filename] = extract_text_from_pdf(file_path).replace(' ', '')

    documents = list(pdf_dict.items())
    plagfiles = []
    # The distance is symmetric, so each unordered pair is compared (and
    # reported) once rather than in both orders.
    for position, (key1, str1) in enumerate(documents):
        for key2, str2 in documents[position + 1:]:
            length = max(len(str1), len(str2))
            if length == 0:
                # Neither document has extractable text; nothing to compare
                # and dividing by the length would raise ZeroDivisionError.
                continue
            # Round after subtracting so values such as 9.989999999999995
            # are not shown to the user.
            n = round(100 - (levenshtein(str1, str2) / length) * 100, 2)
            if plagarism < n:
                plagfiles.append([
                    "For the files ",
                    str(key1),
                    " and ",
                    str(key2),
                    " has ",
                    str(n),
                    " ",
                    "% plagiarised",
                ])

    if not plagfiles:
        plagfiles.append("No documents are plagiarised")
    return render_template('index.html', plagfiles=plagfiles)
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string made of each page's ``get_text()`` output joined
        together; an empty string if the document has no pages.
    """
    # Open through the context manager so the document is closed even if
    # text extraction fails on one of the pages.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
if __name__ == '__main__':
    # Create the upload directory up front; exist_ok avoids the
    # check-then-create race and matches how sub-directories are created
    # when files are saved.
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    # NOTE(review): debug=True turns on Flask's debug mode (reloader and
    # interactive debugger); suitable for local development only.
    app.run(debug=True)