Membuat Deteksi Plagiasi Dengan Python

Membuat Deteksi Plagiasi Dengan Python

ON NOVEMBER 22, 2020 / IN Guide, Hacker, Innovation, Technology

Dataset

Kodingan

Crawl Dokumen Dengan Glob

import glob

# Collect the path of every plain-text document in the dataset folder.
filedok = glob.glob('datasets/*.txt')

# Read each document in full. The context manager closes every file handle
# as soon as its content has been read instead of leaving it open.
text_file = []
for path in filedok:
    with open(path) as handle:
        text_file.append(handle.read())

# Show which files were found, a separator, then the first document's text.
print(filedok)
print('='*48)
print(text_file[0])
['datasets/siswa3.txt', 'datasets/siswa2.txt', 'datasets/siswa1.txt']
================================================
The dimension of the smartphone is 164.9 x 75.1 x 8.5 mm and it weighs 188 grams. It is powered by Qualcomm SM4250 Snapdragon 460 processor and comes in 6.52 inches IPS LCD, which is protected by Corning Gorilla Glass 3.
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF to weight the terms of every document.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the documents and weight them in one pass,
# then convert the result to a dense array with one row per document.
tfidf_matrix = vectorizer.fit_transform(text_file)
vec = tfidf_matrix.toarray()
['164', '188', '460', '52', '75', 'ago', 'an', 'and', 'another', 'birds', 'british', 'by', 'comes', 'corning', 'creation', 'creatures', 'dimension', 'farmer', 'fishes', 'for', 'from', 'glass', 'gorilla', 'goverment', 'government', 'grams', 'happy', 'here', 'history', 'home', 'hundred', 'immaculate', 'in', 'inches', 'innumerable', 'ips', 'is', 'islands', 'it', 'lay', 'lcd', 'lions', 'lived', 'long', 'millions', 'mm', 'more', 'no', 'of', 'other', 'powered', 'predestined', 'processor', 'protected', 'qualcomm', 'quite', 'republican', 'resort', 'sea', 'sm4250', 'smartphone', 'snapdragon', 'sort', 'store', 'the', 'there', 'these', 'times', 'to', 'up', 'was', 'wealth', 'weighs', 'were', 'when', 'which', 'world', 'written']
[0.         0.         0.         0.         0.         0.09769707
0.09769707 0.15174098 0.09769707 0.09769707 0.09769707 0.
0. 0. 0.09769707 0.19539414 0. 0.09769707
0.09769707 0.19539414 0.09769707 0. 0. 0.12845991
0. 0. 0.09769707 0.09769707 0.09769707 0.09769707
0.09769707 0.09769707 0. 0. 0.09769707 0.
0. 0.09769707 0. 0.09769707 0. 0.09769707
0.09769707 0.09769707 0.19539414 0. 0.09769707 0.09769707
0.53109344 0.09769707 0. 0.09769707 0. 0.
0. 0.09769707 0. 0.09769707 0.09769707 0.
0. 0. 0.09769707 0.19539414 0.37935246 0.09769707
0.09769707 0.09769707 0.09769707 0.09769707 0.09769707 0.09769707
0. 0.09769707 0.09769707 0. 0.09769707 0.09769707]
from itertools import combinations

from sklearn.metrics.pairwise import cosine_similarity

# Pair every file name with its TF-IDF row so the two travel together.
vec_list = list(zip(filedok, vec))

# Collected results as (file_a, file_b, "NN.N%") tuples.
plag = set()

# combinations() yields each unordered pair of documents exactly once, so no
# document is compared with itself and no pair is computed twice. It also
# avoids looking up tuples that contain arrays with list.index().
for (siswa, text_vector), (siswa_a, text_vector_a) in combinations(vec_list, 2):
    # The call returns a 2x2 similarity matrix for the two rows; entry [0][1]
    # is the similarity between the first and the second document.
    sim = cosine_similarity([text_vector, text_vector_a])[0][1]
    # Sort the names so a pair is always reported in the same order.
    student_pair = sorted((siswa, siswa_a))
    score = (student_pair[0], student_pair[1], f"{sim * 100:.1f}%")
    plag.add(score)

# Sets have no defined order; sort so the report is stable between runs.
for x in sorted(plag):
    print(x)
('siswa1.txt', 'siswa2.txt', '97.5%')
('siswa1.txt', 'siswa3.txt', '13.9%')
('siswa2.txt', 'siswa3.txt', '14.0%')

Referensi