68 lines
2.6 KiB
Python
68 lines
2.6 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
from Levenshtein import distance
|
|
|
|
|
|
def compare_data(new, sys, count_test, path, school):
    """Compare the import list with the system list on name and first name.

    Prints the size of both lists, then splits the rows into matches, rows
    found only in ``new`` and rows found only in ``sys``. The three frames
    are handed to ``print_status`` and the two unmatched sets are checked
    by ``search_typos``.

    When ``new`` has a ``klasse`` column, an ``index`` column in ``new`` is
    dropped if present, ``unique_classes`` is called for both frames and
    ``klasse`` is removed from ``sys`` before merging, so the class
    information in the results comes from ``new`` only.

    Args:
        new: DataFrame with the import list; needs the columns ``name`` and
            ``vorname``, ``klasse`` is optional.
        sys: DataFrame with the system list; needs the columns ``name`` and
            ``vorname`` (and ``klasse`` whenever ``new`` has it).
        count_test: Passed through unchanged to ``print_status``.
        path: Passed through unchanged to ``print_status``.
        school: Passed through unchanged to ``unique_classes``.

    Returns:
        Tuple ``(matches, only_new)``: the inner merge of both frames and
        the rows that exist only in ``new``.
    """
    print(f"\nEinträge in System Liste: {len(sys)}")
    print(f"Einträge in Import Liste: {len(new)}")

    key_cols = ['name', 'vorname']
    has_classes = 'klasse' in new.columns

    if has_classes:
        if 'index' in new.columns:
            new = new.drop('index', axis=1)
        for frame in (new, sys):
            unique_classes(frame, school)
        sys = sys.drop(columns=['klasse'])
        left_cols = key_cols + ['klasse']
    else:
        left_cols = key_cols

    outer = pd.merge(
        new[left_cols], sys, on=key_cols, how='outer', indicator=True
    )
    matches = pd.merge(new, sys, on=key_cols)

    # Keep the rows that occur in exactly one of the two DataFrames.
    origin = outer['_merge']
    only_new = outer.loc[origin == 'left_only'].drop(columns=['_merge'])
    only_sys = outer.loc[origin == 'right_only'].drop(columns=['_merge'])

    print_status(matches, only_new, only_sys, count_test, path)
    search_typos(only_new[key_cols], only_sys[key_cols])

    return matches, only_new
|
|
|
|
|
|
def search_typos(new, sys):
    """Print name pairs from both lists that look like typos of each other.

    Every row of ``new`` is compared with every row of ``sys``. A pair is
    reported when the Levenshtein distance of the first names is at most 2
    and the Levenshtein distance of the last names is at most 2.

    Args:
        new: DataFrame with the columns ``vorname`` and ``name`` (import
            side).
        sys: DataFrame with the columns ``vorname`` and ``name`` (system
            side).

    Returns:
        None. The result is printed: the number of candidate pairs followed
        by a table with the columns ``Import`` and ``System``, or a message
        that nothing was found.
    """
    first, last = 'vorname', 'name'

    # Pull the name pairs out of the frames once; iterating plain tuples
    # avoids building a Series per row for every outer iteration, which is
    # what nested ``iterrows`` calls do.
    import_names = list(zip(new[first], new[last]))
    system_names = list(zip(sys[first], sys[last]))

    candidates = [
        ([imp_first, imp_last], [sys_first, sys_last])
        for imp_first, imp_last in import_names
        for sys_first, sys_last in system_names
        if distance(imp_first, sys_first) <= 2
        and distance(imp_last, sys_last) <= 2
    ]

    report = pd.DataFrame(candidates, columns=['Import', 'System'])
    if report.empty:
        print('Mögliche Tippfehler: keine Fehler gefunden!')
    else:
        print('Mögliche Tippfehler:', len(report), '\n', report)
|
|
|
|
|
|
def unique_classes(df, school):
    """Append the distinct class names found in ``df`` to the class file.

    The comma-separated ``klasse`` strings are split into lists, the lists
    are flattened, and the distinct values are appended as one record to
    ``./Data/<school>/gen_klassen.txt``.

    Side effect: the ``klasse`` column of the passed-in frame is overwritten
    with the split lists.

    Args:
        df: DataFrame with a ``klasse`` column of comma-separated strings.
        school: Name of the sub-directory below ``./Data`` that holds the
            output file.

    Returns:
        None.
    """
    df['klasse'] = df['klasse'].str.split(',')
    classes = df.explode('klasse')['klasse'].unique()

    # The file is opened in append mode, so each record is terminated with
    # a newline; otherwise successive calls run together on one line.
    # The explicit encoding keeps the output independent of the platform
    # default.
    with open(f'./Data/{school}/gen_klassen.txt', 'a', encoding='utf-8') as file:
        file.write(np.array_str(classes) + '\n')
|
|
|
|
|
|
def print_status(matches, new, old, count_test, path):
    """Print the comparison counts and export the result sets to Excel.

    Args:
        matches: DataFrame of rows found in both lists; written to the
            sheet ``Matches``.
        new: DataFrame of rows found only in the import list; written to
            the sheet ``Neu``.
        old: DataFrame of rows found only in the system list; written to
            the sheet ``Alt``.
        count_test: Number that is added to the printed match count and
            subtracted from the printed count of outdated users.
        path: Target handed to ``pd.ExcelWriter`` (openpyxl engine).

    Returns:
        None.
    """
    print("\nAnzahl neuer Nutzer:", len(new))
    print("Anzahl Übereinstimmungen:", len(matches) + count_test)
    print("Anzahl veralteter Nutzer:", len(old) - count_test)

    # Sheet name -> frame, in the order the sheets are written.
    sheets = {'Matches': matches, 'Neu': new, 'Alt': old}
    with pd.ExcelWriter(path, engine='openpyxl') as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name, index=False)
|