Files
UCS_Import_Python/src/step2.py
2024-08-27 13:04:25 +02:00

68 lines
2.6 KiB
Python

import numpy as np
import pandas as pd
from Levenshtein import distance
def compare_data(new, sys, count_test, path, school):
    """Compare the import list with the system list by surname and first name.

    Prints the size of both lists, splits the rows into matches, import-only
    rows and system-only rows, passes those three sets to ``print_status``
    and runs ``search_typos`` on the unmatched names.

    Args:
        new: DataFrame holding the import list. Must contain the columns
            ``name`` and ``vorname``; a ``klasse`` column is optional.
        sys: DataFrame holding the system list. Must contain ``name`` and
            ``vorname``, and also ``klasse`` whenever ``new`` has one.
        count_test: Passed through to ``print_status``.
        path: Passed through to ``print_status``.
        school: Passed through to ``unique_classes``.

    Returns:
        Tuple ``(matches, only_new)``: the inner join of both lists on
        ``name``/``vorname``, and the rows of the outer join that occur only
        in the import list.
    """
    print(f"\nEinträge in System Liste: {len(sys)}")
    print(f"Einträge in Import Liste: {len(new)}")

    key_cols = ['name', 'vorname']
    import_cols = list(key_cols)

    if 'klasse' in new.columns:
        if 'index' in new.columns:
            new = new.drop('index', axis=1)
        unique_classes(new, school)
        unique_classes(sys, school)
        # The system list's class column takes no part in the merges below;
        # only the import list's classes are carried along.
        sys = sys.drop(columns=['klasse'])
        import_cols.append('klasse')

    merged_df = pd.merge(new[import_cols], sys, on=key_cols, how='outer', indicator=True)
    matches = pd.merge(new, sys, on=key_cols)

    # Build the subsets of rows that are present in only one of the frames;
    # the merge indicator records which side each row came from.
    origin = merged_df['_merge']
    only_new = merged_df.loc[origin == 'left_only'].drop(columns=['_merge'])
    only_sys = merged_df.loc[origin == 'right_only'].drop(columns=['_merge'])

    print_status(matches, only_new, only_sys, count_test, path)
    search_typos(only_new[['name', 'vorname']], only_sys[['name', 'vorname']])
    return matches, only_new
def search_typos(new, sys):
    """Print import/system name pairs that look like typos of each other.

    Every import entry is compared with every system entry. A pair is
    reported when the first names are within a Levenshtein distance of 2 and
    the surnames are within a Levenshtein distance of 2 as well. The result
    is only printed; nothing is returned.

    Args:
        new: DataFrame with the columns ``name`` and ``vorname`` (import side).
        sys: DataFrame with the columns ``name`` and ``vorname`` (system side).
    """
    col1 = 'vorname'
    col2 = 'name'

    # Extract the names once as plain tuples. Iterating the system frame with
    # iterrows() inside the outer loop would rebuild a Series for every
    # system row once per import row.
    import_names = list(zip(new[col1], new[col2]))
    system_names = list(zip(sys[col1], sys[col2]))

    typos = []
    for first_new, last_new in import_names:
        for first_sys, last_sys in system_names:
            if distance(first_new, first_sys) <= 2 and distance(last_new, last_sys) <= 2:
                typos.append(([first_new, last_new], [first_sys, last_sys]))

    typo_df = pd.DataFrame(typos, columns=['Import', 'System'])
    if not typo_df.empty:
        print('Mögliche Tippfehler:', len(typo_df), '\n', typo_df)
    else:
        print('Mögliche Tippfehler: keine Fehler gefunden!')
def unique_classes(df, school):
    """Append the distinct class names found in ``df`` to the school's class file.

    The comma-separated ``klasse`` strings are split into lists and the split
    column is stored back on ``df`` itself, so the caller's frame holds lists
    afterwards. The distinct class names are then appended as one entry to
    ``./Data/<school>/gen_klassen.txt``. The directory must already exist.

    Args:
        df: DataFrame with a string column ``klasse``.
        school: Name of the sub-directory below ``./Data``.
    """
    # Assigned on the passed frame: callers see the split lists afterwards.
    df['klasse'] = df['klasse'].str.split(',')
    eindeutige_klassen = df['klasse'].explode().unique()

    # Explicit encoding keeps the file independent of the platform default.
    # The trailing newline keeps successive appends from running together.
    with open(f'./Data/{school}/gen_klassen.txt', 'a', encoding='utf-8') as file:
        file.write(np.array_str(eindeutige_klassen) + '\n')
def print_status(matches, new, old, count_test, path):
    """Print the comparison counts and save the three result sets to Excel.

    Args:
        matches: DataFrame written to the sheet ``Matches``.
        new: DataFrame written to the sheet ``Neu``.
        old: DataFrame written to the sheet ``Alt``.
        count_test: Number added to the printed match count and subtracted
            from the printed count of outdated users.
        path: Target of the Excel workbook, written with the openpyxl engine.
    """
    print("\nAnzahl neuer Nutzer:", len(new))
    print("Anzahl Übereinstimmungen:", len(matches) + count_test)
    print("Anzahl veralteter Nutzer:", len(old) - count_test)

    # One sheet per result set, written in this order and without the index.
    sheets = (
        ('Matches', matches),
        ('Neu', new),
        ('Alt', old),
    )
    with pd.ExcelWriter(path, engine='openpyxl') as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)