import numpy as np
import pandas as pd

try:
    from Levenshtein import distance
except ImportError:
    # Fallback so the module stays importable when the optional C extension
    # is missing; results match the unit-cost Levenshtein distance.
    def distance(s1, s2):
        """Return the Levenshtein edit distance between two strings."""
        if s1 == s2:
            return 0
        if len(s1) < len(s2):
            s1, s2 = s2, s1
        previous = list(range(len(s2) + 1))
        for i, ch1 in enumerate(s1, start=1):
            current = [i]
            for j, ch2 in enumerate(s2, start=1):
                current.append(min(
                    previous[j] + 1,                 # deletion
                    current[j - 1] + 1,              # insertion
                    previous[j - 1] + (ch1 != ch2),  # substitution
                ))
            previous = current
        return previous[-1]


def compare_data(new: pd.DataFrame, sys: pd.DataFrame, count_test, path, school):
    """Compare an import list against the system list by name and first name.

    Prints the sizes of both lists, merges them on ``name``/``vorname``,
    writes the result sheets via ``print_status`` and reports likely typos
    via ``search_typos``.

    Args:
        new: Import list; must contain ``name`` and ``vorname``. If it also
            has a ``klasse`` column, the unique classes are logged and the
            column is carried into the comparison.
        sys: System list; must contain ``name`` and ``vorname``. (The name
            shadows the stdlib module inside this function; it is kept for
            interface compatibility.)
        count_test: Number passed through to ``print_status`` to correct the
            printed counts.
        path: Target of the Excel report written by ``print_status``.
        school: Sub-directory name used by ``unique_classes`` for its log.

    Returns:
        ``(matches, only_new)``: rows present in both lists, and rows present
        only in the import list.
    """
    print(f"\nEinträge in System Liste: {len(sys)}")
    print(f"Einträge in Import Liste: {len(new)}")

    key_columns = ['name', 'vorname']
    import_columns = list(key_columns)

    if 'klasse' in new.columns:
        if 'index' in new.columns:
            new = new.drop('index', axis=1)
        # unique_classes splits 'klasse' in place, so the frames used below
        # (and returned) carry list-valued class entries.
        unique_classes(new, school)
        import_columns.append('klasse')
        # A system list without a class column is tolerated instead of
        # raising KeyError.
        if 'klasse' in sys.columns:
            unique_classes(sys, school)
            sys = sys.drop(columns=['klasse'])

    merged_df = pd.merge(new[import_columns], sys, on=key_columns,
                         how='outer', indicator=True)
    matches = pd.merge(new, sys, on=key_columns)

    # Build subsets for rows that are present in only one of the DataFrames.
    only_new = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
    only_sys = merged_df[merged_df['_merge'] == 'right_only'].drop(columns=['_merge'])

    print_status(matches, only_new, only_sys, count_test, path)
    search_typos(only_new[key_columns], only_sys[key_columns])

    return matches, only_new


def search_typos(new: pd.DataFrame, sys: pd.DataFrame, max_distance=2):
    """Print pairs of entries whose names differ only slightly.

    A pair is reported when both the first name and the last name are within
    ``max_distance`` Levenshtein edits of each other.

    Args:
        new: Import-side rows with ``vorname`` and ``name`` columns.
        sys: System-side rows with ``vorname`` and ``name`` columns.
        max_distance: Largest edit distance (per name part) still treated as
            a possible typo. Defaults to 2.
    """
    def _name_pairs(frame):
        # Rows with a missing (non-string) name part cannot be compared and
        # would make distance() raise, so they are skipped.
        return [
            (first, last)
            for first, last in zip(frame['vorname'], frame['name'])
            if isinstance(first, str) and isinstance(last, str)
        ]

    # Materialise the system side once instead of once per import row.
    system_names = _name_pairs(sys)

    typos = []
    for first_new, last_new in _name_pairs(new):
        for first_sys, last_sys in system_names:
            if (distance(first_new, first_sys) <= max_distance
                    and distance(last_new, last_sys) <= max_distance):
                typos.append(([first_new, last_new], [first_sys, last_sys]))

    typo_df = pd.DataFrame(typos, columns=['Import', 'System'])

    if not typo_df.empty:
        print('Mögliche Tippfehler:', len(typo_df), '\n', typo_df)
    else:
        print('Mögliche Tippfehler: keine Fehler gefunden!')


def unique_classes(df: pd.DataFrame, school):
    """Append the distinct class names of ``df`` to the school's class log.

    The comma-separated ``klasse`` column is split into lists *in place* on
    the passed frame; the distinct values are then appended as one line to
    ``./Data/<school>/gen_klassen.txt``.

    NOTE(review): the in-place split is kept because ``compare_data`` builds
    its results from the modified frame. Calling this twice on the same frame
    turns the column into NaN, since the entries are no longer strings.

    Args:
        df: Frame with a string-valued ``klasse`` column.
        school: Directory name below ``./Data``; the directory must exist.
    """
    df['klasse'] = df['klasse'].str.split(',')
    exploded = df.explode('klasse')
    eindeutige_klassen = exploded['klasse'].unique()

    # Append mode accumulates one record per call; the trailing newline keeps
    # consecutive records from running together.
    with open(f'./Data/{school}/gen_klassen.txt', 'a', encoding='utf-8') as file:
        file.write(np.array_str(eindeutige_klassen) + '\n')


def print_status(matches: pd.DataFrame, new: pd.DataFrame, old: pd.DataFrame,
                 count_test, path):
    """Print the comparison counts and write the three result sheets.

    Args:
        matches: Rows found in both lists (sheet ``Matches``).
        new: Rows found only in the import list (sheet ``Neu``).
        old: Rows found only in the system list (sheet ``Alt``).
        count_test: Added to the printed match count and subtracted from the
            printed outdated count. Presumably the number of test accounts
            that exist only in the system list; confirm with the caller.
        path: Target passed to ``pd.ExcelWriter`` (openpyxl engine).
    """
    print("\nAnzahl neuer Nutzer:", len(new))
    print("Anzahl Übereinstimmungen:", len(matches) + count_test)
    print("Anzahl veralteter Nutzer:", len(old) - count_test)

    with pd.ExcelWriter(path, engine='openpyxl') as writer:
        matches.to_excel(writer, sheet_name='Matches', index=False)
        new.to_excel(writer, sheet_name='Neu', index=False)
        old.to_excel(writer, sheet_name='Alt', index=False)