From 8ab6fee3c15f232548a02a8ae12979242ab37c82 Mon Sep 17 00:00:00 2001 From: Patrick vom Hagen Date: Fri, 19 Jan 2024 15:01:23 +0100 Subject: [PATCH] changes bei EGS Import --- main.py | 10 +-- src/compareL.py | 140 ++++++++++++++++++++++++++++++++ src/{compare.py => compareS.py} | 56 +++++++------ 3 files changed, 177 insertions(+), 29 deletions(-) create mode 100644 src/compareL.py rename src/{compare.py => compareS.py} (63%) diff --git a/main.py b/main.py index 02cf6a8..c43e983 100644 --- a/main.py +++ b/main.py @@ -4,13 +4,13 @@ if __name__ == "__main__": oldlist = 'Data/alte-liste-utf.csv' newlist = 'Data/neue-liste-utf.csv' - test = 'GGS/downloadS.csv' - new_test = 'GGS/ggsSold.csv' - clean = 'GGS/ggsSold2.cvs' + test = 'GPS/lehrer_HL0707113.csv_intern' + new_test = 'GPS/gpsLold.csv' + clean = 'GPS/gpsLold2.cvs' # Liste nach Fehler prüfen und Zeichen ersetzen - # check_file(test) - # format_csv(test, new_test) + check_file(test) + format_csv(test, new_test) clean_data(new_test, clean) # path_old_csv = input('Pfad zur alten Liste eingeben eingeben: ') diff --git a/src/compareL.py b/src/compareL.py new file mode 100644 index 0000000..d4fdac0 --- /dev/null +++ b/src/compareL.py @@ -0,0 +1,140 @@ +import csv +from Levenshtein import distance +import pandas as pd +import uuid + + +# TODO Filter für Spalten, ggfs. Klasse benötigt + +# TODO Filter für Dublikate hier wird dann die Klasse benötigt + + +def read_csv(file_path): + data = [] + with open(file_path, newline='', encoding='utf-8') as csvfile: + reader = csv.reader(csvfile, delimiter=';') + for row in reader: + data.append((row[0].strip(), row[1].strip())) + return data + + +def similar_sets(pair, data): + similar_pairs = [] + for item in data: + if distance(pair[0], item[0]) <= 1 and distance(pair[1], item[1]) <= 1: + similar_pairs.append(item) + return similar_pairs + + +def compare_csv(file1, file2): + data1 = read_csv(file1) + data2 = read_csv(file2) + + common_pairs = set(data1) & set(data2) + unique_pairs1 = set(data1) - common_pairs + unique_pairs2 = set(data2) - common_pairs + + return common_pairs, unique_pairs1, unique_pairs2, data1, data2 + + +def find_similar_pairs(pair, other_data): + similar_pairs = [] + for item in other_data: + if distance(pair[0], item[0]) <= 2 and distance(pair[1], item[1]) <= 2: + similar_pairs.append(item) + return similar_pairs + + +def create_uuid(): + return str(uuid.uuid4()) + + +def add_hl_tag(row): + klasse = str(row['klasse']).lstrip('0') + return 'HL0707104-' + klasse + + +def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs): + system_data = pd.read_csv(path, sep=';', encoding='utf-8') + system_data = system_data[~system_data[['name', 'vorname']].apply(tuple, axis=1).isin(old_pairs)] + # print(len(system_data)) + # print(system_data) + new_data = pd.read_csv(path_new, sep=';', encoding='utf-8') + + # Bei Schüler: alte Klassen gelöscht, mit neuen Klassen aus new-data auffüllen + + matches = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(new_pairs)] + # matches.loc[:, 'klasse'] = matches.apply(add_hl_tag, axis=1) + # print(len(matches)) + # print(matches) + system_data = pd.merge(system_data, matches, how='outer', left_on=['name', 'vorname'], right_on=['name', 'vorname']) + system_data = system_data[['name', 'vorname', 'klasse', 'schuelerid']] + system_data = system_data.drop('klasse', axis=1, errors='ignore') + print(system_data.columns) + print("Passende Einträge:" + str(len(system_data))) + + new_data = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(common_pairs)] + # new_data = new_data.drop('Unnamed: 2', axis=1, errors='ignore') + new_uuids = [] + for row in range(len(new_data)): + new_uuids.append(create_uuid()) + + new_data.insert(loc=2, column='schuelerid', value=new_uuids) + mailUserQuota = 2048 + oxUserQuota = 20480 + oxContext = 25 + + print(new_data.columns) + print("New Data:" + str(len(new_data))) + + # vor dem merge daten ergänzen + # import_df = pd.merge(system_data, new_data, how='outer', left_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'], right_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext']) + # import_df = pd.merge(system_data, new_data, how='outer', + # left_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', + # 'oxContext'], + # right_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', + # 'oxContext']) + import_df = pd.merge(system_data, new_data, how='outer', + left_on=['name', 'vorname', 'schuelerid'], + right_on=['name', 'vorname', 'schuelerid']) + import_df['mailUserQuota'] = mailUserQuota + import_df['oxUserQuota'] = oxUserQuota + import_df['oxContext'] = oxContext + import_df['klasse'] = None + import_df = import_df[['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', 'oxContext']] + + # pd.set_option('display.max_rows', None) + # pd.set_option('display.max_columns', None) + # print(import_df) + # pd.reset_option('display.max_rows') + # pd.reset_option('display.max_columns') + out_path = '../GPS/outputLehrer.csv' + import_df.to_csv(out_path, sep=';', index=False) + print(len(import_df)) + print('Lehrer Import Liste erzeugt') + print('Testuser manuell nachtragen!!') + + +def main(): + file1_path = '../GPS/gpsLnew.csv' + file2_path = '../GPS/gpsLold2.cvs' + file3_path = '../GPS/gpsLold.csv' + + common_pairs, new_pairs, old_pairs, data1, data2 = compare_csv(file1_path, file2_path) + + print(f"Anzahl der übereinstimmenden Paare: {len(common_pairs)}") + print(f"Anzahl der neuen Einträge: {len(new_pairs)}") + print(f"Anzahl der veralteten Einträge: {len(old_pairs)}") + + # Paare nur aus nicht zugeordneten Paaren aus neuer Liste erstellen + for pair in data1: + similar_pairs_list2 = find_similar_pairs(pair, set(data2) - {pair}) + + if similar_pairs_list2: + print(f"Ähnliche Paare in neuer Liste {pair} aktuell im System: {similar_pairs_list2}") + + create_import_list(file3_path, file1_path, old_pairs, new_pairs, common_pairs) + + +if __name__ == "__main__": + main() diff --git a/src/compare.py b/src/compareS.py similarity index 63% rename from src/compare.py rename to src/compareS.py index 3f9e6ad..89083c8 100644 --- a/src/compare.py +++ b/src/compareS.py @@ -55,12 +55,12 @@ def add_hl_tag(row): def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs): - df = pd.read_csv(path, sep=';', encoding='utf-8') - df = df[~df[['name', 'vorname']].apply(tuple, axis=1).isin(old_pairs)] - df = df.drop('username', axis=1, errors='ignore') - df = df.drop('klasse', axis=1, errors='ignore') - # print(len(df)) - # print(df) + system_data = pd.read_csv(path, sep=';', encoding='utf-8') + system_data = system_data[~system_data[['name', 'vorname']].apply(tuple, axis=1).isin(old_pairs)] + system_data = system_data.drop('username', axis=1, errors='ignore') + system_data = system_data.drop('klasse', axis=1, errors='ignore') + # print(len(system_data)) + # print(system_data) new_data = pd.read_csv(path_new, sep=';', encoding='utf-8') # Bei Schüler: alte Klassen gelöscht, mit neuen Klassen aus new-data auffüllen @@ -69,11 +69,10 @@ def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs): matches.loc[:, 'klasse'] = matches.apply(add_hl_tag, axis=1) # print(len(matches)) # print(matches) - df = pd.merge(df, matches, how='outer', left_on=['name', 'vorname'], right_on=['name', 'vorname']) - df = df[['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', 'oxContext']] - # print(df) - print(len(df)) - + system_data = pd.merge(system_data, matches, how='outer', left_on=['name', 'vorname'], right_on=['name', 'vorname']) + system_data = system_data[['name', 'vorname', 'klasse', 'schuelerid']] + # print(system_data) + print(len(system_data)) new_data = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(common_pairs)] # new_data = new_data.drop('Unnamed: 2', axis=1, errors='ignore') @@ -81,28 +80,36 @@ def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs): for row in range(len(new_data)): new_uuids.append(create_uuid()) # Klasse?? Unterschied zwischen Lehrer und Schüler - #new_data['klasse'] = None + # new_data['klasse'] = None new_data.loc[:, 'klasse'] = new_data.apply(add_hl_tag, axis=1) new_data.insert(loc=2, column='schuelerid', value=new_uuids) mailUserQuota = 1024 oxUserQuota = 5120 - oxContext = 16 - new_data['mailUserQuota'] = mailUserQuota - new_data['oxUserQuota'] = oxUserQuota - new_data['oxContext'] = oxContext + oxContext = 25 + # print(new_data) print(len(new_data)) # vor dem merge daten ergänzen - # import_df = pd.merge(df, new_data, how='outer', left_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'], right_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext']) - import_df = pd.merge(df, new_data, how='outer', left_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', 'oxContext'], right_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', 'oxContext']) + # import_df = pd.merge(system_data, new_data, how='outer', left_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'], right_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext']) + # import_df = pd.merge(system_data, new_data, how='outer', + # left_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', + # 'oxContext'], + # right_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', + # 'oxContext']) + import_df = pd.merge(system_data, new_data, how='outer', + left_on=['name', 'vorname', 'klasse', 'schuelerid'], + right_on=['name', 'vorname', 'klasse', 'schuelerid']) + import_df['mailUserQuota'] = mailUserQuota + import_df['oxUserQuota'] = oxUserQuota + import_df['oxContext'] = oxContext + # pd.set_option('display.max_rows', None) # pd.set_option('display.max_columns', None) # print(import_df) - # print(len(import_df)) # pd.reset_option('display.max_rows') # pd.reset_option('display.max_columns') - out_path = '../GGS/outputSchueler.csv' + out_path = '../GPS/outputSchueler.csv' import_df.to_csv(out_path, sep=';', index=False) print(len(import_df)) print('Schüler Import Liste erzeugt') @@ -110,9 +117,9 @@ def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs): def main(): - file1_path = '../GGS/ggsSnew.csv' # Pfad zur ersten CSV-Datei - file2_path = '../GGS/ggsSold2.cvs' # Pfad zur zweiten CSV-Datei - file3_path = '../GGS/downloadS.csv' + file1_path = '../GPS/gpsSnew.csv' + file2_path = '../GPS/gpsSold2.cvs' + file3_path = '../GPS/gpsSold.csv' common_pairs, new_pairs, old_pairs, data1, data2 = compare_csv(file1_path, file2_path) @@ -120,11 +127,12 @@ def main(): print(f"Anzahl der neuen Einträge: {len(new_pairs)}") print(f"Anzahl der veralteten Einträge: {len(old_pairs)}") + # Paare nur aus nicht zugeordneten Paaren aus neuer Liste erstellen # for pair in data1: # similar_pairs_list2 = find_similar_pairs(pair, set(data2) - {pair}) # # if similar_pairs_list2: - # print(f"Ähnliche Paare zu {pair} in Liste 2: {similar_pairs_list2}") + # print(f"Ähnliche Paare in neuer Liste {pair} aktuell im System: {similar_pairs_list2}") create_import_list(file3_path, file1_path, old_pairs, new_pairs, common_pairs)