"""Compare the old and the new GPS student list and build a student import CSV.

Students are matched on their (name, vorname) pair: entries that only exist in
the new list get a freshly generated UUID as schuelerid, existing entries keep
their schuelerid and are assigned their new class.
"""
import csv
import uuid

import pandas as pd
from Levenshtein import distance

# TODO: filter for columns; a class may be needed for this
# TODO: filter for duplicates; the class will then be needed here


def read_csv(file_path):
    """Read a semicolon-separated CSV and return a list of (name, vorname) tuples."""
    data = []
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            data.append((row[0].strip(), row[1].strip()))
    return data


def similar_sets(pair, data):
    """Return all pairs in data whose name and vorname each differ from pair by at most one edit."""
    similar_pairs = []
    for item in data:
        if distance(pair[0], item[0]) <= 1 and distance(pair[1], item[1]) <= 1:
            similar_pairs.append(item)
    return similar_pairs


def compare_csv(file1, file2):
    """Compare two CSV files and split their (name, vorname) pairs into common and unique sets."""
    data1 = read_csv(file1)
    data2 = read_csv(file2)
    common_pairs = set(data1) & set(data2)
    unique_pairs1 = set(data1) - common_pairs
    unique_pairs2 = set(data2) - common_pairs
    return common_pairs, unique_pairs1, unique_pairs2, data1, data2


def find_similar_pairs(pair, other_data):
    """Return all pairs in other_data whose name and vorname each differ from pair by at most two edits."""
    similar_pairs = []
    for item in other_data:
        if distance(pair[0], item[0]) <= 2 and distance(pair[1], item[1]) <= 2:
            similar_pairs.append(item)
    return similar_pairs


def create_uuid():
    """Return a new random UUID as a string."""
    return str(uuid.uuid4())


def add_hl_tag(row):
    """Prefix the class (with leading zeros stripped) with the HL school tag."""
    klasse = str(row['klasse']).lstrip('0')
    return 'HL0707104-' + klasse


def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs):
    """Build the student import list from the system export (path) and the new class list (path_new)."""
    system_data = pd.read_csv(path, sep=';', encoding='utf-8')
    system_data = system_data[~system_data[['name', 'vorname']].apply(tuple, axis=1).isin(old_pairs)]
    system_data = system_data.drop('username', axis=1, errors='ignore')
    system_data = system_data.drop('klasse', axis=1, errors='ignore')
    # print(len(system_data))
    # print(system_data)

    new_data = pd.read_csv(path_new, sep=';', encoding='utf-8')

    # For students: the old classes were dropped above; refill them with the new classes from new_data
    matches = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(new_pairs)].copy()
    matches.loc[:, 'klasse'] = matches.apply(add_hl_tag, axis=1)
    # print(len(matches))
    # print(matches)

    system_data = pd.merge(system_data, matches, how='outer',
                           left_on=['name', 'vorname'],
                           right_on=['name', 'vorname'])
    system_data = system_data[['name', 'vorname', 'klasse', 'schuelerid']]
    # print(system_data)
    print(len(system_data))

    new_data = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(common_pairs)].copy()
    # new_data = new_data.drop('Unnamed: 2', axis=1, errors='ignore')
    new_uuids = [create_uuid() for _ in range(len(new_data))]

    # klasse?? difference between teachers and students
    # new_data['klasse'] = None
    new_data.loc[:, 'klasse'] = new_data.apply(add_hl_tag, axis=1)
    new_data.insert(loc=2, column='schuelerid', value=new_uuids)

    mailUserQuota = 1024
    oxUserQuota = 5120
    oxContext = 25
    # print(new_data)
    print(len(new_data))

    # fill in the missing data before the merge
    # import_df = pd.merge(system_data, new_data, how='outer',
    #                      left_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'],
    #                      right_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'])
    # import_df = pd.merge(system_data, new_data, how='outer',
    #                      left_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota',
    #                               'oxContext'],
    #                      right_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota',
    #                                'oxContext'])
    import_df = pd.merge(system_data, new_data, how='outer',
                         left_on=['name', 'vorname', 'klasse', 'schuelerid'],
                         right_on=['name', 'vorname', 'klasse', 'schuelerid'])
    import_df['mailUserQuota'] = mailUserQuota
    import_df['oxUserQuota'] = oxUserQuota
    import_df['oxContext'] = oxContext

    # pd.set_option('display.max_rows', None)
    # pd.set_option('display.max_columns', None)
    # print(import_df)
    # pd.reset_option('display.max_rows')
    # pd.reset_option('display.max_columns')

    out_path = '../Data/GPS/outputSchueler.csv'
    import_df.to_csv(out_path, sep=';', index=False)
    print(len(import_df))
    print('Student import list created')
    print('Add test users manually!!')


def main():
    file1_path = '../Data/GPS/gpsSnew.csv'
    file2_path = '../Data/GPS/gpsSold2.csv'
    file3_path = '../Data/GPS/gpsSold.csv'

    common_pairs, new_pairs, old_pairs, data1, data2 = compare_csv(file1_path, file2_path)

    print(f"Number of matching pairs: {len(common_pairs)}")
    print(f"Number of new entries: {len(new_pairs)}")
    print(f"Number of outdated entries: {len(old_pairs)}")

    # Build pairs only from the unassigned pairs of the new list
    # for pair in data1:
    #     similar_pairs_list2 = find_similar_pairs(pair, set(data2) - {pair})
    #
    #     if similar_pairs_list2:
    #         print(f"Similar pairs for new-list entry {pair} currently in the system: {similar_pairs_list2}")

    create_import_list(file3_path, file1_path, old_pairs, new_pairs, common_pairs)


if __name__ == "__main__":
    main()
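# --- Illustrative note (assumptions inferred from the code above, not verified against the data) ---
# gpsSnew.csv and gpsSold2.csv are assumed to be semicolon-separated lists whose
# first columns are name;vorname (the new list additionally carries a klasse
# column, since add_hl_tag reads row['klasse']); gpsSold.csv is assumed to be the
# system export with columns including name, vorname, klasse, schuelerid, username.
# Example of the class tag built by add_hl_tag for a row with klasse == '07A':
#   'HL0707104-' + '07A'.lstrip('0')  ->  'HL0707104-7A'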