<?php

declare(strict_types=1);

/**
 * Interactive duplicate clean-up for a CSV file (gewinnspiel/cleanUpCsv.php,
 * introduced by the patch "cleanup step 1").
 *
 * Usage: php cleanUpCsv.php <file name inside ./input-csv>
 *
 * Workflow:
 *   1. Read input-csv/<file>, auto-detect the delimiter (',' or ';') and convert
 *      every cell from ISO-8859-1 to UTF-8.
 *   2. Group rows by e-mail (column COL_EMAIL) and by street (column COL_STREET),
 *      compared case-insensitively after trimming.
 *   3. For every group with more than one row, show the rows and ask on STDIN
 *      whether all rows except the first one should be deleted.
 *   4. If at least one row was marked, write the remaining rows to
 *      output-csv/<file> using the same delimiter.
 *
 * Row numbers shown to the user are 1-based file rows: the header is row 1, the
 * first data row is row 2.
 */

/** Zero-based index of the street column. */
const COL_STREET = 5;
/** Zero-based index of the e-mail column. */
const COL_EMAIL = 7;
/** CSV enclosure/escape passed explicitly (PHP's historical defaults) so that
 *  reading and writing stay symmetric and PHP 8.4 does not emit a deprecation. */
const CSV_ENCLOSURE = '"';
const CSV_ESCAPE = '\\';

// NOTE(review): the original opening of this script (PHP open tag and the start
// of the usage check) was lost when the patch was extracted; only the tail of the
// usage message survived. The condition and message wording below are a
// reconstruction - confirm against the original file.
if (!isset($argv[1])) {
    echo "Verwendung: php cleanUpCsv.php <dateiname.csv>\n";
    exit(1);
}

$filename = $argv[1];
$filepath = __DIR__ . '/input-csv/' . $filename;

if (!file_exists($filepath)) {
    echo "Fehler: Datei '$filepath' nicht gefunden.\n";
    exit(1);
}

$handle = fopen($filepath, 'r');
if (!$handle) {
    echo "Fehler: Datei konnte nicht geöffnet werden.\n";
    exit(1);
}

// Auto-detect the delimiter from the first line: whichever of ',' and ';'
// occurs more often wins, ',' on a tie.
$firstLine = fgets($handle);
if ($firstLine === false) {
    // Empty file: there is no header to read and nothing to clean up.
    fclose($handle);
    echo "Fehler: Datei '$filepath' ist leer.\n";
    exit(1);
}
$delimiter = substr_count($firstLine, ',') >= substr_count($firstLine, ';') ? ',' : ';';
rewind($handle);

echo "Erkannter Delimiter: '$delimiter'\n";

// Read the header row.
$header = fgetcsv($handle, 0, $delimiter, CSV_ENCLOSURE, CSV_ESCAPE);
if ($header === false) {
    fclose($handle);
    echo "Fehler: Kopfzeile konnte nicht gelesen werden.\n";
    exit(1);
}
$header = toUtf8Row($header);

$emailMap = [];   // normalised e-mail  => ['original' => string, 'rows' => list<int>]
$streetMap = [];  // normalised street  => ['original' => string, 'rows' => list<int>]
$rows = [];       // file row number    => list<string> (UTF-8 cells)
$rowNumber = 1;   // the header occupies row 1

while (($row = fgetcsv($handle, 0, $delimiter, CSV_ENCLOSURE, CSV_ESCAPE)) !== false) {
    $rowNumber++;
    $row = toUtf8Row($row);
    $rows[$rowNumber] = $row;

    // Rows that are too short simply contribute an empty key and are not grouped.
    $email = mb_strtolower(trim($row[COL_EMAIL] ?? ''));
    $street = mb_strtolower(trim($row[COL_STREET] ?? ''));

    if ($email !== '') {
        $emailMap[$email]['original'] = trim($row[COL_EMAIL]);
        $emailMap[$email]['rows'][] = $rowNumber;
    }

    if ($street !== '') {
        $streetMap[$street]['original'] = trim($row[COL_STREET]);
        $streetMap[$street]['rows'][] = $rowNumber;
    }
}

fclose($handle);

// -------------------------------------------------------
// Helper functions
// -------------------------------------------------------

/**
 * Converts every cell of a CSV row from ISO-8859-1 to UTF-8.
 *
 * fgetcsv() returns [null] for a blank line; cells are cast to string first so
 * that null never reaches mb_convert_encoding().
 *
 * @param array<int, string|null> $row
 * @return array<int, string>
 */
function toUtf8Row(array $row): array
{
    return array_map(
        static fn ($cell): string => (string) mb_convert_encoding((string) $cell, 'UTF-8', 'ISO-8859-1'),
        $row
    );
}

/**
 * Clips $text to $width characters and right-pads it with spaces to exactly
 * $width characters.
 *
 * Character-based on purpose: str_pad() counts bytes, which misaligns the table
 * as soon as a cell contains multi-byte UTF-8 characters such as umlauts.
 */
function padCell(string $text, int $width): string
{
    $clipped = mb_substr($text, 0, $width);

    return $clipped . str_repeat(' ', max(0, $width - mb_strlen($clipped)));
}

/**
 * Prints one duplicate group as a fixed-width table.
 *
 * @param array{original: string, rows: list<int>} $data   group to print
 * @param string                                   $label  caption prefix, e.g. 'E-Mail'
 * @param array<int, string>                       $header header cells
 * @param array<int, array<int, string>>           $rows   all data rows keyed by row number
 */
function printBlock(array $data, string $label, array $header, array $rows): void
{
    $colWidth = 22;
    $separator = str_repeat('-', count($header) * ($colWidth + 3)) . "\n";
    $headerLine = implode(
        ' | ',
        array_map(static fn (string $h): string => padCell($h, $colWidth), $header)
    );

    echo "\n" . $separator;
    echo "$label: {$data['original']}\n";
    echo $separator;
    echo "Zeile | $headerLine\n";
    echo $separator;

    foreach ($data['rows'] as $rowNum) {
        $cells = array_map(
            static fn (string $cell): string => padCell($cell, $colWidth),
            $rows[$rowNum]
        );
        echo "  $rowNum | " . implode(' | ', $cells) . "\n";
    }

    echo $separator;
}

/**
 * Shows a duplicate group and asks whether every row except the first should be
 * deleted. Answering "1" marks those rows in $rowsToDelete; any other answer
 * (including end of input on STDIN) skips the group.
 *
 * @param array{original: string, rows: list<int>} $data         group to decide on
 * @param string                                   $label        caption prefix for the table
 * @param array<int, string>                       $header       header cells
 * @param array<int, array<int, string>>           $rows         all data rows keyed by row number
 * @param array<int, true>                         $rowsToDelete row numbers marked for deletion (updated in place)
 */
function askDelete(array $data, string $label, array $header, array $rows, array &$rowsToDelete): void
{
    printBlock($data, $label, $header, $rows);

    $keep = $data['rows'][0];
    $delete = array_slice($data['rows'], 1);

    echo "Behalten: Zeile $keep – Löschen: Zeilen " . implode(', ', $delete) . "\n";
    echo "[1] Löschen  [Enter] Überspringen: ";

    // fgets() returns false once STDIN is exhausted; treat that as "skip".
    $answer = fgets(STDIN);
    $input = $answer === false ? '' : trim($answer);

    if ($input === '1') {
        foreach ($delete as $rowNum) {
            $rowsToDelete[$rowNum] = true;
        }
        echo "Zeilen " . implode(', ', $delete) . " zum Löschen vorgemerkt.\n";
    } else {
        echo "Übersprungen.\n";
    }
}

// -------------------------------------------------------
// STEP 1: duplicate e-mail addresses
// -------------------------------------------------------
$emailDuplicates = array_filter($emailMap, static fn (array $entry): bool => count($entry['rows']) > 1);
$rowsToDelete = [];

echo "\n========================================\n";
echo "  SCHRITT 1: Doppelte E-Mail-Adressen\n";
echo "========================================\n";

if (empty($emailDuplicates)) {
    echo "Keine gefunden.\n";
} else {
    echo count($emailDuplicates) . " doppelte E-Mail-Adresse(n) gefunden.\n";

    foreach ($emailDuplicates as $data) {
        askDelete($data, 'E-Mail', $header, $rows, $rowsToDelete);
    }
}

// -------------------------------------------------------
// STEP 2: duplicate streets (rows already marked in step 1 are hidden)
// -------------------------------------------------------
echo "\n========================================\n";
echo "  SCHRITT 2: Doppelte Straßen\n";
echo "========================================\n";

$streetDuplicates = [];
foreach ($streetMap as $entry) {
    // Drop rows already marked for deletion; a group only counts as a duplicate
    // if at least two rows are still left afterwards.
    $remaining = array_values(array_filter(
        $entry['rows'],
        static fn (int $rowNum): bool => !isset($rowsToDelete[$rowNum])
    ));

    if (count($remaining) > 1) {
        $entry['rows'] = $remaining;
        $streetDuplicates[] = $entry;
    }
}

if (empty($streetDuplicates)) {
    echo "Keine gefunden.\n";
} else {
    echo count($streetDuplicates) . " doppelte Straße(n) gefunden.\n";

    foreach ($streetDuplicates as $data) {
        askDelete($data, 'Straße', $header, $rows, $rowsToDelete);
    }
}

// -------------------------------------------------------
// Write the cleaned CSV only if rows were marked for deletion
// -------------------------------------------------------
if (empty($rowsToDelete)) {
    echo "\nKeine Zeilen zum Löschen vorgemerkt. Keine Ausgabedatei erstellt.\n";
    exit(0);
}

$outputPath = __DIR__ . '/output-csv/' . $filename;
// Use the parent of the output file so that a file name containing a
// sub-directory can be written as well.
$outputDir = dirname($outputPath);

if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    echo "Fehler: Ausgabeverzeichnis '$outputDir' konnte nicht erstellt werden.\n";
    exit(1);
}

$out = fopen($outputPath, 'w');
if (!$out) {
    echo "Fehler: Ausgabedatei '$outputPath' konnte nicht geöffnet werden.\n";
    exit(1);
}

fputcsv($out, $header, $delimiter, CSV_ENCLOSURE, CSV_ESCAPE);

foreach ($rows as $rowNum => $row) {
    if (isset($rowsToDelete[$rowNum])) {
        continue;
    }
    fputcsv($out, $row, $delimiter, CSV_ENCLOSURE, CSV_ESCAPE);
}

fclose($out);

echo "\nBereinigte CSV gespeichert: output-csv/$filename\n";
echo "Gelöschte Zeilen gesamt: " . count($rowsToDelete) . "\n";