Browse Source

clean up

master
Daniel 2 months ago
parent
commit
d387d468c7
3 changed files with 271 additions and 58 deletions
  1. +1
    -0
      .gitignore
  2. +8
    -0
      .idea/.gitignore
  3. +262
    -58
      gewinnspiel/cleanUpCsv.php

+ 1
- 0
.gitignore View File

@@ -0,0 +1 @@
.idea/encodings.xml

+ 8
- 0
.idea/.gitignore View File

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

+ 262
- 58
gewinnspiel/cleanUpCsv.php View File

@@ -19,8 +19,12 @@ if (!$handle) {
exit(1);
}

const COL_STREET = 5;
const COL_EMAIL = 7;
const COL_FIRSTNAME = 3;
const COL_LASTNAME = 4;
const COL_STREET = 5;
const COL_PLZORT = 6;
const COL_EMAIL = 7;
const COL_PHONE = 8;

// Delimiter auto-erkennen
$firstLine = fgets($handle);
@@ -30,12 +34,9 @@ rewind($handle);

echo "Erkannter Delimiter: '$delimiter'\n";

// Header einlesen
$header = fgetcsv($handle, 0, $delimiter);
$header = array_map(fn($cell) => mb_convert_encoding($cell, 'UTF-8', 'ISO-8859-1'), $header);

$emailMap = [];
$streetMap = [];
$rows = [];
$rowNumber = 1;

@@ -43,19 +44,6 @@ while (($row = fgetcsv($handle, 0, $delimiter)) !== false) {
$rowNumber++;
$row = array_map(fn($cell) => mb_convert_encoding($cell, 'UTF-8', 'ISO-8859-1'), $row);
$rows[$rowNumber] = $row;

$email = mb_strtolower(trim($row[COL_EMAIL] ?? ''));
$street = mb_strtolower(trim($row[COL_STREET] ?? ''));

if ($email !== '') {
$emailMap[$email]['original'] = trim($row[COL_EMAIL]);
$emailMap[$email]['rows'][] = $rowNumber;
}

if ($street !== '') {
$streetMap[$street]['original'] = trim($row[COL_STREET]);
$streetMap[$street]['rows'][] = $rowNumber;
}
}

fclose($handle);
@@ -64,35 +52,127 @@ fclose($handle);
// Hilfsfunktionen
// -------------------------------------------------------

function printBlock(array $data, string $label, array $header, array $rows): void
/**
 * Transliterates German umlauts and sharp s into their ASCII digraphs.
 *
 * Upper-case umlauts are mapped to the lower-case digraph as well.
 *
 * @param string $str Text to transliterate.
 * @return string The text with ä/ö/ü/ß (and Ä/Ö/Ü) replaced.
 */
function normalizeUmlauts(string $str): string
{
    $transliteration = [
        'ä' => 'ae',
        'ö' => 'oe',
        'ü' => 'ue',
        'ß' => 'ss',
        'Ä' => 'ae',
        'Ö' => 'oe',
        'Ü' => 'ue',
    ];

    return strtr($str, $transliteration);
}

/**
 * Produces the comparison form of a name: trimmed, lower-cased and with
 * umlauts transliterated via normalizeUmlauts().
 *
 * @param string $name Raw name as read from the CSV.
 * @return string Normalized name.
 */
function normalizeName(string $name): string
{
    $trimmed = trim($name);
    $lowered = mb_strtolower($trimmed);

    return normalizeUmlauts($lowered);
}

/**
 * Returns the first whitespace-separated word of a name.
 *
 * @param string $name Name that may consist of several words.
 * @return string The first word, or an empty string if there is none.
 */
function firstToken(string $name): string
{
    $tokens = preg_split('/\s+/', trim($name));

    return $tokens[0] ?? '';
}

/**
 * Compares two strings with similar_text() and checks the resulting
 * percentage against a threshold.
 *
 * @param string $a         First string.
 * @param string $b         Second string.
 * @param int    $threshold Minimum similarity in percent (inclusive).
 * @return bool True when the similarity percentage reaches the threshold.
 */
function isSimilar(string $a, string $b, int $threshold = 80): bool
{
    $similarity = 0.0;
    similar_text($a, $b, $similarity);

    if ($similarity >= $threshold) {
        return true;
    }

    return false;
}

/**
 * Checks whether two names are similar after normalization.
 *
 * Both names go through normalizeName() and are then compared with
 * isSimilar() using its default threshold.
 *
 * @param string $a First name value.
 * @param string $b Second name value.
 * @return bool Result of the similarity check.
 */
function nameSimilar(string $a, string $b): bool
{
    $left = normalizeName($a);
    $right = normalizeName($b);

    return isSimilar($left, $right);
}

/**
 * Checks whether two first names are similar, looking only at the first
 * word of each (so additional given names are ignored).
 *
 * @param string $a First name field of one row.
 * @param string $b First name field of the other row.
 * @return bool Result of isSimilar() on the normalized first words.
 */
function firstNameSimilar(string $a, string $b): bool
{
    $left = normalizeName(firstToken($a));
    $right = normalizeName(firstToken($b));

    return isSimilar($left, $right);
}

/**
 * Pulls the postal code out of a combined "PLZ Ort" field.
 *
 * The first run of four to five digits found anywhere in the field is used.
 *
 * @param string $plzOrt Combined postal code / city field.
 * @return string The digits found, or an empty string when there are none.
 */
function extractPlz(string $plzOrt): string
{
    $found = [];
    preg_match('/\d{4,5}/', $plzOrt, $found);

    if (!isset($found[0])) {
        return '';
    }

    return $found[0];
}

/**
 * Returns the city part of a combined "PLZ Ort" field in lower case.
 *
 * A leading run of four to five digits (plus following whitespace) is
 * removed; digits elsewhere in the field are left untouched.
 *
 * @param string $plzOrt Combined postal code / city field.
 * @return string Lower-cased, trimmed remainder of the field.
 */
function extractOrt(string $plzOrt): string
{
    $field = trim($plzOrt);
    $withoutPlz = preg_replace('/^\d{4,5}\s*/', '', $field);

    return mb_strtolower(trim($withoutPlz));
}

/**
 * Reduces a phone number to its digits so differently formatted numbers
 * can be compared.
 *
 * @param string $phone Phone number as entered.
 * @return string The number with every non-digit character removed.
 */
function normalizePhone(string $phone): string
{
    $digitsOnly = preg_replace('/\D/', '', $phone);

    return $digitsOnly;
}

/**
 * Decides whether two CSV rows look like a repeated registration.
 *
 * Both the last name and the first word of the first name must be similar.
 * On top of that at least one supporting reason is required: same postal
 * code, similar city, similar street, same phone number, or a missing
 * postal code on either row.
 *
 * @param array $rowA First row, indexed by the COL_* column constants.
 * @param array $rowB Second row, indexed by the COL_* column constants.
 * @return array|false List of human-readable reasons, or false when the
 *                     rows are not considered duplicates.
 */
function isDuplicate(array $rowA, array $rowB)
{
    // Short rows may lack the name columns; read them as empty strings, like
    // every other column below, instead of passing null to a string parameter.
    $lastA = $rowA[COL_LASTNAME] ?? '';
    $lastB = $rowB[COL_LASTNAME] ?? '';
    $firstA = $rowA[COL_FIRSTNAME] ?? '';
    $firstB = $rowB[COL_FIRSTNAME] ?? '';

    if (!nameSimilar($lastA, $lastB)) return false;
    if (!firstNameSimilar($firstA, $firstB)) return false;

    $reasons = [];

    $plzA = extractPlz($rowA[COL_PLZORT] ?? '');
    $plzB = extractPlz($rowB[COL_PLZORT] ?? '');
    $ortA = extractOrt($rowA[COL_PLZORT] ?? '');
    $ortB = extractOrt($rowB[COL_PLZORT] ?? '');
    $strA = mb_strtolower(trim($rowA[COL_STREET] ?? ''));
    $strB = mb_strtolower(trim($rowB[COL_STREET] ?? ''));
    $phoneA = normalizePhone($rowA[COL_PHONE] ?? '');
    $phoneB = normalizePhone($rowB[COL_PHONE] ?? '');

    if ($plzA !== '' && $plzB !== '' && $plzA === $plzB) {
        $reasons[] = "gleiche PLZ ($plzA)";
    }

    if ($ortA !== '' && $ortB !== '' && isSimilar($ortA, $ortB)) {
        $reasons[] = "ähnlicher Ort ($ortA ≈ $ortB)";
    }

    if ($strA !== '' && $strB !== '' && isSimilar($strA, $strB)) {
        $reasons[] = "ähnliche Straße ($strA ≈ $strB)";
    }

    if ($phoneA !== '' && $phoneB !== '' && $phoneA === $phoneB) {
        $reasons[] = "gleiche Telefonnummer ($phoneA)";
    }

    // A missing postal code counts as a reason on its own, so matching names
    // with incomplete address data are still reported.
    if ($plzA === '' || $plzB === '') {
        $reasons[] = "PLZ/Ort fehlt bei einem Eintrag";
    }

    if (empty($reasons)) return false;

    return $reasons;
}

function printBlock(array $rowNums, array $header, array $rows, array $reasons): void
{
$colWidth = 22;
$colWidth = 20;
$separator = str_repeat('-', count($header) * ($colWidth + 3)) . "\n";
$headerLine = implode(' | ', array_map(fn($h) => str_pad(mb_substr($h, 0, $colWidth), $colWidth), $header));

echo "\n" . $separator;
echo "$label: {$data['original']}\n";
echo "Grund: " . implode(', ', $reasons) . "\n";
echo $separator;
echo "Zeile | $headerLine\n";
echo $separator;

foreach ($data['rows'] as $rowNum) {
foreach ($rowNums as $rowNum) {
$cells = array_map(
fn($cell) => str_pad(mb_substr($cell, 0, $colWidth), $colWidth),
$rows[$rowNum]
);
echo " $rowNum | " . implode(' | ', $cells) . "\n";
echo " " . str_pad((string)$rowNum, 5) . "| " . implode(' | ', $cells) . "\n";
}

echo $separator;
}

function askDelete(array $data, string $label, array $header, array $rows, array &$rowsToDelete): void
function askDelete(array $rowNums, array $header, array $rows, array $reasons, array &$rowsToDelete): void
{
printBlock($data, $label, $header, $rows);
printBlock($rowNums, $header, $rows, $reasons);

$keep = $data['rows'][0];
$delete = array_slice($data['rows'], 1);
$keep = $rowNums[0];
$delete = array_slice($rowNums, 1);

echo "Behalten: Zeile $keep – Löschen: Zeilen " . implode(', ', $delete) . "\n";
echo "[1] Löschen [Enter] Überspringen: ";
@@ -108,54 +188,177 @@ function askDelete(array $data, string $label, array $header, array $rows, array
}
}

/**
 * Renders a textual progress bar such as "[███░░░] 50 % (3/6)".
 *
 * The filled portion is clamped to the bar width, so a $current that is
 * negative or larger than $total can no longer make str_repeat() receive a
 * negative count. The "(current/total)" suffix still shows the raw values.
 *
 * @param int $current Number of items processed so far.
 * @param int $total   Total number of items; 0 or less renders a full bar.
 * @param int $width   Bar width in characters; negative values count as 0.
 * @return string The rendered bar including percentage and counters.
 */
function progressBar(int $current, int $total, int $width = 30): string
{
    $width = max(0, $width);

    // Keep the ratio within [0, 1] so $filled always stays within the bar.
    $pct = $total > 0 ? min(1.0, max(0.0, $current / $total)) : 1.0;

    $filled = (int)round($pct * $width);
    $bar = str_repeat('█', $filled) . str_repeat('░', $width - $filled);

    // Explicit string cast: str_pad() expects a string (matters under strict types).
    $percentText = str_pad((string)(int)round($pct * 100), 3);

    return '[' . $bar . '] ' . $percentText . "% ($current/$total)";
}

/**
 * Walks through all duplicate groups and asks the user what to delete.
 *
 * Rows that are already marked in $rowsToDelete are removed from a group
 * before it is shown; groups with fewer than two remaining rows are skipped.
 * A progress line is printed before every group that is shown and once more
 * at the end.
 *
 * @param array  $groups       Groups, each with a 'rows' and a 'reasons' entry.
 * @param array  $header       CSV header cells, passed on to askDelete().
 * @param array  $rows         All CSV rows keyed by row number.
 * @param array  $rowsToDelete Row numbers marked for deletion (used as keys); passed by reference.
 * @param string $stepLabel    Label printed next to the progress bar.
 * @return void
 */
function processDuplicateGroups(array $groups, array $header, array $rows, array &$rowsToDelete, string $stepLabel): void
{
    $groupCount = count($groups);

    if ($groupCount === 0) {
        echo "Keine gefunden.\n";
        return;
    }

    echo "$groupCount Gruppe(n) gefunden.\n";

    $position = 0;
    foreach ($groups as $group) {
        $position++;

        // Collect the rows of this group that are still alive.
        $remaining = [];
        foreach ($group['rows'] as $rowNum) {
            if (!isset($rowsToDelete[$rowNum])) {
                $remaining[] = $rowNum;
            }
        }

        if (count($remaining) >= 2) {
            echo "\n" . progressBar($position, $groupCount) . " – $stepLabel\n";
            askDelete($remaining, $header, $rows, $group['reasons'], $rowsToDelete);
        }
    }

    echo "\n" . progressBar($groupCount, $groupCount) . " – $stepLabel abgeschlossen.\n";
}

// -------------------------------------------------------
// SCHRITT 1: E-Mail-Duplikate
// -------------------------------------------------------
$emailDuplicates = array_filter($emailMap, fn($entry) => count($entry['rows']) > 1);
$rowsToDelete = [];

echo "\n========================================\n";
echo " SCHRITT 1: Doppelte E-Mail-Adressen\n";
echo " SCHRITT 1: E-Mail-Duplikate\n";
echo "========================================\n";

if (empty($emailDuplicates)) {
echo "Keine gefunden.\n";
} else {
echo count($emailDuplicates) . " doppelte E-Mail-Adresse(n) gefunden.\n";
$emailMap = [];
foreach ($rows as $rowNum => $row) {
$email = mb_strtolower(trim($row[COL_EMAIL] ?? ''));
if ($email !== '') {
$emailMap[$email][] = $rowNum;
}
}

$autoDeleteGroups = [];
$manualReviewGroups = [];

foreach ($emailDuplicates as $data) {
askDelete($data, 'E-Mail', $header, $rows, $rowsToDelete);
foreach ($emailMap as $email => $rowNums) {
if (count($rowNums) < 2) continue;

// Prüfen ob alle Einträge gleichen Vor- + Nachnamen haben
$autoDelete = true;
$firstFirst = normalizeName(firstToken($rows[$rowNums[0]][COL_FIRSTNAME] ?? ''));
$firstLast = normalizeName($rows[$rowNums[0]][COL_LASTNAME] ?? '');

foreach ($rowNums as $rowNum) {
$first = normalizeName(firstToken($rows[$rowNum][COL_FIRSTNAME] ?? ''));
$last = normalizeName($rows[$rowNum][COL_LASTNAME] ?? '');
if (!isSimilar($first, $firstFirst) || !isSimilar($last, $firstLast)) {
$autoDelete = false;
break;
}
}

if ($autoDelete) {
$autoDeleteGroups[] = ['rows' => $rowNums, 'reasons' => ["gleiche E-Mail + gleicher Name ($email)"]];
} else {
$manualReviewGroups[] = ['rows' => $rowNums, 'reasons' => ["E-Mail: $email"]];
}
}

$rowsToDelete = [];
$autoDeleteCount = 0;

// Auto-Delete: gleiche E-Mail + gleicher Name
if (!empty($autoDeleteGroups)) {
echo "\n--- Auto-Delete: gleiche E-Mail + gleicher Name ---\n";
foreach ($autoDeleteGroups as $group) {
$keep = $group['rows'][0];
$delete = array_slice($group['rows'], 1);

printBlock($group['rows'], $header, $rows, $group['reasons']);
echo "Behalten: Zeile $keep – Auto-gelöscht: Zeilen " . implode(', ', $delete) . "\n";

foreach ($delete as $rowNum) {
$rowsToDelete[$rowNum] = true;
$autoDeleteCount++;
}
}
echo "\n========================================\n";
echo " Auto-Delete Zusammenfassung\n";
echo "========================================\n";
echo "Gruppen: " . count($autoDeleteGroups) . "\n";
echo "Gelöschte Zeilen: $autoDeleteCount\n";
echo "Zeilen: " . implode(', ', array_keys($rowsToDelete)) . "\n";
}

// Manuelle Prüfung: gleiche E-Mail aber unterschiedliche Namen
if (!empty($manualReviewGroups)) {
echo "\n--- Manuelle Prüfung: gleiche E-Mail, unterschiedliche Namen ---\n";
processDuplicateGroups($manualReviewGroups, $header, $rows, $rowsToDelete, 'E-Mail');
} else {
echo "\nKeine manuellen E-Mail-Duplikate.\n";
}

// -------------------------------------------------------
// SCHRITT 2: Straßen-Duplikate (bereits gelöschte Zeilen ausblenden)
// SCHRITT 2: Mehrfachanmeldungen
// -------------------------------------------------------
echo "\n========================================\n";
echo " SCHRITT 2: Doppelte Straßen\n";
echo " SCHRITT 2: Mehrfachanmeldungen\n";
echo "========================================\n";

foreach ($streetMap as $key => $data) {
$streetMap[$key]['rows'] = array_values(array_filter($data['rows'], fn($r) => !isset($rowsToDelete[$r])));
if (count($streetMap[$key]['rows']) < 2) {
unset($streetMap[$key]);
$byLastname = [];
foreach ($rows as $rowId => $row) {
if (isset($rowsToDelete[$rowId])) continue;
$key = mb_substr(normalizeName($row[COL_LASTNAME] ?? ''), 0, 3);
if ($key !== '') {
$byLastname[$key][] = $rowId;
}
}

$streetDuplicates = array_filter($streetMap, fn($entry) => count($entry['rows']) > 1);
$duplicateGroups = [];
$checked = [];
$bucketCount = count($byLastname);
$bucketDone = 0;

if (empty($streetDuplicates)) {
echo "Keine gefunden.\n";
} else {
echo count($streetDuplicates) . " doppelte Straße(n) gefunden.\n";
foreach ($byLastname as $bucket) {
$bucketDone++;
if ($bucketDone % 20 === 0) {
echo "\r" . progressBar($bucketDone, $bucketCount, 40) . " Analysiere... ";
}

foreach ($streetDuplicates as $data) {
askDelete($data, 'Straße', $header, $rows, $rowsToDelete);
for ($i = 0; $i < count($bucket); $i++) {
for ($j = $i + 1; $j < count($bucket); $j++) {
$idA = $bucket[$i];
$idB = $bucket[$j];
$key = "$idA-$idB";
if (isset($checked[$key])) continue;
$checked[$key] = true;

$reasons = isDuplicate($rows[$idA], $rows[$idB]);
if ($reasons === false) continue;

$merged = false;
foreach ($duplicateGroups as &$group) {
if (in_array($idA, $group['rows']) || in_array($idB, $group['rows'])) {
if (!in_array($idA, $group['rows'])) $group['rows'][] = $idA;
if (!in_array($idB, $group['rows'])) $group['rows'][] = $idB;
$group['reasons'] = array_unique(array_merge($group['reasons'], $reasons));
$merged = true;
break;
}
}
unset($group);

if (!$merged) {
$duplicateGroups[] = ['rows' => [$idA, $idB], 'reasons' => $reasons];
}
}
}
}

echo "\r" . progressBar($bucketCount, $bucketCount, 40) . " Analyse abgeschlossen.\n\n";

processDuplicateGroups($duplicateGroups, $header, $rows, $rowsToDelete, 'Mehrfachanmeldung');

// -------------------------------------------------------
// CSV schreiben falls Zeilen zum Löschen vorgemerkt
// CSV schreiben
// -------------------------------------------------------
if (empty($rowsToDelete)) {
echo "\nKeine Zeilen zum Löschen vorgemerkt. Keine Ausgabedatei erstellt.\n";
@@ -163,23 +366,24 @@ if (empty($rowsToDelete)) {
}

$outputDir = __DIR__ . '/output-csv';
$outputPath = $outputDir . '/' . $filename;
$outputPath = $outputDir . '/clean_' . $filename;

if (!is_dir($outputDir)) {
mkdir($outputDir, 0755, true);
}

$out = fopen($outputPath, 'w');
fputcsv($out, $header, $delimiter);

$headerIso = array_map(fn($cell) => mb_convert_encoding($cell, 'ISO-8859-1', 'UTF-8'), $header);
fputcsv($out, $headerIso, $delimiter);

foreach ($rows as $rowNum => $row) {
if (isset($rowsToDelete[$rowNum])) {
continue;
}
fputcsv($out, $row, $delimiter);
if (isset($rowsToDelete[$rowNum])) continue;
$rowIso = array_map(fn($cell) => mb_convert_encoding($cell, 'ISO-8859-1', 'UTF-8'), $row);
fputcsv($out, $rowIso, $delimiter);
}

fclose($out);

echo "\nBereinigte CSV gespeichert: output-csv/$filename\n";
echo "\nBereinigte CSV gespeichert: output-csv/clean_$filename\n";
echo "Gelöschte Zeilen gesamt: " . count($rowsToDelete) . "\n";

Loading…
Cancel
Save