changeset 6:bc6045ed0b2e

Added script to fix and re-index database fields and references
author Ivo Smits <Ivo@UCIS.nl>
date Tue, 12 Apr 2011 11:41:35 +0200
parents 5d62af5270dd
children 01dc7eeaf5df
files dbreindex.php todo.txt
diffstat 2 files changed, 94 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dbreindex.php	Tue Apr 12 11:41:35 2011 +0200
@@ -0,0 +1,92 @@
+#!/usr/bin/php
+<?php
+// Re-index the `messages` table in batches of 10: strip duplicate or unacceptable headers,
+// sync the stored `messageid` with the Message-ID header and add missing `groupmessages` links.
+chdir(__DIR__);
+require_once './common.php'; // presumably provides the $db handle used below
+
+$lastposted = NULL; // highest article id processed so far; NULL until the first article is seen
+while (TRUE) {
+	// ORDER BY keeps the `id` > ? paging deterministic, so no row is skipped between batches.
+	$articles = ($lastposted === NULL)
+		? $db->evalAllAssoc('SELECT * FROM `messages` ORDER BY `id` LIMIT 10')
+		: $db->evalAllAssoc('SELECT * FROM `messages` WHERE `id` > ? ORDER BY `id` LIMIT 10', $lastposted);
+	if (!count($articles)) break;
+	foreach ($articles as $article) {
+		// Advance the cursor first: the `continue`s below must not make the same batch repeat forever.
+		if ($lastposted === NULL || $article['id'] > $lastposted) $lastposted = $article['id'];
+		$headers = array(); // values of the unique headers seen, keyed by upper-cased name
+		$header = array(); // header lines to keep, in their original order
+		$headerchanged = FALSE;
+		foreach (explode("\r\n", $article['header']) as $line) {
+			if (!strlen($line) || $line == '.') {
+				print("Article $article[id] Contains empty or terminating header line\n");
+				continue;
+			}
+			$parts = explode(': ', $line, 2);
+			// A line without a ': ' separator has no name/value pair; it is kept verbatim (default case).
+			$headername = (count($parts) == 2) ? strtoupper($parts[0]) : '';
+			switch ($headername) {
+				case 'PATH': case 'FROM': case 'NEWSGROUPS': case 'SUBJECT': case 'DATE': case 'MESSAGE-ID': case 'SENDER':
+					if (isset($headers[$headername])) {
+						print("Article $article[id] Contains duplicate header $headername, removing.\n");
+						$headerchanged = TRUE;
+						break;
+					}
+					$header[] = $line;
+					$headers[$headername] = $parts[1];
+					break;
+				case 'NNTP-POSTING-HOST': case 'X-TRACE': case 'XREF': case 'X-COMPLAINTS-TO': case 'NNTP-POSTING-DATE':
+					print("Article $article[id] Contains unacceptable header $headername\n");
+					$headerchanged = TRUE;
+					break;
+				case 'ORGANIZATION': case 'LINES': case 'MIME-VERSION': case 'CONTENT-TYPE': case 'CONTENT-TRANSFER-ENCODING': case 'USER-AGENT':
+				case 'REFERENCES': case 'REPLY-TO': case 'FOLLOWUP-TO': case 'IN-REPLY-TO':
+				case 'EXPIRES': case 'CONTROL': case 'DISTRIBUTION': case 'KEYWORDS': case 'SUMMARY':
+				default: // known optional headers and anything unrecognised are kept unchanged
+					$header[] = $line;
+					break;
+			}
+		}
+		foreach (explode("\r\n", $article['body']) as $line) if ($line == '.') print("Article $article[id] Contains terminating body line\n");
+		if (!isset($headers['NEWSGROUPS'])) {
+			print("Article $article[id] Missing required Newsgroups header\n");
+			continue;
+		}
+		$newsgroups = array();
+		foreach (explode(',', $headers['NEWSGROUPS']) as $groupname) {
+			// Whitespace around the separating commas is not part of the group name.
+			$group = $db->evalRowAssoc('SELECT * FROM `groups` WHERE `name` = ?', trim($groupname));
+			if ($group === FALSE) continue;
+			$newsgroups[] = $group['id'];
+		}
+		if (!count($newsgroups)) {
+			print("Article $article[id] No known newsgroups listed\n");
+			continue;
+		}
+		if (!isset($headers['MESSAGE-ID'])) {
+			print("Article $article[id] Missing required Message-ID header\n");
+			continue;
+		}
+		$msgid = $headers['MESSAGE-ID'];
+		if (strlen($msgid) <= 2 || $msgid[0] != '<' || $msgid[strlen($msgid)-1] != '>') {
+			print("Article $article[id] Malformed Message-ID\n");
+		} else {
+			$msgid = substr($msgid, 1, -1); // `messageid` is compared and stored without the angle brackets
+			if ($msgid != $article['messageid']) {
+				print("Article $article[id] Message-ID header does not match database, fixing.\n");
+				$db->update('UPDATE `messages` SET `messageid` = ? WHERE `id` = ?', array($msgid, $article['id']));
+			}
+		}
+		if ($headerchanged) {
+			print("Article $article[id] Updating headers.\n");
+			$db->update('UPDATE `messages` SET `header` = ? WHERE `id` = ?', array(implode("\r\n", $header), $article['id']));
+		}
+		foreach ($newsgroups as $groupid) {
+			if (FALSE === $db->evalRow('SELECT * FROM `groupmessages` WHERE `group` = ? AND `message` = ?', array($groupid, $article['id']))) {
+				print("Article $article[id] Missing link in group $groupid, fixing.\n");
+				$db->insert('INSERT INTO `groupmessages` (`group`, `message`) VALUES (?, ?)', array($groupid, $article['id']));
+			}
+		}
+	}
+}
--- a/todo.txt	Tue Apr 12 02:23:22 2011 +0200
+++ b/todo.txt	Tue Apr 12 11:41:35 2011 +0200
@@ -1,4 +1,4 @@
 - Make sure that group article numbers are never reused, not even if the last one is deleted (groupmessages table)
 - Support IHAVE command to speed up synchronization
-- Handle received cross-posted messages (according to Newsgroups header)
-- Add script for re-indexing articles (based on Newsgroups header)
+- Use STAT before POSTing articles
+- Allow to store (part of) article data in file