> The midas history system always had the problem that the database can get
> corrupted if the disk gets full where the history records (*.hst & *.idx) are
> stored.
Stefan - big thanks for fixing this problem - it is one of those cases of "how come I
did not think of doing it!".
This change should fix the last remaining problem with history at CERN - we seem to
be unable to avoid running out of disk space once in a while (runaway scripts, fat
fingers, etc) and history got corrupted every time.
But to make things more interesting we had another history outage this week - we
happen to write history files to an NFS server (not recommended! do not do this!) and
when the NFS server had a glitch, history files got corrupted. Because NFS was not
available during the glitch, I think this roll-back feature would not have helped.
Anyhow, I now have a patch to allow hs_read() to "skip the bad spots" in the history
files. (hs_gen_index() also needs a patch).
In a nutshell, if invalid history data is detected, the code continues to read the
data one byte at a time, looking for valid event_id markers (etc).
The code looks sane by inspection, and if nobody objects, I would like to commit it
in the next few days.
Here is the diff against src/history.c rev 4114
Index: history.c
===================================================================
--- history.c (revision 4118)
+++ history.c (working copy)
@@ -129,6 +129,7 @@
HIST_RECORD rec;
INDEX_RECORD irec;
DEF_RECORD def_rec;
+ int recovering = 0;
printf("Recovering index files...\n");
@@ -171,7 +172,7 @@
/* skip tags */
lseek(fh, rec.data_size, SEEK_CUR);
- } else {
+ } else if (rec.record_type == RT_DATA) {
/* write index record */
irec.event_id = rec.event_id;
irec.time = rec.time;
@@ -180,6 +181,15 @@
/* skip data */
lseek(fh, rec.data_size, SEEK_CUR);
+ } else {
+
+ if (!recovering)
+ cm_msg(MERROR, "hs_gen_index", "broken history file %d, trying to
recover", (int)ltime);
+
+ recovering = 1;
+ lseek(fh, -sizeof(rec)+1, SEEK_CUR);
+
+ continue;
}
} while (TRUE);
@@ -220,6 +230,7 @@
time_t lt;
int fh, fhd, fhi;
struct tm *tms;
+ int idxsize = 0;
if (*ltime == 0)
*ltime = ss_time();
@@ -250,12 +261,15 @@
hs_open_file(*ltime, "idf", O_RDONLY, &fhd);
hs_open_file(*ltime, "idx", O_RDONLY, &fhi);
+ if (fhi >= 0)
+ idxsize = lseek(fhi, 0, SEEK_END);
+
close(fh);
close(fhd);
close(fhi);
/* generate them if not */
- if (fhd < 0 || fhi < 0)
+ if (fhd < 0 || fhi < 0 || idxsize == 0)
hs_gen_index(*ltime);
return HS_SUCCESS;
@@ -1480,12 +1494,33 @@
i = -1;
M_FREE(cache);
cache = NULL;
- } else
+ } else {
+
+ try_again:
+
i = sizeof(irec);
-
- if (cp < cache_size) {
memcpy(&irec, cache + cp, sizeof(irec));
cp += sizeof(irec);
+
+ /* if history file is broken ... */
+ if (irec.time < last_irec_time) {
+ //printf("time %d -> %d, cache_size %d, cp %d\n", last_irec_time, irec.time,
cache_size, cp);
+
+ //printf("Seeking next record...\n");
+
+ while (cp < cache_size)
+ {
+ DWORD* evidp = (DWORD*)(cache + cp);
+ if (*evidp == event_id) {
+ //printf("Found at cp %d\n", cp);
+ goto try_again;
+ }
+
+ cp++;
+ }
+
+ i = -1;
+ }
}
} else
i = read(fhi, (char *) &irec, sizeof(irec));
K.O. |