Date: Thu, 22 Oct 1998 16:25:46 -0400 (EDT)
From: Joseph S D Yao <[email protected]>
To: (Chuck W.) <[email protected]>
Subject: Re: "chat"
Cc: [email protected]
> Doesn't a finite state automaton reading one character at a time slow
> things down a bit too much? I thought about the same thing using getchar()
> or some derivative thereof, but I was afraid it would gum up the works a
> bit too much. I must admit I have never taken any metrics on the two. Do
> you have any code to pony up for a test?
Not in the least. That's what you have buffered I/O for. In fact, it's
often faster. Consider that fgets() looks something like:
while (cp < ep) {
c = getchar();
if (c == EOF)
break;
*cp++ = c;
if (c == NL)
break;
}
*cp = NUL;
return(buf);
and, if you want to be safe, you then have to test for the existence of
a NL, and take appropriate actions (e.g., flush to EOL) if not.
Try this. I needed to write something like this anyway.
============================= cut here ================================
/*********************************************************************\
**
** logsplit - split 'way-too-big log files with syslog-like dates.
**
** Syntax:
** logsplit < logfile
**
** Description:
** Splits the file into files named year00, year01, ..., breaking
** every time it sees a new "Jan " not preceded by another "Jan ".
**
** Uses FSA - states are location in "start" X value of "state"
** X value of "match". States are collapsed.
**
** $Log:$
**
** Files:
** year?? - output files.
**
** Routines:
** int main(int argc, char **argv, char **envp)
**
** Data:
** typedef char bool;
** typedef int boolean;
** static char start[] = "Jan ";
** static char outfile[] = "year\0\0";
** char *myname = "logsplit";
**
\*********************************************************************/
#ifndef lint
static char RCS_id[] = "@(#)$Id:$";
#endif/*lint*/
/* Only Standard C Library calls used. */
#include <stdio.h>
#include <string.h>
/* No constants in code!!! ;-) */
/* Special characters in the code. */
#define NUL '\0'
#define NL '\n'
#define DIRC '/'
#define FLAGC '-'
/* State values. */
#define OUTJAN 0 /* I've seen a non-Jan line. */
#define INJAN 1 /* I've seen a Jan line. */
#define START 2 /* I've not seen any lines. */
/* Match state values. */
#define MATCHING 0 /* Still trying to match. */
#define NOT_MATCHING 1 /* Not currently trying. */
/* Temporary state to help collapsing states */
#define MATCH_DONE 2 /* Match has just completed. */
/* Arguments to fopen(). */
#define READ "r"
#define WRITE "w"
#define APPEND "a"
/* Number of digits in the year, and number of values that holds. */
#define YRDIGS 2
#define YRMAX 100
/* Boolean values. */
#define TRUE (1)
#define FALSE (0)
/*
** Boolean data types - bool for small values, boolean for args and
** return values.
*/
typedef char bool;
typedef int boolean;
/* The string to match at the beginning of the line. */
static char start[] = "Jan ";
/* The name of the output file, with YRDIGS placeholders for year. */
static char outfile[] = "year\0\0";
/* The name of this program. */
char *myname = "logsplit";
/*
** main routine - reads argv[0] for 'myname', otherwise uses no args.
** always returns 0.
*/
int main(int argc, char **argv, char **envp)
{
register char *cp, *ep; /* pointers into start[] */
register int state, match; /* other state variables */
register int c; /* the character read. */
char *yearptr; /* where to save year #? */
unsigned int year; /* year number */
FILE *outf; /* output file handle */
#ifdef lint
/* Lint complains that envp's not used. OK. */
argv = envp;
/* This code should never be seen by a compiler. */
#endif/*lint*/
/* Get the program's name */
if (argc > 0) {
/* There is an argv[0]. Get the name from there. */
myname = strrchr(*argv, DIRC);
/*
** If there is no DIRC, use the whole name; otherwise,
** use what's after DIRC.
*/
if (myname == (char *) NULL)
myname = *argv;
else
++myname;
}
/* Initialize program variables. */
year = 0; /* Start with "year00". */
yearptr = strchr(outfile, NUL); /* Find First NUL. */
outf = (FILE *) NULL; /* No file yet. */
/* cp points to start of start; ep points to NUL at end. */
cp = start;
for (ep = cp; *ep != NUL; ++ep);
/* Initial states. */
state = START;
match = MATCHING;
/*
** Loop on reading one character at a time. Do something.
** Change state (values of state & match, location in start[])
** depending on value.
*/
while ((c = getchar()) != EOF) {
/*
** If we're currently still matching, then try to match.
*/
if (match == MATCHING) {
if (c != *cp++) {
/* Not a match - change state only. */
state = OUTJAN;
match = MATCH_DONE;
} else if (cp == ep) {
/* Match! */
/*
** If we had previously seen a non-Jan
** line, then if we had been writing to
** a file [and we should have been],
** close that file, and bump the year
** up.
*/
if (state == OUTJAN) {
if (outf != (FILE *) NULL) {
(void) fclose(outf);
outf = (FILE *) NULL;
++year;
}
}
/* Change the state. */
state = INJAN;
match = MATCH_DONE;
}
}
/*
** If we're still trying to match, don't write anything
** out yet - it might be to the wrong file!
*/
if (match == MATCHING)
continue;
/*
** If no file is currently open, create a name using the
** current value of "year" [00-99], and create a file of
** that name. If any of that fails, complain, and break
** out of this rut.
*/
if (outf == (FILE *) NULL) {
/* Will the year fit? */
if (year >= YRMAX) {
fprintf(stderr,
"%s: year is %d, too many years.\n",
myname, year);
break;
}
/* Create the file name. */
(void) sprintf(yearptr, "%*.*u",
YRDIGS, YRDIGS, year);
/* Create the file. */
outf = fopen(outfile, WRITE);
/* Did it get created? */
if (outf == (FILE *) NULL) {
perror(myname);
fprintf(stderr, "%s: Can't open \"%s\".",
myname, outfile);
break;
}
}
/*
** If we have only just now finished matching [or not,
** as the case may be], write out the portion that
** matched prior to the current character. Then drop
** the temporary state.
*/
if (match == MATCH_DONE) {
if (--cp > start)
(void) fwrite(start, 1, cp - start,
outf);
match = NOT_MATCHING;
}
/* Print the current character. */
putc(c, outf);
/*
** If the current character is a NL, change the state to
** start matching again with the next character.
*/
if (c == NL) {
cp = start;
match = MATCHING;
}
}
/* Clean up. Just because it's always a Good Idea(tm). */
if (outf != (FILE *) NULL)
(void) fclose(outf);
/* Always return SUCCESS. */
return(0);
}
============================= cut here ================================
--
/*********************************************************************\
**
** Joe Yao [email protected] - Joseph S. D. Yao
**
\*********************************************************************/