// Page break is automatic paragraph break
// include ^_ as EOP as well for recognizing Babyl files
inline int pagebreak(char v)
{
    // '\014' is form feed (^L); '\037' is ^_, accepted so that Babyl files
    // are split into paragraphs as well.
    switch (v) {
    case '\014':
    case '\037':
        return 1;
    default:
        return 0;
    }
}
// Read char into variable, returning nz if end of line or end of page.
// Read one character from i into v.
//   returns nonzero if a character was read and it is a newline or a
//           page-break character; zero otherwise.
// When the read fails (end of file or a stream error) v holds no newly read
// character, so it is not examined at all; the function returns zero and the
// caller is expected to check the stream state.
inline int eolgetc(char & v, istream & i)
{
    if (!i.get(v)) return 0;
    return (v == '\n' || pagebreak(v));
}
// Save-buffer helpers: reset the buffer, and add a char to it.
// Reset a line buffer by calling set_buffer(0) on it; main() does this
// before it starts collecting a new paragraph.
inline void reset(line_buffer & lb)
{
    lb.set_buffer(0);
}
// Append the single character c to the end of the line buffer.
void add(line_buffer & lb, char c)
{
    // Insert one character at offset textlen(), i.e. after the current text.
    // NOTE(review): the trailing 0 looks like a "characters to delete"
    // count -- confirm against line_buffer::replace.
    lb.replace(lb.textlen(), &c, 1, 0);
}
// Complain about inappropriate args
// Hand-written declaration of the C library exit(), which usage() calls.
extern "C" void exit(int);
// Program name printed in the usage message.
const char * progname = "m2html";
// Print a one-line usage summary on cerr and terminate with exit status 1.
// The program works as a filter: mail on stdin, HTML on stdout.
void usage()
{
    cerr << "usage: ";
    cerr << progname;
    cerr << " < mailfile > htmlfile\n";
    exit(1);
}
// shared variables between process() and main()
// filestate records what kind of paragraph was handled last: none yet
// (filestart), a message header (msgheader) or message text (msgtext).
// process() reads it to choose the glue output and then updates it.
static enum { filestart, msgheader, msgtext } filestate = filestart;
// para holds the paragraph currently being collected: main() appends to it
// and resets it, process() and its helpers read (and trim) it.
static line_buffer para;
// process a line of text
// input = pointer to start of line
// output = pointer to start of next line
//
// does untabification, < > conversion, @ and URL HREFification
// Write the characters in [w, s) to cout.
//   w  first character to write
//   s  one past the last character to write
// The HTML special characters '<', '>' and '&' are written as entity
// references so that message text can never be interpreted as markup.
static void outword(const char * w, const char * s)
{
    for (; w != s; w++) {
        switch (*w) {
        case '<': cout << "&lt;";  break;
        case '>': cout << "&gt;";  break;
        case '&': cout << "&amp;"; break;
        default:  cout << *w;      break;
        }
    }
}
// Write the text in [w, s) as an HTML anchor: the link target is `scheme`
// followed by the text, and the visible label is the text itself.
static void outlink(const char * scheme, const char * w, const char * s)
{
    cout << "<A HREF=\"" << scheme;
    for (const char * p = w; p != s; p++) {
        // The URL scan does not stop at '"', so protect the attribute value.
        if (*p == '"') cout << "&quot;";
        else outword(p, p+1);
    }
    cout << "\">";
    outword(w, s);
    cout << "</A>";
}

// Write one line of text to cout, followed by '\n'.
//   s        start of the line; the line ends at '\n' or at the terminating
//            '\0'
//   returns  pointer to the start of the next line (just past the '\n'), or
//            to the '\0' when the text ended without a newline
// Text is flushed through outword() one word at a time.  A word that turns
// out to be an e-mail address (contains '@') or a URL ("mailto:" prefix or
// "://") is wrapped in an anchor instead.
static const char * processline(const char * s)
{
    int inword = 1;         // nonzero while the pending word may become a link
    const char * w = s;     // start of the text not yet written
    while (*s != '\0' && *s != '\n') {
        switch(*s) {
        case '<': case '>': case ',': case ' ':
        case '(': case ')': case '[': case ']':
        case '\t': case '"': case '\'':
        case ';': // no colon tho because of http: etc
            outword(w,s+1);     // flush pending text including the punctuation
            w = s+1;            // next word starts after it
            inword = 1;
            break;
        case '@':
            // Candidate address: something precedes the '@' in this word, and
            // the word is not directly preceded by "In article <" (presumably
            // a message-id).  The look-behind is only done when w is at least
            // 12 characters into para.
            if (inword && w != s &&
                ((w - para.text()) < 12
                 || strncmp(w-12,"In article <",12) != 0)) {
                while ((*++s >= 'a' && *s <= 'z') ||
                       (*s >= 'A' && *s <= 'Z') ||
                       (*s >= '0' && *s <= '9') ||
                       (*s == '.' || *s == '-'))
                    ;                       // find end of hostname
                while (*(s-1) == '.') s--;  // back over trailing period
                outlink("mailto:", w, s);
                w = s;
                s--;            // make up for the s++ at the loop bottom
                break;
            }
            // atsign after nonword, treat as default nonword char
            inword = 0;
            break;
        case '$':
            // this character shows up in many msg-ids and few real email addrs
            inword = 0;
            break;
        case ':': // maybe URL?
            if (inword && (strncmp(w,"mailto:",7) == 0 ||
                           strncmp(s,"://",3) == 0))
            {
                while (*++s != '<' && *s != '>' && *s != ' '
                       && *s != '(' && *s != ')' && *s != '\t'
                       && *s != ','
                       && *s != '\n' && *s != '\0')
                    ;                       // find end of URL at a break char
                while (*(s-1) == '.') s--;  // back over trailing period
                outlink("", w, s);          // the word already carries its scheme
                w = s;
                s--;            // make up for the s++ at the loop bottom
                break;
            }
            // colon after nonword, treat as default nonword char
            inword = 0;
            break;
        default: // any other character
            break;
        }
        s++;
    }
    outword(w,s); // flush final word
    cout << "\n";
    if (*s == '\n') s++;
    return s;
}
// check if a header line should be output
// if yes, output it and return nonzero
// Output a header line if it starts with the given header name.
//   s        start of the header line
//   t        header name to look for, including its colon (or "" to accept
//            any line, which is how continuation lines are printed)
//   returns  nonzero if the line matched and was written, zero otherwise
static int testhead(const char * s, const char * t)
{
    int l = (int) strlen(t);
    if (strncmp(s,t,l) != 0) return 0;  // not that header, give up
    cout << t;
    // Pad to the 16th column so the header contents line up.  This is done
    // character by character; offsetting into a string literal by l would
    // run past the end of the literal.
    for (int col = l; col < 16; col++) cout << ' ';
    s += l;                                 // skip past header name
    while (*s == ' ' || *s == '\t') s++;    // skip past whitespace
    processline(s);                         // output header contents
    return 1;
}
// process a paragraph known to be message header
// Process a paragraph known to be a message header.
// Walks para line by line and prints the selected headers through
// testhead().  cont remembers whether the previous header line was printed,
// so that its continuation lines (starting with a space or tab) are printed
// as well.
static void processheader()
{
    const char * s = para.text();
    int cont = 0;
    while (*s != '\0') {
        switch(*s) {
        case ' ': case '\t':
            if (cont) testhead(s,"");   // continuation of a printed header
            break;
        case 'D':
            cont = testhead(s,"Date:");
            break;
        case 'F':
            cont = testhead(s,"From:");
            break;
        case 'K':
            cont = testhead(s,"Keywords:");
            break;
        case 'N':
            cont = testhead(s,"Newsgroups:");
            break;
        case 'O':
            cont = testhead(s,"Organization:");
            break;      // without this, cont was overwritten by the cases below
        case 'R':
            cont = ( testhead(s,"Reply-to:")
                     || testhead(s,"Reply-To:") );
            break;      // likewise
        case 'S':
            cont = (testhead(s,"Subject:")
                    || testhead(s,"Summary:"));
            break;
        case 'T':
            cont = testhead(s,"To:");
            break;
        default:
            cont = 0;
            break;
        }
        // Advance to the next line.  The current character is tested before
        // stepping so an empty line cannot swallow the line after it.
        while (*s != '\n' && *s != '\0') s++;
        if (*s == '\n') s++;
    }
}
// process a paragraph known to be message text
// Process a paragraph known to be message text: hand every line of para to
// processline(), which writes it and returns the start of the next line.
static void processtext()
{
    for (const char * line = para.text(); *line != '\0'; line = processline(line))
        ;   // all the work happens in processline()
}
// process a paragraph of text
// Process the paragraph collected in para.
// Strips Babyl bookkeeping, decides whether the paragraph is a message
// header (it contains a line starting with "From:") or message text, writes
// the glue that separates it from the previous paragraph, and hands it to
// processheader() or processtext().  Updates filestate.
static void process()
{
    if (para.textlen() == 0) return; // nothing to process
    // make sure the paragraph ends with a newline
    if (para.text()[para.textlen() - 1] != '\n')
        add(para,'\n');
    // handle Babyl format junk
    // ignore file header, saved full headers, start of abbrev headers
    if (strncmp(para.text(), "1,,\n", 4) == 0) {
        // DE 9 Jun 1999: VM doesn't leave blank line before *** EOOH ***
        // so look for it in here rather than immediately returning
        const char * start = para.text();
        const char * eooh = strstr(start, "\n*** EOOH ***\n");
        if (eooh == 0) return;  // no marker: the whole paragraph is Babyl junk
        // Drop everything up to and including the 14-character marker.
        para.replace(0, 0, 0, (int)(eooh - start) + 14);
        if (para.textlen() == 0) return;    // nothing followed the marker
    }
    if (strncmp(para.text(), "BABYL OPTIONS:", 14) == 0) return;
    // DE 1 Aug 1996: remove newline from string
    // since emacs 19 adds " -*- rmail -*-" on same line
    if (strncmp(para.text(), "*** EOOH ***\n", 13) == 0)
        para.replace(0, 0, 0, 13);  // Babyl real hdrs
    // test if header by searching for a line starting with "From:"
    int is_header = 0;
    const char * s = para.text();
    while (*s != '\0') {
        if (strncmp(s, "From:", 5) == 0) {
            is_header = 1;
            break;
        }
        while (*s != '\n' && *s != '\0') s++;
        if (*s == '\n') s++;
    }
    // output glue from old state
    // NOTE(review): the glue strings hold only newlines; if HTML markup was
    // intended between sections, confirm against the original source.
    switch (filestate) {
    case filestart:
        if (!is_header) cerr << "Warning: file does not begin w/headers\n";
        cout << "\n\n";
        break;
    case msgheader:
        if (is_header) cerr << "Warning: two headers in a row\n";
        cout << "\n\n";
        break;
    case msgtext:
        if (is_header) cout << "\n\n";
        else cout << "\n";
        break;
    default:
        cerr << "Error: bad state in process glue\n";
        break;
    }
    // set new state and handle para
    if (is_header) {
        filestate = msgheader;
        processheader();
    } else {
        filestate = msgtext;
        processtext();
    }
}
// The main program.
// The main program: a filter from mail on stdin to HTML on stdout.
// Reads cin one character at a time, collects paragraphs in para and hands
// each completed paragraph to process().  A paragraph ends at a blank line,
// at a page-break character, or at end of input.
//   returns 0 after the input has been consumed; any argument makes
//           usage() print a message and exit with status 1.
int main (int argc, char ** argv)
{
    if (argc > 1) usage();
    int was_eol = 1;            // previous character was a line break
    int have_nonspace = 0;      // current paragraph has a non-blank character
    for (;;) {                  // looping over chars in file
        char inchar = '\0';
        int eol = eolgetc(inchar, cin);
        if (!cin) {
            // End of file or a read error: inchar holds no new character.
            // Testing the whole stream state (not just eof) keeps a failed
            // stream from looping forever.
            if (have_nonspace) process();   // process final paragraph
            cout << "\n\n";                 // finish output
            break;
        }
        if (eol) {              // end of line?
            if (was_eol || pagebreak(inchar) || !have_nonspace) {
                // found paragraph break
                if (have_nonspace) process();   // output old paragraph
                reset(para);                    // and start new one
                have_nonspace = 0;
            }
            else {
                was_eol = 1;        // one newline, remember the next
                add(para,'\n');     // but assume internal to para for now
            }
            continue;           // back to top of loop
        }
        was_eol = 0;            // if we got here, have non-line-break char
        if (inchar != ' ' && inchar != '\t') have_nonspace = 1;
        add(para, inchar);      // add it to buffer
    }                           // end loop over chars in file
    return 0;
}