// Very primitive RTF 2 HTML reader // Converts tiny subset of RTF (from VS IDE) into html. // Gets input RTF from clipboard. using System; using System.Collections.Generic; using System.Text;using System.Windows.Forms; using System.Text.RegularExpressions; using System.IO; namespace ClipBoard1{ class Program { [STAThread()] static void Main(string[] args) { Console.WriteLine("Get RTF from the clipboard."); IDataObject iData = Clipboard.GetDataObject(); string[] f = iData.GetFormats(); string rtf = (string)iData.GetData(DataFormats.Rtf); Console.WriteLine(iData.GetData(DataFormats.Text)); // We assume the colortable and fontable are a standard preset used by VS. // Avoids hassle of parsing them. // Skip past {\colortbl.*;} and to the start of the real data // @todo - regular expression would be good here. int i1 = rtf.IndexOf(@"{\colortbl"); if (i1 <= 0) throw new ArgumentException("Bad input RTF."); int i2 = rtf.IndexOf(";}", i1); if (i2 <= 0) throw new ArgumentException("Bad input RTF."); string data = rtf.Substring(i2 + 2, rtf.Length - (i2 + 2) - 1); TextWriter tw = new StreamWriter("out.html"); Format(tw, data); tw.Close(); } // Default color table used by VS's IDE. static string[] m_colorTable = new string[] { // rrGGbb "#000000", // default, starts at index 0 "#000000", // real color table starts at index 1 "#0000FF", "#00ffFF", "#00FF00", "#FF00FF", "#FF0000", "#FFFF00", "#FFffFF", "#000080", "#008080", "#008000", "#800080", "#800000", "#808000", "#808080", "#c0c0c0" }; // Escape HTML chars static string Escape(string st) { st = st.Replace("&", "&"); st = st.Replace("<", "<"); st = st.Replace(">", ">"); return st; } // Convert the RTF data into an HTML stream. // This rtf snippet is past the font + color tables, so we're just transfering control words now. // Write out HTML to the text writer. static void Format(TextWriter tw, string rtf) { tw.Write(""); tw.Write(""); // Example: \fs20 \cf2 using\cf0 System; // root --> ('text' '\' ('control word' | 'escaped char'))+ // 'control word' --> (alpha)+ (numeric*) space? // 'escaped char' = 'x'. Some characters \, {, } are escaped: '\x' --> 'x' // @todo - handle embedded groups (begin with '{') int idx = 0; while (idx < rtf.Length) { // Get any text up to a '\'. Regex r1 = new Regex(@"(.*?)\\", RegexOptions.Singleline | RegexOptions.IgnoreCase); Match m = r1.Match(rtf, idx); if (m.Length == 0) break; // text will be empty if we have adjacent control words string stText = m.Groups[1].ToString(); tw.Write(Escape(stText)); idx += m.Length; // check for RTF escape characters. According to the spec, these are the only escaped chars. char chNext = rtf[idx]; if (chNext == '{' || chNext == '}' || chNext == '\\') { // Escaped char tw.Write(chNext); idx++; continue; } // Must be a control char. @todo- delimeter includes more than just space, right? Regex r2 = new Regex(@"([\{a-z]+)([0-9]*) ", RegexOptions.Singleline | RegexOptions.IgnoreCase); m = r2.Match(rtf, idx); string stCtrlWord = m.Groups[1].ToString(); string stCtrlParam = m.Groups[2].ToString(); if (stCtrlWord == "cf") { // Set font color. int iColor = Int32.Parse(stCtrlParam); tw.Write(""); // close previous span, and start a new one for the given color. tw.Write(""); } else if (stCtrlWord == "fs") { // Sets font size. ignore } else if (stCtrlWord == "par") { // This is a newline. ignore // @todo- I think the only reason we can ignore this is because the \par in our input are always followed by // a '\r\n' and we're accidentally writing that. } else { throw new ArgumentException("Unrecognized control word '" + stCtrlWord + stCtrlParam + "'after:" + stText); } idx += m.Length; } tw.Write(Escape(rtf.Substring(idx))); // rest of string tw.Write(""); } // end Format() } }