From d16b15f3eb7d5753b60fbc7907e85ca996674bdb Mon Sep 17 00:00:00 2001
From: John McLear <john@mclear.co.uk>
Date: Sun, 10 Feb 2013 17:34:34 +0000
Subject: [PATCH] begin support for better txt output

---
 src/node/handler/ExportHandler.js |  78 ++++-
 src/node/utils/ExportTxt.js       | 477 ++++++++++++++++++++++++++++++
 2 files changed, 543 insertions(+), 12 deletions(-)
 create mode 100644 src/node/utils/ExportTxt.js

diff --git a/src/node/handler/ExportHandler.js b/src/node/handler/ExportHandler.js
index 1b7fcc26d..8ff5bc488 100644
--- a/src/node/handler/ExportHandler.js
+++ b/src/node/handler/ExportHandler.js
@@ -20,6 +20,7 @@
 
 var ERR = require("async-stacktrace");
 var exporthtml = require("../utils/ExportHtml");
+var exporttxt = require("../utils/ExportTxt");
 var exportdokuwiki = require("../utils/ExportDokuWiki");
 var padManager = require("../db/PadManager");
 var async = require("async");
@@ -48,22 +49,75 @@ exports.doExport = function(req, res, padId, type)
   res.attachment(padId + "." + type);
 
   //if this is a plain text export, we can do this directly
+  // We have to over engineer this because tabs are stored as attributes and not plain text
+
   if(type == "txt")
   {
-    padManager.getPad(padId, function(err, pad)
-    {
-      ERR(err);
-      if(req.params.rev){
-        pad.getInternalRevisionAText(req.params.rev, function(junk, text)
-        {
-          res.send(text.text ? text.text : null);
-        });
-      }
-      else
+    var txt;
+    var randNum;
+    var srcFile, destFile;
+
+    async.series([
+      //render the txt document
+      function(callback)
       {
-        res.send(pad.text());
+        exporttxt.getPadTXTDocument(padId, req.params.rev, false, function(err, _txt)
+        {
+          if(ERR(err, callback)) return;
+          txt = _txt;
+          callback();
+        });
+      },
+      //decide what to do with the txt export
+      function(callback)
+      {
+        //if this is a txt export, we can send this from here directly
+        res.send(txt);
+        callback("stop");
+      },
+      //send the convert job to abiword
+      function(callback)
+      {
+        //ensure html can be collected by the garbage collector
+        txt = null;
+
+        destFile = tempDirectory + "/eplite_export_" + randNum + "." + type;
+        abiword.convertFile(srcFile, destFile, type, callback);
+      },
+      //send the file
+      function(callback)
+      {
+        res.sendfile(destFile, null, callback);
+      },
+      //clean up temporary files
+      function(callback)
+      {
+        async.parallel([
+          function(callback)
+          {
+            fs.unlink(srcFile, callback);
+          },
+          function(callback)
+          {
+            //100ms delay to accommodate slow windows fs
+            if(os.type().indexOf("Windows") > -1)
+            {
+              setTimeout(function()
+              {
+                fs.unlink(destFile, callback);
+              }, 100);
+            }
+            else
+            {
+              fs.unlink(destFile, callback);
+            }
+          }
+        ], callback);
       }
-    });
+    ], function(err)
+    {
+      if(err && err != "stop") ERR(err);
+    })
   }
   else if(type == 'dokuwiki')
   {
diff --git a/src/node/utils/ExportTxt.js b/src/node/utils/ExportTxt.js
new file mode 100644
index 000000000..99f6085e3
--- /dev/null
+++ b/src/node/utils/ExportTxt.js
@@ -0,0 +1,477 @@
+/**
+ * Copyright 2009 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS-IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+var async = require("async");
+var Changeset = require("ep_etherpad-lite/static/js/Changeset");
+var padManager = require("../db/PadManager");
+var ERR = require("async-stacktrace");
+var Security = require('ep_etherpad-lite/static/js/security');
+var hooks = require('ep_etherpad-lite/static/js/pluginfw/hooks');
+function getPadPlainText(pad, revNum) // returns the pad text as one string, list lines rendered as indented "*" bullets
+{
+  var atext = ((revNum !== undefined) ? pad.getInternalRevisionAText(revNum) : pad.atext());
+  var textLines = atext.text.slice(0, -1).split('\n'); // drop the final character, then split into lines
+  var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text);
+  var apool = pad.pool();
+  // NOTE(review): getPadTXT reads pad.atext as a property and passes a callback to getInternalRevisionAText, and getTXTFromAtext uses pad.apool() — the forms above differ; confirm which are valid
+  var pieces = [];
+  for (var i = 0; i < textLines.length; i++)
+  {
+    var line = _analyzeLine(textLines[i], attribLines[i], apool);
+    if (line.listLevel)
+    {
+      var numSpaces = line.listLevel * 2 - 1; // 1, 3, 5, ... spaces for levels 1, 2, 3, ...
+      var bullet = '*';
+      pieces.push(new Array(numSpaces + 1).join(' '), bullet, ' ', line.text, '\n'); // joining n+1 empty slots with ' ' yields n spaces
+    }
+    else
+    {
+      pieces.push(line.text, '\n'); // non-list line: text followed by a newline
+    }
+  }
+
+  return pieces.join('');
+}
+
+function getPadTXT(pad, revNum, callback) // calls callback(err, text) with the plain-text rendering of pad, optionally at revision revNum
+{
+  var atext = pad.atext; // default: the pad's current atext, used when no revision is requested
+  var html; // holds the rendered plain text (presumably the name is a leftover from ExportHtml)
+  async.waterfall([
+  // Step 1: when revNum is neither null nor undefined, replace atext with
+  // the atext of that revision; otherwise keep the current one.
+  // A lookup error is handed to ERR together with the step callback.
+  function (callback)
+  {
+    if (revNum != undefined)
+    {
+      pad.getInternalRevisionAText(revNum, function (err, revisionAtext)
+      {
+        if(ERR(err, callback)) return;
+        atext = revisionAtext;
+        callback();
+      });
+    }
+    else
+    {
+      callback(null);
+    }
+  },
+
+  // Step 2: render the chosen atext to plain text with getTXTFromAtext.
+  // The call is synchronous; the result is stored in `html` and the
+  // waterfall then continues.
+  function (callback)
+  {
+    html = getTXTFromAtext(pad, atext);
+    callback(null);
+  }],
+  // Final callback: if ERR reports an error we stop here, otherwise the
+  // caller's callback receives (null, text).
+  // NOTE(review): ERR is async-stacktrace — confirm it invokes callback(err) itself.
+  function (err)
+  {
+    if(ERR(err, callback)) return;
+    callback(null, html);
+  });
+}
+
+exports.getPadTXT = getPadTXT; // asynchronous: (pad, revNum, callback)
+exports.getTXTFromAtext = getTXTFromAtext; // synchronous: (pad, atext, authorColors), returns a string
+
+function getTXTFromAtext(pad, atext, authorColors)
+{
+  // Renders the attributed text `atext` of `pad` as plain text, one output line
+  // per pad line; list lines are tab-indented and prefixed with "* " or "<level>. ".
+  // `authorColors` is accepted but not used.  Returns the rendered text as a string.
+  var apool = pad.apool();
+  var textLines = atext.text.slice(0, -1).split('\n');
+  var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text);
+  var props = ['heading1', 'heading2', 'bold', 'italic', 'underline', 'strikethrough'];
+  var anumMap = {}; // attribute number in apool -> index into props
+
+  props.forEach(function (propName, i)
+  {
+    var propTrueNum = apool.putAttrib([propName, true], true);
+    if (propTrueNum >= 0)
+    {
+      anumMap[propTrueNum] = i;
+    }
+  });
+
+  function getLineTXT(text, attribs)
+  {
+    var propVals = [false, false, false];
+    var ENTER = 1;
+    var STAY = 2;
+    var LEAVE = 0;
+
+    // propVals tracks, per entry of props, whether that attribute is active
+    // on the current span (false / true) or changing at its start
+    // (ENTER / STAY / LEAVE).  The bookkeeping is kept from the tag-based
+    // logic, but no markup is emitted: plain text cannot express bold,
+    // italic, etc., so only the characters themselves are written out.
+    var taker = Changeset.stringIterator(text);
+    var assem = Changeset.stringAssembler();
+    var openTags = [];
+
+    var urls = _findURLs(text);
+
+    var idx = 0;
+
+    function processNextChars(numChars)
+    {
+      if (numChars <= 0)
+      {
+        return;
+      }
+
+      var iter = Changeset.opIterator(Changeset.subattribution(attribs, idx, idx + numChars));
+      idx += numChars;
+
+      while (iter.hasNext())
+      {
+        var o = iter.next();
+        var propChanged = false;
+        Changeset.eachAttribNumber(o.attribs, function (a)
+        {
+          if (a in anumMap)
+          {
+            var i = anumMap[a]; // index into props (0 => heading1, etc.)
+            if (!propVals[i])
+            {
+              propVals[i] = ENTER;
+              propChanged = true;
+            }
+            else
+            {
+              propVals[i] = STAY;
+            }
+          }
+        });
+        for (var i = 0; i < propVals.length; i++)
+        {
+          if (propVals[i] === true)
+          {
+            propVals[i] = LEAVE;
+            propChanged = true;
+          }
+          else if (propVals[i] === STAY)
+          {
+            propVals[i] = true; // set it back
+          }
+        }
+        // now each member of propVal is in {false,LEAVE,ENTER,true}
+        // according to what happens at start of span
+        if (propChanged)
+        {
+          // leaving bold (e.g.) also leaves italics, etc.
+          var left = false;
+          for (var i = 0; i < propVals.length; i++)
+          {
+            var v = propVals[i];
+            if (!left)
+            {
+              if (v === LEAVE)
+              {
+                left = true;
+              }
+            }
+            else
+            {
+              if (v === true)
+              {
+                propVals[i] = STAY; // would be closed and re-opened if markup were emitted
+              }
+            }
+          }
+
+          var tags2close = [];
+
+          for (var i = propVals.length - 1; i >= 0; i--)
+          {
+            if (propVals[i] === LEAVE)
+            {
+              // property ends here; nothing to close in plain text
+              tags2close.push(i);
+              propVals[i] = false;
+            }
+            else if (propVals[i] === STAY)
+            {
+              // property continues; nothing to close in plain text
+              tags2close.push(i);
+            }
+          }
+
+          for (var i = 0; i < propVals.length; i++)
+          {
+            if (propVals[i] === ENTER || propVals[i] === STAY)
+            {
+              // no opening markup exists for plain text; just mark the property active
+              propVals[i] = true;
+            }
+          }
+          // propVals is now all {true,false} again
+        } // end if (propChanged)
+        var chars = o.chars;
+        if (o.lines)
+        {
+          chars--; // exclude newline at end of line, if present
+        }
+
+        var s = taker.take(chars);
+
+        // strip every form-feed character (code 12); String.replace with a
+        // string pattern would only remove the first occurrence
+        s = s.replace(/\x0c/g, "");
+        // plain text output: append the characters as-is, without HTML escaping
+        assem.append(s);
+      } // end iteration over spans in line
+
+      var tags2close = [];
+      for (var i = propVals.length - 1; i >= 0; i--)
+      {
+        if (propVals[i])
+        {
+          tags2close.push(i);
+          propVals[i] = false;
+        }
+      }
+
+    } // end processNextChars
+    if (urls)
+    {
+      urls.forEach(function (urlData)
+      {
+        var startIndex = urlData[0];
+        var url = urlData[1];
+        var urlLength = url.length;
+        processNextChars(startIndex - idx);
+        // The URL is part of the line text, so consuming its characters with
+        // processNextChars emits it exactly once; appending `url` separately
+        // as well would write it to the output twice.
+        processNextChars(urlLength);
+        // (no link markup is added in plain text)
+      });
+    }
+    processNextChars(text.length - idx);
+
+    return assem.toString();
+  } // end getLineTXT
+  var pieces = [];
+
+  // Emit the pad line by line.  List items are indented with one tab per
+  // nesting level; bullet items get a "* " prefix and numbered items a
+  // "<level>. " prefix.  Non-list lines are emitted unchanged.
+  // People might use weird indenting, e.g. skip a level, and we also
+  // want to deal gracefully with blank lines; `lists` is reserved for
+  // that but is not used yet.
+  // => keeps track of the parents level of indentation
+  var lists = []; // e.g. [[1,'bullet'], [3,'bullet'], ...]
+  for (var i = 0; i < textLines.length; i++)
+  {
+    var line = _analyzeLine(textLines[i], attribLines[i], apool);
+    var lineContent = getLineTXT(line.text, line.aline);
+    if(line.listTypeName == "bullet"){
+      lineContent = "* " + lineContent; // add a bullet
+    }
+    if(line.listLevel > 0){
+      for (var j = line.listLevel - 1; j >= 0; j--){
+        pieces.push('\t');
+      }
+      if(line.listTypeName == "number"){
+        pieces.push(line.listLevel + ". ");
+        // NOTE: this prints the nesting level, not the item's position in the
+        // list, so it does not reflect the numbering the user sees in the editor
+      }
+      pieces.push(lineContent, '\n');
+    }else{
+      // not a list item: emit the line content unchanged
+      pieces.push(lineContent, '\n');
+    }
+
+    // I'm not too keen about using the HTML export filters here, they could cause real pain in the future
+    // I'd suggest supporting getLineTXTForExport
+    var lineContentFromHook = hooks.callAllStr("getLineHTMLForExport",
+    {
+      line: line,
+      apool: apool,
+      attribLine: attribLines[i],
+      text: textLines[i]
+    }, " ", " ", "");
+    if (lineContentFromHook)
+    {
+      pieces.push(lineContentFromHook, '');
+    }
+    else
+    {
+      // pieces.push(lineContent, '\n');
+    }
+  }
+
+  return pieces.join('');
+}
+
+function _analyzeLine(text, aline, apool)
+{
+  var line = {};
+  // Result fields: listLevel (0 when not a list), listTypeName (only set when the list type parses), text, aline.
+  // identify list: look at the 'list' attribute of the line's first op
+  var lineMarker = 0;
+  line.listLevel = 0;
+  if (aline)
+  {
+    var opIter = Changeset.opIterator(aline);
+    if (opIter.hasNext())
+    {
+      var listType = Changeset.opAttributeValue(opIter.next(), 'list', apool);
+      if (listType)
+      {
+        lineMarker = 1; // a 'list' value is present, so the first character is stripped below
+        listType = /([a-z]+)([12345678])/.exec(listType); // e.g. "bullet2" -> name "bullet", level 2 (levels 1-8 only)
+        if (listType)
+        {
+          line.listTypeName = listType[1];
+          line.listLevel = Number(listType[2]);
+        }
+      }
+    }
+  }
+  if (lineMarker)
+  {
+    line.text = text.substring(1); // drop the first character of the text ...
+    line.aline = Changeset.subattribution(aline, 1); // ... and the matching first position of the attribution
+  }
+  else
+  {
+    line.text = text;
+    line.aline = aline;
+  }
+
+  return line;
+}
+
+exports.getPadTXTDocument = function (padId, revNum, noDocType, callback) // callback(err, text); noDocType is not used
+{
+  padManager.getPad(padId, function (err, pad)
+  {
+    if(ERR(err, callback)) return;
+    // pad loaded: render it (at revNum, if given) and pass the text on to the caller
+    getPadTXT(pad, revNum, function (err, html)
+    {
+      if(ERR(err, callback)) return;
+      callback(null, html); // `html` holds the plain-text result of getPadTXT
+    });
+  });
+}
+
+function _encodeWhitespace(s) { // despite the name, whitespace is left alone: only characters outside printable ASCII (0x21-0x7E) that are not whitespace are replaced
+  return s.replace(/[^\x21-\x7E\s\t\n\r]/g, function(c)
+  {
+    return "&#" +c.charCodeAt(0) + ";" // decimal numeric entity built from the character's code unit
+  });
+}
+
+// copied from ACE: replaces selected spaces in s with '&nbsp;' and returns the result
+function _processSpaces(s)
+{
+  var doesWrap = true; // hard-coded, so the short-cut below and the final else branch never run
+  if (s.indexOf("<") < 0 && !doesWrap)
+  {
+    // short-cut: no "<" in the string and no wrapping, so every space becomes nbsp
+    return s.replace(/ /g, '&nbsp;');
+  }
+  var parts = [];
+  s.replace(/<[^>]*>?| |[^ <]+/g, function (m) // tokenize into "<...>" tags, single spaces and runs of other characters
+  {
+    parts.push(m);
+  });
+  if (doesWrap)
+  {
+    var endOfLine = true;
+    var beforeSpace = false;
+    // last space in a run is normal, others are nbsp,
+    // end of line is nbsp (scanning right to left)
+    for (var i = parts.length - 1; i >= 0; i--)
+    {
+      var p = parts[i];
+      if (p == " ")
+      {
+        if (endOfLine || beforeSpace) parts[i] = '&nbsp;';
+        endOfLine = false;
+        beforeSpace = true;
+      }
+      else if (p.charAt(0) != "<") // text token resets both flags; tag tokens leave them untouched
+      {
+        endOfLine = false;
+        beforeSpace = false;
+      }
+    }
+    // beginning of line is nbsp: only a space that precedes any text token is affected
+    for (var i = 0; i < parts.length; i++)
+    {
+      var p = parts[i];
+      if (p == " ")
+      {
+        parts[i] = '&nbsp;';
+        break;
+      }
+      else if (p.charAt(0) != "<")
+      {
+        break;
+      }
+    }
+  }
+  else
+  {
+    for (var i = 0; i < parts.length; i++) // non-wrapping mode: every space token becomes nbsp
+    {
+      var p = parts[i];
+      if (p == " ")
+      {
+        parts[i] = '&nbsp;';
+      }
+    }
+  }
+  return parts.join('');
+}
+
+
+// copied from ACE: character classes and the global URL regex that _findURLs scans with
+var _REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;
+var _REGEX_SPACE = /\s/;
+var _REGEX_URLCHAR = new RegExp('(' + /[-:@a-zA-Z0-9_.,~%+\/\\?=&#;()$]/.source + '|' + _REGEX_WORDCHAR.source + ')');
+var _REGEX_URL = new RegExp(/(?:(?:https?|s?ftp|ftps|file|smb|afp|nfs|(x-)?man|gopher|txmt):\/\/|mailto:)/.source + _REGEX_URLCHAR.source + '*(?![:.,;])' + _REGEX_URLCHAR.source, 'g');
+
+// Collects every match of _REGEX_URL in text, in order of appearance.
+// returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...]
+function _findURLs(text)
+{
+  _REGEX_URL.lastIndex = 0; // the regex is global and shared, so restart scanning from the beginning
+  var urls = null;
+  var execResult;
+  while ((execResult = _REGEX_URL.exec(text)))
+  {
+    urls = (urls || []); // create the array lazily on the first match
+    var startIndex = execResult.index;
+    var url = execResult[0];
+    urls.push([startIndex, url]);
+  }
+
+  return urls;
+}
+