From d16b15f3eb7d5753b60fbc7907e85ca996674bdb Mon Sep 17 00:00:00 2001 From: John McLear Date: Sun, 10 Feb 2013 17:34:34 +0000 Subject: [PATCH] begin support for better txt output --- src/node/handler/ExportHandler.js | 78 ++++- src/node/utils/ExportTxt.js | 477 ++++++++++++++++++++++++++++++ 2 files changed, 543 insertions(+), 12 deletions(-) create mode 100644 src/node/utils/ExportTxt.js diff --git a/src/node/handler/ExportHandler.js b/src/node/handler/ExportHandler.js index 1b7fcc26d..8ff5bc488 100644 --- a/src/node/handler/ExportHandler.js +++ b/src/node/handler/ExportHandler.js @@ -20,6 +20,7 @@ var ERR = require("async-stacktrace"); var exporthtml = require("../utils/ExportHtml"); +var exporttxt = require("../utils/ExportTxt"); var exportdokuwiki = require("../utils/ExportDokuWiki"); var padManager = require("../db/PadManager"); var async = require("async"); @@ -48,22 +49,75 @@ exports.doExport = function(req, res, padId, type) res.attachment(padId + "." + type); //if this is a plain text export, we can do this directly + // We have to over engineer this because tabs are stored as attributes and not plain text + if(type == "txt") { - padManager.getPad(padId, function(err, pad) - { - ERR(err); - if(req.params.rev){ - pad.getInternalRevisionAText(req.params.rev, function(junk, text) - { - res.send(text.text ? text.text : null); - }); - } - else + var txt; + var randNum; + var srcFile, destFile; + + async.series([ + //render the txt document + function(callback) { - res.send(pad.text()); + exporttxt.getPadTXTDocument(padId, req.params.rev, false, function(err, _txt) + { + if(ERR(err, callback)) return; + txt = _txt; + callback(); + }); + }, + //decide what to do with the txt export + function(callback) + { + //if this is a txt export, we can send this from here directly + res.send(txt); + callback("stop"); + }, + //send the convert job to abiword + function(callback) + { + //ensure html can be collected by the garbage collector + txt = null; + + destFile = tempDirectory + "/eplite_export_" + randNum + "." + type; + abiword.convertFile(srcFile, destFile, type, callback); + }, + //send the file + function(callback) + { + res.sendfile(destFile, null, callback); + }, + //clean up temporary files + function(callback) + { + async.parallel([ + function(callback) + { + fs.unlink(srcFile, callback); + }, + function(callback) + { + //100ms delay to accomidate for slow windows fs + if(os.type().indexOf("Windows") > -1) + { + setTimeout(function() + { + fs.unlink(destFile, callback); + }, 100); + } + else + { + fs.unlink(destFile, callback); + } + } + ], callback); } - }); + ], function(err) + { + if(err && err != "stop") ERR(err); + }) } else if(type == 'dokuwiki') { diff --git a/src/node/utils/ExportTxt.js b/src/node/utils/ExportTxt.js new file mode 100644 index 000000000..99f6085e3 --- /dev/null +++ b/src/node/utils/ExportTxt.js @@ -0,0 +1,477 @@ +/** + * Copyright 2009 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS-IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +var async = require("async"); +var Changeset = require("ep_etherpad-lite/static/js/Changeset"); +var padManager = require("../db/PadManager"); +var ERR = require("async-stacktrace"); +var Security = require('ep_etherpad-lite/static/js/security'); +var hooks = require('ep_etherpad-lite/static/js/pluginfw/hooks'); +function getPadPlainText(pad, revNum) +{ + var atext = ((revNum !== undefined) ? pad.getInternalRevisionAText(revNum) : pad.atext()); + var textLines = atext.text.slice(0, -1).split('\n'); + var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text); + var apool = pad.pool(); + + var pieces = []; + for (var i = 0; i < textLines.length; i++) + { + var line = _analyzeLine(textLines[i], attribLines[i], apool); + if (line.listLevel) + { + var numSpaces = line.listLevel * 2 - 1; + var bullet = '*'; + pieces.push(new Array(numSpaces + 1).join(' '), bullet, ' ', line.text, '\n'); + } + else + { + pieces.push(line.text, '\n'); + } + } + + return pieces.join(''); +} + +function getPadTXT(pad, revNum, callback) +{ + var atext = pad.atext; + var html; + async.waterfall([ + // fetch revision atext + + + function (callback) + { + if (revNum != undefined) + { + pad.getInternalRevisionAText(revNum, function (err, revisionAtext) + { + if(ERR(err, callback)) return; + atext = revisionAtext; + callback(); + }); + } + else + { + callback(null); + } + }, + + // convert atext to html + + + function (callback) + { + html = getTXTFromAtext(pad, atext); + callback(null); + }], + // run final callback + + + function (err) + { + if(ERR(err, callback)) return; + callback(null, html); + }); +} + +exports.getPadTXT = getPadTXT; +exports.getTXTFromAtext = getTXTFromAtext; + +function getTXTFromAtext(pad, atext, authorColors) +{ + var apool = pad.apool(); + var textLines = atext.text.slice(0, -1).split('\n'); + var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text); + + var tags = ['h1', 'h2', 'strong', 'em', 'u', 's']; + var props = ['heading1', 'heading2', 'bold', 'italic', 'underline', 'strikethrough']; + var anumMap = {}; + var css = ""; + + props.forEach(function (propName, i) + { + var propTrueNum = apool.putAttrib([propName, true], true); + if (propTrueNum >= 0) + { + anumMap[propTrueNum] = i; + } + }); + + function getLineTXT(text, attribs) + { + var propVals = [false, false, false]; + var ENTER = 1; + var STAY = 2; + var LEAVE = 0; + + // Use order of tags (b/i/u) as order of nesting, for simplicity + // and decent nesting. For example, + // Just bold Bold and italics Just italics + // becomes + // Just bold Bold and italics Just italics + var taker = Changeset.stringIterator(text); + var assem = Changeset.stringAssembler(); + var openTags = []; + + var urls = _findURLs(text); + + var idx = 0; + + function processNextChars(numChars) + { + if (numChars <= 0) + { + return; + } + + var iter = Changeset.opIterator(Changeset.subattribution(attribs, idx, idx + numChars)); + idx += numChars; + + while (iter.hasNext()) + { + var o = iter.next(); + var propChanged = false; + Changeset.eachAttribNumber(o.attribs, function (a) + { + if (a in anumMap) + { + var i = anumMap[a]; // i = 0 => bold, etc. + if (!propVals[i]) + { + propVals[i] = ENTER; + propChanged = true; + } + else + { + propVals[i] = STAY; + } + } + }); + for (var i = 0; i < propVals.length; i++) + { + if (propVals[i] === true) + { + propVals[i] = LEAVE; + propChanged = true; + } + else if (propVals[i] === STAY) + { + propVals[i] = true; // set it back + } + } + // now each member of propVal is in {false,LEAVE,ENTER,true} + // according to what happens at start of span + if (propChanged) + { + // leaving bold (e.g.) also leaves italics, etc. + var left = false; + for (var i = 0; i < propVals.length; i++) + { + var v = propVals[i]; + if (!left) + { + if (v === LEAVE) + { + left = true; + } + } + else + { + if (v === true) + { + propVals[i] = STAY; // tag will be closed and re-opened + } + } + } + + var tags2close = []; + + for (var i = propVals.length - 1; i >= 0; i--) + { + if (propVals[i] === LEAVE) + { + //emitCloseTag(i); + tags2close.push(i); + propVals[i] = false; + } + else if (propVals[i] === STAY) + { + //emitCloseTag(i); + tags2close.push(i); + } + } + + for (var i = 0; i < propVals.length; i++) + { + if (propVals[i] === ENTER || propVals[i] === STAY) + { + emitOpenTag(i); + propVals[i] = true; + } + } + // propVals is now all {true,false} again + } // end if (propChanged) + var chars = o.chars; + if (o.lines) + { + chars--; // exclude newline at end of line, if present + } + + var s = taker.take(chars); + + //removes the characters with the code 12. Don't know where they come + //from but they break the abiword parser and are completly useless + s = s.replace(String.fromCharCode(12), ""); + + assem.append(_encodeWhitespace(Security.escapeHTML(s))); + } // end iteration over spans in line + + var tags2close = []; + for (var i = propVals.length - 1; i >= 0; i--) + { + if (propVals[i]) + { + tags2close.push(i); + propVals[i] = false; + } + } + + } // end processNextChars + if (urls) + { + urls.forEach(function (urlData) + { + var startIndex = urlData[0]; + var url = urlData[1]; + var urlLength = url.length; + processNextChars(startIndex - idx); + console.warn(url); + // assem.append(''); + assem.append(url); + processNextChars(urlLength); + // assem.append(''); + }); + } + processNextChars(text.length - idx); + + return _processSpaces(assem.toString()); + } // end getLineHTML + var pieces = [css]; + + // Need to deal with constraints imposed on HTML lists; can + // only gain one level of nesting at once, can't change type + // mid-list, etc. + // People might use weird indenting, e.g. skip a level, + // so we want to do something reasonable there. We also + // want to deal gracefully with blank lines. + // => keeps track of the parents level of indentation + var lists = []; // e.g. [[1,'bullet'], [3,'bullet'], ...] + for (var i = 0; i < textLines.length; i++) + { + var line = _analyzeLine(textLines[i], attribLines[i], apool); + var lineContent = getLineTXT(line.text, line.aline); + if(line.listTypeName == "bullet"){ + lineContent = "* " + lineContent; // add a bullet + } + if(line.listLevel > 0){ + for (var j = line.listLevel - 1; j >= 0; j--){ + pieces.push('\t'); + } + if(line.listTypeName == "number"){ + pieces.push(line.listLevel + ". "); + // This is bad because it doesn't truly reflect what the user + // sees because browsers do magic on nested
  1. s + } + pieces.push(lineContent, '\n'); + }else{ + console.warn(line); + pieces.push(lineContent, '\n'); + } + + // I'm not too keen about using teh HTML export filters here, they could cause real pain in the future + // I'd suggest supporting getLineTXTForExport + var lineContentFromHook = hooks.callAllStr("getLineHTMLForExport", + { + line: line, + apool: apool, + attribLine: attribLines[i], + text: textLines[i] + }, " ", " ", ""); + if (lineContentFromHook) + { + pieces.push(lineContentFromHook, ''); + } + else + { + // pieces.push(lineContent, '\n'); + } + } + + return pieces.join(''); +} + +function _analyzeLine(text, aline, apool) +{ + var line = {}; + + // identify list + var lineMarker = 0; + line.listLevel = 0; + if (aline) + { + var opIter = Changeset.opIterator(aline); + if (opIter.hasNext()) + { + var listType = Changeset.opAttributeValue(opIter.next(), 'list', apool); + if (listType) + { + lineMarker = 1; + listType = /([a-z]+)([12345678])/.exec(listType); + if (listType) + { + line.listTypeName = listType[1]; + line.listLevel = Number(listType[2]); + } + } + } + } + if (lineMarker) + { + line.text = text.substring(1); + line.aline = Changeset.subattribution(aline, 1); + } + else + { + line.text = text; + line.aline = aline; + } + + return line; +} + +exports.getPadTXTDocument = function (padId, revNum, noDocType, callback) +{ + padManager.getPad(padId, function (err, pad) + { + if(ERR(err, callback)) return; + + getPadTXT(pad, revNum, function (err, html) + { + if(ERR(err, callback)) return; + callback(null, html); + }); + }); +} + +function _encodeWhitespace(s) { + return s.replace(/[^\x21-\x7E\s\t\n\r]/g, function(c) + { + return "&#" +c.charCodeAt(0) + ";" + }); +} + +// copied from ACE +function _processSpaces(s) +{ + var doesWrap = true; + if (s.indexOf("<") < 0 && !doesWrap) + { + // short-cut + return s.replace(/ /g, ' '); + } + var parts = []; + s.replace(/<[^>]*>?| |[^ <]+/g, function (m) + { + parts.push(m); + }); + if (doesWrap) + { + var endOfLine = true; + var beforeSpace = false; + // last space in a run is normal, others are nbsp, + // end of line is nbsp + for (var i = parts.length - 1; i >= 0; i--) + { + var p = parts[i]; + if (p == " ") + { + if (endOfLine || beforeSpace) parts[i] = ' '; + endOfLine = false; + beforeSpace = true; + } + else if (p.charAt(0) != "<") + { + endOfLine = false; + beforeSpace = false; + } + } + // beginning of line is nbsp + for (var i = 0; i < parts.length; i++) + { + var p = parts[i]; + if (p == " ") + { + parts[i] = ' '; + break; + } + else if (p.charAt(0) != "<") + { + break; + } + } + } + else + { + for (var i = 0; i < parts.length; i++) + { + var p = parts[i]; + if (p == " ") + { + parts[i] = ' '; + } + } + } + return parts.join(''); +} + + +// copied from ACE +var _REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/; +var _REGEX_SPACE = /\s/; +var _REGEX_URLCHAR = new RegExp('(' + /[-:@a-zA-Z0-9_.,~%+\/\\?=&#;()$]/.source + '|' + _REGEX_WORDCHAR.source + ')'); +var _REGEX_URL = new RegExp(/(?:(?:https?|s?ftp|ftps|file|smb|afp|nfs|(x-)?man|gopher|txmt):\/\/|mailto:)/.source + _REGEX_URLCHAR.source + '*(?![:.,;])' + _REGEX_URLCHAR.source, 'g'); + +// returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...] + +function _findURLs(text) +{ + _REGEX_URL.lastIndex = 0; + var urls = null; + var execResult; + while ((execResult = _REGEX_URL.exec(text))) + { + urls = (urls || []); + var startIndex = execResult.index; + var url = execResult[0]; + urls.push([startIndex, url]); + } + + return urls; +} +