/*
Parsing from mangini/gdocs2md.
Modified by clearf to add files to the google directory structure.
Modified by lmmx to write Rmarkdown, with emphasis on chunks rather than
HTML-rendering code.

Usage:
  Adding this script to your doc:
    - Tools > Script Manager > New
    - Select "Blank Project", then paste this code in and save.
  Running the script:
    - Tools > Script Manager
    - Select "convertDocumentToRmarkdown" function.
    - Click Run button.
    - Converted doc will be added to a "Rmarkdown" folder in the source
      document's directories.
    - Images will be added to a subfolder of the "Rmarkdown" folder.

NOTE(review): the copy of this file that was reviewed had lost every span that
looked like an HTML tag (the <pre>/<table> string literals, the tail of
convertDocumentToRmarkdown, the head of escapeHTML, the tail of
processParagraph and the head of findPrefix). Those spans are reconstructed
below and are individually marked with NOTE(review); confirm them against the
upstream gdocs2md source before relying on them.
*/

/**
 * Add-on install hook: builds the menu right after installation.
 *
 * @param {Object} e Install event supplied by Apps Script.
 */
function onInstall(e) {
  onOpen(e);
}

/**
 * Document-open hook: records the active document/folder ids and adds the
 * "Rmarkdown" menu with the two export entries.
 */
function onOpen() {
  setupScript();
  // In future:
  // DocumentApp.getUi().createAddonMenu();
  DocumentApp.getUi().createMenu('Rmarkdown')
      .addItem('Export \u2192 Rmd', 'convertSingleDoc')
      .addItem('Export folder \u2192 Rmd', 'convertFolder')
      .addToUi();
}

/**
 * Stores the ids the export functions need as script properties:
 * "document_id", "folder_id" and "image_folder_prefix".
 *
 * Manual way to do the same:
 *   scriptProperties.setProperty("folder_id", "INSERT_FOLDER_ID_HERE");
 *   scriptProperties.setProperty("document_id", "INSERT_FILE_ID_HERE");
 */
function setupScript() {
  var scriptProperties = PropertiesService.getScriptProperties();

  var doc_id = DocumentApp.getActiveDocument().getId();
  scriptProperties.setProperty("document_id", doc_id);

  // As before, the last parent returned by the iterator wins.
  var folders = DriveApp.getFileById(doc_id).getParents();
  var folder_id;
  while (folders.hasNext()) {
    folder_id = folders.next().getId();
  }
  // A document without any parent folder used to store an undefined id.
  if (folder_id !== undefined) {
    scriptProperties.setProperty("folder_id", folder_id);
  }

  scriptProperties.setProperty("image_folder_prefix", "/images/");
}

/**
 * Lists the comments of the document stored under "document_id".
 *
 * @param {Object} [comment_list_args] Optional switches:
 *     images          - truthy: return only the image URLs (png/gif/jpg)
 *                       found in comments instead of the comment texts.
 *     include_deleted - truthy: also ask the Drive API for deleted comments.
 * @return {Array} Comment texts, or image URLs when `images` is set.
 */
function getDocComments(comment_list_args) {
  // Missing switches default to false; no eval is needed for that.
  var args = comment_list_args || {};
  var images = Boolean(args.images);
  var include_deleted = Boolean(args.include_deleted);

  var document_id = PropertiesService.getScriptProperties().getProperty("document_id");
  // maxResults: 0 to 100, default 20.
  // See https://developers.google.com/drive/v2/reference/comments/list for all options
  var comments_list = Drive.Comments.list(document_id,
      {includeDeleted: include_deleted, maxResults: 100});
  var items = (comments_list && comments_list.items) || [];

  var image_url_pattern = /(https?:\/\/.+?\.(png|gif|jpe?g))/;
  var comment_array = [];
  for (var i = 0; i < items.length; i++) {
    var comment_text = items[i].content;
    if (!images) {
      comment_array.push(comment_text);
      continue;
    }
    var match = image_url_pattern.exec(comment_text);
    if (match) {
      comment_array.push(match[1]);
    }
    // otherwise there's no image URL here, skip it
  }
  return comment_array;
}

/**
 * Shorthand for getDocComments({images: true}) (handy for testing).
 *
 * @return {Array} Image URLs found in the document comments.
 */
function getImageComments() {
  return getDocComments({images: true});
}

/**
 * Returns the "Rmarkdown" sub-folder of `source_folder`, creating it when it
 * does not exist yet.
 */
function getOrCreateRmarkdownFolder_(source_folder) {
  var existing = source_folder.getFoldersByName("Rmarkdown");
  if (existing.hasNext()) {
    return existing.next();
  }
  return source_folder.createFolder("Rmarkdown");
}

/**
 * Menu action: exports the document stored under "document_id" into the
 * "Rmarkdown" folder next to it.
 */
function convertSingleDoc() {
  var scriptProperties = PropertiesService.getScriptProperties();

  // Renew the comments list on every export.
  var doc_comments = getDocComments();
  var image_urls = getDocComments({images: true});
  // NOTE(review): script properties hold strings; these arrays are stored
  // unchanged from the original code - confirm how readers expect them encoded.
  scriptProperties.setProperty("comments", doc_comments);
  scriptProperties.setProperty("image_srcs", image_urls);

  var folder_id = scriptProperties.getProperty("folder_id");
  var document_id = scriptProperties.getProperty("document_id");
  var Rmarkdown_folder = getOrCreateRmarkdownFolder_(DriveApp.getFolderById(folder_id));

  convertDocumentToRmarkdown(DocumentApp.openById(document_id), Rmarkdown_folder);
}

/**
 * Menu action: exports every Google Doc in the folder stored under
 * "folder_id" whose .Rmd is missing, duplicated, or older than the doc.
 */
function convertFolder() {
  var scriptProperties = PropertiesService.getScriptProperties();
  var folder_id = scriptProperties.getProperty("folder_id");
  var source_folder = DriveApp.getFolderById(folder_id);
  var Rmarkdown_folder = getOrCreateRmarkdownFolder_(source_folder);

  // Only try to convert google docs files.
  var gdoc_files = source_folder.getFilesByType("application/vnd.google-apps.document");
  while (gdoc_files.hasNext()) {
    var gdoc_file = gdoc_files.next();
    var Rmd_files = Rmarkdown_folder.getFilesByName(gdoc_file.getName() + ".Rmd");

    // No previous export: convert.
    var update_file = true;
    if (Rmd_files.hasNext()) {
      var Rmd_file = Rmd_files.next();
      // Several exports with the same name, or a stale export: convert again.
      update_file = Rmd_files.hasNext() ||
          Rmd_file.getLastUpdated() < gdoc_file.getLastUpdated();
    }

    if (update_file) {
      convertDocumentToRmarkdown(DocumentApp.openById(gdoc_file.getId()), Rmarkdown_folder);
    }
  }
}

/**
 * Moves every item of a Drive file/folder iterator to the trash.
 */
function trashAll_(iterator) {
  while (iterator.hasNext()) {
    iterator.next().setTrashed(true);
  }
}

/**
 * Converts one document to Rmarkdown and stores the result.
 *
 * Paragraph markers understood (see processParagraph): "--- src" /
 * "--- source code" and "--- srcp" / "--- source pretty" open a <pre> block
 * closed by "---"; "~~~ [options]" opens an R chunk closed by "~~~".
 *
 * @param {Document} document Document to convert.
 * @param {Folder} destination_folder Folder receiving "<name>.Rmd" and, when
 *     the document has inline images, the "<name>_images" sub-folder.
 */
function convertDocumentToRmarkdown(document, destination_folder) {
  var scriptProperties = PropertiesService.getScriptProperties();
  var image_prefix = scriptProperties.getProperty("image_folder_prefix");
  var section = document.getActiveSection();
  var numChildren = section.getNumChildren();
  var text = "";
  var Rmd_filename = document.getName() + ".Rmd";
  var image_foldername = document.getName() + "_images";
  var inSrc = false;
  var inChunk = false;
  var globalImageCounter = 0;
  var globalListCounters = {};
  // edbacher: added a variable for indent in src <pre> block. Let style sheet do margin.
  var srcIndent = "";

  // Image blobs collected while walking the document.
  var files = [];

  // Walk through all the child elements of the doc.
  for (var i = 0; i < numChildren; i++) {
    var child = section.getChild(i);
    var result = processParagraph(i, child, inSrc, inChunk, globalImageCounter,
        globalListCounters, image_prefix + image_foldername);
    globalImageCounter += (result && result.images) ? result.images.length : 0;

    if (result !== null) {
      if (result.sourcePretty === "start" && !inSrc) {
        inSrc = true;
        text += "<pre class=\"prettyprint\">\n";
      } else if (result.sourcePretty === "end" && inSrc) {
        inSrc = false;
        text += "</pre>\n\n";
      } else if (result.source === "start" && !inSrc) {
        inSrc = true;
        text += "<pre>\n";
      } else if (result.source === "end" && inSrc) {
        inSrc = false;
        text += "</pre>\n\n";
      } else if (result.inChunk === "start" && !inChunk) {
        inChunk = true;
        if (result.className === '') {
          text += "```{r}\n";
        } else {
          text += "```{r " + result.className + "}\n";
        }
      } else if (result.inChunk === "end" && inChunk) {
        inChunk = false;
        text += "```\n\n";
      } else if (inChunk) {
        text += result.text + "\n";
      } else if (inSrc) {
        text += (srcIndent + escapeHTML(result.text) + "\n");
      } else if (result.text && result.text.length > 0) {
        text += result.text + "\n\n";
      }

      if (result.images && result.images.length > 0) {
        for (var j = 0; j < result.images.length; j++) {
          files.push({"blob": result.images[j].blob});
        }
      }
    } else if (inSrc) {
      // NOTE(review): reconstructed - keeps empty lines inside source blocks.
      text += '\n';
    }
  }

  // NOTE(review): everything below was lost in the reviewed copy and is
  // reconstructed from the variables declared above and the usage notes in the
  // file header (Rmd in the destination folder, images in a sub-folder).
  // Replace any previous export of this document.
  trashAll_(destination_folder.getFilesByName(Rmd_filename));
  trashAll_(destination_folder.getFoldersByName(image_foldername));

  destination_folder.createFile(Rmd_filename, text, "text/plain");

  if (files.length > 0) {
    var image_folder = destination_folder.createFolder(image_foldername);
    for (var k = 0; k < files.length; k++) {
      image_folder.createFile(files[k].blob);
    }
  }
}

/**
 * Escapes "<" and ">" so the text can sit inside a <pre> block.
 *
 * @param {string} text Raw text.
 * @return {string} Text with "<" and ">" replaced by entities.
 */
function escapeHTML(text) {
  return text.replace(/</g, '&lt;').replace(/>/g, '&gt;');
}

/**
 * Replaces typographic quotes with plain ASCII quotes.
 *
 * Only U+2018/U+2019 and U+201C/U+201D are replaced. The former \u8216,
 * \u8217, \u8220 and \u8221 alternatives were the decimal code points written
 * as hex escapes; they matched CJK ideographs and corrupted such text.
 *
 * @param {string} text Text possibly containing curly quotes.
 * @return {string} Text with straight quotes.
 */
function standardQMarks(text) {
  return text.replace(/[\u2018\u2019]/g, "'").replace(/[\u201c\u201d]/g, '"');
}

/**
 * Processes one child element of the document body (not just paragraphs).
 *
 * @param {number} index Position of the element (used in error messages).
 * @param {Element} element The element to convert.
 * @param {boolean} inSrc True while inside a "--- src" / "--- srcp" block.
 * @param {boolean} inChunk True while inside a "~~~" R chunk.
 * @param {number} imageCounter Number of images seen so far (names images).
 * @param {Object} listCounters Running counters of ordered lists (mutated).
 * @param {string} image_path Path prefix used in the image links.
 * @return {Object} null for an element without children, otherwise an object
 *     with some of: text, images ([{blob}]), source / sourcePretty
 *     ("start"/"end"), inChunk ("start"/"end") and className (chunk options).
 */
function processParagraph(index, element, inSrc, inChunk, imageCounter, listCounters, image_path) {
  // First, check for things that require no processing.
  if (element.getNumChildren() == 0) {
    return null;
  }
  // Skip on TOC.
  if (element.getType() === DocumentApp.ElementType.TABLE_OF_CONTENTS) {
    return {"text": "[[TOC]]"};
  }

  // Set up for real results.
  var result = {};
  var pOut = "";
  var textElements = [];
  var imagePrefix = "image_";
  var i;

  // Handle Table elements. Pretty simple-minded now, but works for simple tables.
  // Note that Rmarkdown does not process within block-level HTML, so it probably
  // doesn't make sense to add markup within tables.
  // NOTE(review): the tag literals were stripped from the reviewed copy and are
  // restored from the upstream gdocs2md layout.
  if (element.getType() === DocumentApp.ElementType.TABLE) {
    textElements.push("<table>\n");
    var nCols = element.getChild(0).getNumCells();
    for (i = 0; i < element.getNumChildren(); i++) {
      textElements.push("  <tr>\n");
      // process this row
      for (var j = 0; j < nCols; j++) {
        textElements.push("    <td>" + element.getChild(i).getChild(j).getText() + "</td>\n");
      }
      textElements.push("  </tr>\n");
    }
    textElements.push("</table>\n");
  }

  // Process various types (ElementType).
  for (i = 0; i < element.getNumChildren(); i++) {
    var childElement = element.getChild(i);
    var t = childElement.getType();

    if (t === DocumentApp.ElementType.TABLE_ROW) {
      // do nothing: already handled TABLE_ROW
    } else if (t === DocumentApp.ElementType.TEXT) {
      pOut += childElement.getText();
      textElements.push(childElement);
    } else if (t === DocumentApp.ElementType.INLINE_IMAGE) {
      result.images = result.images || [];
      var blob = childElement.getBlob();
      var contentType = blob.getContentType();
      var extension = "";
      if (/\/png$/.test(contentType)) {
        extension = ".png";
      } else if (/\/gif$/.test(contentType)) {
        extension = ".gif";
      } else if (/\/jpe?g$/.test(contentType)) {
        extension = ".jpg";
      } else {
        throw new Error("Unsupported image type: " + contentType);
      }
      var name = imagePrefix + imageCounter + extension;
      blob.setName(name);
      imageCounter++;
      textElements.push('![](' + image_path + '/' + name + ')');
      result.images.push({"blob": blob});
    } else if (t === DocumentApp.ElementType.PAGE_BREAK) {
      // ignore
    } else if (t === DocumentApp.ElementType.HORIZONTAL_RULE) {
      textElements.push('* * *\n');
    } else if (t === DocumentApp.ElementType.FOOTNOTE) {
      textElements.push(' (' + childElement.getFootnoteContents().getText() + ')');
    } else {
      throw new Error("Paragraph " + index + " of type " + element.getType() +
          " has an unsupported child: " + t + " " +
          (childElement["getText"] ? childElement.getText() : '') + " index=" + index);
    }
  }

  if (textElements.length == 0) {
    // Nothing convertible was found; result is still empty here.
    return result;
  }

  // evb: Add source pretty too. (And abbreviations: src and srcp.)
  // The order of these checks matters: a bare "~~~" closes a chunk only while
  // one is open, otherwise it opens a chunk without options.
  var chunkStart;
  if (/^\s*---\s+srcp\s*$/.test(pOut) || /^\s*---\s+source pretty\s*$/.test(pOut)) {
    result.sourcePretty = "start";
  } else if (/^\s*---\s+src\s*$/.test(pOut) || /^\s*---\s+source code\s*$/.test(pOut)) {
    result.source = "start";
  } else if (/^\s*~~~\s*$/.test(pOut) && inChunk) {
    result.inChunk = "end";
  } else if ((chunkStart = /^\s*~~~\s*(.*)\s*$/.exec(pOut)) !== null) {
    result.inChunk = "start";
    result.className = standardQMarks(chunkStart[1]);
  } else if (/^\s*---\s*$/.test(pOut)) {
    result.source = "end";
    result.sourcePretty = "end";
  } else if (/^\s*---\s+jsperf\s*([^ ]+)\s*$/.test(pOut)) {
    result.text = '';
  } else {
    var prefix = findPrefix(inSrc, element, listCounters);

    // NOTE(review): the rest of this branch was lost in the reviewed copy and
    // is reconstructed. Inline code marking is suppressed inside R chunks as
    // well as inside source blocks so chunk lines stay valid R.
    var converted = "";
    for (i = 0; i < textElements.length; i++) {
      converted += processTextElement(inSrc || inChunk, textElements[i]);
    }
    // NOTE(review): upstream only replaced the first pair of curly double
    // quotes here; standardQMarks replaces every curly quote - confirm.
    result.text = prefix + standardQMarks(converted);
  }

  return result;
}

/**
 * Computes the markdown prefix of a paragraph or list item: "#"s for
 * headings, indentation plus "* " or "<n>. " for list items. Returns "" while
 * inside a source block.
 *
 * @param {boolean} inSrc True while inside a source block.
 * @param {Element} element Paragraph or list item.
 * @param {Object} listCounters Counters keyed by "<listId>.<nestingLevel>";
 *     incremented for every ordered list item.
 * @return {string} The prefix.
 */
function findPrefix(inSrc, element, listCounters) {
  var prefix = "";
  if (inSrc) {
    return prefix;
  }

  // NOTE(review): the heading and nesting part was lost in the reviewed copy
  // and is reconstructed from upstream gdocs2md; only the glyph-type part
  // below survived.
  if (element.getType() === DocumentApp.ElementType.PARAGRAPH) {
    var headings = [
      DocumentApp.ParagraphHeading.HEADING1,
      DocumentApp.ParagraphHeading.HEADING2,
      DocumentApp.ParagraphHeading.HEADING3,
      DocumentApp.ParagraphHeading.HEADING4,
      DocumentApp.ParagraphHeading.HEADING5,
      DocumentApp.ParagraphHeading.HEADING6
    ];
    var heading = element.getHeading();
    for (var level = 0; level < headings.length; level++) {
      if (heading === headings[level]) {
        // One "#" per heading level, then a space.
        prefix += new Array(level + 2).join("#") + " ";
        break;
      }
    }
  } else if (element.getType() === DocumentApp.ElementType.LIST_ITEM) {
    var listItem = element;
    var nesting = listItem.getNestingLevel();
    for (var i = 0; i < nesting; i++) {
      prefix += "    ";
    }
    var gt = listItem.getGlyphType();
    // Bullet list (<ul>):
    if (gt === DocumentApp.GlyphType.BULLET ||
        gt === DocumentApp.GlyphType.HOLLOW_BULLET ||
        gt === DocumentApp.GlyphType.SQUARE_BULLET) {
      prefix += "* ";
    } else {
      // Ordered list (<ol>):
      var key = listItem.getListId() + '.' + listItem.getNestingLevel();
      var counter = (listCounters[key] || 0) + 1;
      listCounters[key] = counter;
      prefix += counter + ". ";
    }
  }
  return prefix;
}

/**
 * True when `font` names the font that marks inline code. getFontFamily()
 * returns the font name as a string; the second comparison keeps the legacy
 * enum-style check of the original code working.
 */
function isCodeFont_(font) {
  return Boolean(font) && (font === "Courier New" || font === font.COURIER_NEW);
}

/**
 * Re-applies one attribute as contiguous runs. Walks the attribute indices
 * from the end; consecutive pieces sharing the same truthy value are merged
 * into one run before `applyValue(start, endInclusive, value)` is called.
 */
function mergeAttributeRuns_(indices, textLength, getValue, applyValue) {
  var runEnd = textLength;
  for (var i = indices.length - 1; i >= 0; i--) {
    var runStart = indices[i];
    var value = getValue(runStart);
    if (value) {
      // Detect values that are in multiple pieces because of formatting noise.
      while (i >= 1 && getValue(indices[i - 1]) === value) {
        i -= 1;
        runStart = indices[i];
      }
      applyValue(runStart, runEnd - 1, value);
    }
    runEnd = runStart;
  }
}

/**
 * Markers to emit before the piece starting at `start`: a marker opens only
 * where its attribute is set at `start` and not on the character before.
 */
function openingMarkers_(txt, start, inSrc) {
  var atStart = (start === 0);
  var font = txt.getFontFamily(start);
  var markers = "";
  if (!inSrc && isCodeFont_(font) && (atStart || txt.getFontFamily(start - 1) !== font)) {
    markers = '`';
  }
  if (txt.isBold(start) && (atStart || !txt.isBold(start - 1))) {
    markers += "**";
  }
  if (txt.isItalic(start) && (atStart || !txt.isItalic(start - 1))) {
    markers += "*";
  }
  return markers;
}

/**
 * Markers to emit after the piece ending just before `end`: the mirror image
 * of openingMarkers_, so nested markers close in reverse order.
 */
function closingMarkers_(txt, end, textLength, inSrc) {
  var atEnd = (end === textLength);
  var last = end - 1;
  var font = txt.getFontFamily(last);
  var markers = "";
  if (!inSrc && isCodeFont_(font) && (atEnd || txt.getFontFamily(end) !== font)) {
    markers = '`';
  }
  if (txt.isBold(last) && (atEnd || !txt.isBold(end))) {
    markers = "**" + markers;
  }
  if (txt.isItalic(last) && (atEnd || !txt.isItalic(end))) {
    // edbacher: changed this to handle bold italic properly.
    markers = "*" + markers;
  }
  return markers;
}

/**
 * Converts one text element to markdown: Courier New becomes `code` (outside
 * source blocks), bold becomes **bold**, italic becomes *italic* and links
 * become [text](url).
 *
 * @param {boolean} inSrc True to suppress inline code marking.
 * @param {(Text|string)} txt Text element, or an already converted string.
 * @return {string} The markdown text.
 */
function processTextElement(inSrc, txt) {
  if (typeof(txt) === 'string') {
    return txt;
  }

  var pOut = txt.getText();
  if (!txt.getTextAttributeIndices) {
    return pOut;
  }
  if (pOut.length === 0) {
    // deleteText(0, -1) below would fail on an empty element.
    return pOut;
  }

  Logger.log("Initial String: " + pOut);

  // CRC introducing reformatted_txt to let us apply rational formatting that
  // we can actually parse: start from a copy whose text was deleted and set
  // again, then re-apply only the attributes we care about as merged runs.
  var reformatted_txt = txt.copy();
  reformatted_txt.deleteText(0, pOut.length - 1);
  reformatted_txt = reformatted_txt.setText(pOut);

  var attrs = txt.getTextAttributeIndices();
  var textLength = pOut.length;

  mergeAttributeRuns_(attrs, textLength,
      function (offset) { return txt.getFontFamily(offset); },
      function (start, end, font) { reformatted_txt.setFontFamily(start, end, font); });
  // XXX TODO actually convert to URL text here.
  mergeAttributeRuns_(attrs, textLength,
      function (offset) { return txt.getLinkUrl(offset); },
      function (start, end, url) { reformatted_txt.setLinkUrl(start, end, url); });
  mergeAttributeRuns_(attrs, textLength,
      function (offset) { return txt.isBold(offset); },
      function (start, end, bold) { reformatted_txt.setBold(start, end, bold); });
  mergeAttributeRuns_(attrs, textLength,
      function (offset) { return txt.isItalic(offset); },
      function (start, end, italic) { reformatted_txt.setItalic(start, end, italic); });

  var harmonized_attrs = reformatted_txt.getTextAttributeIndices();
  pOut = reformatted_txt.getText();

  // Rmarkdown is fairly picky about how it will let you intersperse spaces
  // around words and strong/italics chars. This regex (hopefully) clears this up.
  // Match any number of \*, followed by spaces/word boundaries against anything
  // that is not the \*, followed by boundaries, spaces and * again.
  // Test case at http://jsfiddle.net/ovqLv0s9/2/
  var reAlignStars = /(\*+)(\s*\b)([^\*]+)(\b\s*)(\*+)/g;

  var mOut = "";  // Modified out string, built from the end of the text.
  var lastOff = pOut.length;
  for (var i = harmonized_attrs.length - 1; i >= 0; i--) {
    var off = harmonized_attrs[i];
    var raw_text = pOut.substring(off, lastOff);
    var d1 = openingMarkers_(reformatted_txt, off, inSrc);
    var d2 = closingMarkers_(reformatted_txt, lastOff, pOut.length, inSrc);

    var url = reformatted_txt.getLinkUrl(off);
    if (url) {
      mOut = d1 + '[' + raw_text + '](' + url + ')' + d2 + mOut;
    } else {
      var new_text = d1 + raw_text + d2;
      new_text = new_text.replace(reAlignStars, "$2$1$3$5$4");
      mOut = new_text + mOut;
    }

    lastOff = off;
    Logger.log("Modified String: " + mOut);
  }

  // Keep whatever precedes the first attribute index (all of the text when
  // there were no indices at all).
  return pOut.substring(0, lastOff) + mOut;
}