Home > Enterprise >  Improve performance google apps script copy data from multiple Spreadsheets to one
Improve performance google apps script copy data from multiple Spreadsheets to one

Time:02-02

I have around 300 Spreadsheets that I need to copy all data from each spreadsheet and merge into a Master Spreadsheet. I have a spreadsheet that lists all 300 spreadsheet Ids. This script works however its Very slow!

I also tried to manually enter all document Ids as a variable and it did not seem to make a difference.

Is there a better way to handle?

function combineData() {
  const masterID = "ID";
  const masterSheet = SpreadsheetApp.openById(masterID).getSheets()[0];
  let targetSheets = docIds();
  for (let i = 0, len = targetSheets.length; i < len; i  ) {
    let sSheet = SpreadsheetApp.openById(targetSheets[i]).getActiveSheet();
    let sData = sSheet.getDataRange().getValues();
    sData.shift() //Remove header row
     if (sData.length > 0) { //Needed to add to remove errors on Spreadsheets with no data
      let fRow = masterSheet.getRange("A"   (masterSheet.getLastRow())).getRow()   1;
      let filter = sData.filter(function (row) { 
         return row.some(function (cell) {
           return cell !== "";  //If sheets have blank rows in between doesnt grab
         })
       })
  
      masterSheet.getRange(fRow, 1, filter.length, filter[0].length).setValues(filter)
     }
  }
}

function docIds() {
    let listOfId = SpreadsheetApp.openById('ID').getSheets()[0]; //list of 300 Spreadsheet IDs
  let values = listOfID.getDataRange().getValues()
  let arrayId = []
  for (let i = 1, len = values.length; i < len; i  ) {
    let data = values[i];
    let ssID = data[1];
    arrayId.push(ssID)
  }
  return arrayId
}

CodePudding user response:

Use the Sheets API, depending on the data it is about 5x faster than the native SpreadsheetApp. Add the Google Sheets API under Services in the left pane of the Apps Script editor.

Here is a code snipped of how we use one or the other API:

  if(gridData && gridHeight) {
    let range = sheet.getRange(startRow, 1, gridHeight, gridData[0].length);
    if(useSheetsAPI) {
      try {
        SpreadsheetApp.flush();
        let valueRange = Sheets.newValueRange();
        valueRange.values = gridData;
        let idAndName = getSpreadsheetIdAndSheetNameByName_(sheetName);
        let rangeA1 = idAndName.sheetName   '!'   range.getA1Notation();
        let options = { valueInputOption: 'USER_ENTERED' };
        let result = Sheets.Spreadsheets.Values.update(valueRange, idAndName.spreadsheetId, rangeA1, options);
        debugLog_('sheetReplace(): Sheets.Spreadsheets.Values.update result: ' result);
      } catch (err) {
        Logger.log('sheetReplace() ERROR: %s', err.message);
        return 'ERROR: sheetReplace() failed: '   err.message;
      }
    } else {
      range.setValues(gridData);
    }
  }


/**
 * Get spreadsheet Id and sheet name by sheet name
 * 
 * @param  {string|null}  name    name of sheet, either "sheet_id:Tab Name", "Tab Name"
 * @return {object}       object  object with spreadsheetId and sheetName
 */
function getSpreadsheetIdAndSheetNameByName_(name) {
  let spreadsheetId = '';
  if(name && name.length > 44 && name.indexOf(':') > 40) {
    // assume format: "sheet_id:Tab Name"
    spreadsheetId = name.replace(/:.*$/, '');
    name = name.replace(/^.*?:/, '');
  } else {
    // assume format "Tab Name"
    spreadsheetId = SpreadsheetApp.getActiveSpreadsheet().getId();
  }
  return { spreadsheetId: spreadsheetId, sheetName: name };
}

Also, I submitted an enhancement request for better performance, see https://issuetracker.google.com/issues/222337394 and vote for it.

CodePudding user response:

The .setValues() and .getValues() function themselves already run quite heavily specially if you have large data in the sheet, and using it together with for loop will really cause it to be slow since it iterates over 1 by 1. How about changing the for loop to forEach()

Try:

function combineData() {
  const masterID = "1aRQ7rW9tGF25xdmjAfOtT6HtyZKQq0_AIYOGSZMKOcA";
  const masterSheet = SpreadsheetApp.openById(masterID).getSheetByName("Master");
  let targetSheets = docIds();
  targetSheets.forEach(function(x){
    let sSheet = SpreadsheetApp.openById(x).getActiveSheet();
    let sData = sSheet.getDataRange().getValues();
    sData.shift() //Remove header row
     if (sData.length > 0) { //Needed to add to remove errors on Spreadsheets with no data
      let fRow = masterSheet.getRange("A"   (masterSheet.getLastRow())).getRow()   1;
      let filter = sData.filter(function (row) { 
         return row.some(function (cell) {
           return cell !== "";  //If sheets have blank rows in between doesnt grab
         })
       })
  
      masterSheet.getRange(fRow, 1, filter.length, filter[0].length).setValues(filter)
     }
  })
}

function docIds() {
  let listOfId = SpreadsheetApp.openById('1aRQ7rW9tGF25xdmjAfOtT6HtyZKQq0_AIYOGSZMKOcA').getSheets()[0]; //list of 300 Spreadsheet IDs
  let values = listOfId.getDataRange().getValues();
  values.shift()
  let arrayId = []
  values.forEach(function(val){
    let data = val;
    let ssID = data[1];
    arrayId.push(ssID)
  })
  return arrayId
}

Also here are some of the best practices to improve the performance of the script: Best Practices

More details on forEach:

Let me know if this helps!

CodePudding user response:

I believe your goal is as follows.

  • You have 300 Spreadsheets.
  • You want to retrieve the values from the 1st tab of all Spreadsheets and also, you want to put the retrieved values to the 1st tab of the master Spreadsheet.
  • You want to reduce the process cost of the script.

Issue and workaround:

In the current stage, unfortunately, there is no method for retrieving the values from multiple Spreadsheets, simultaneously. If the sample script is prepared, it is required to obtain the values from each spreadsheet in a loop. In this case, the process cost becomes high. I think that this might be the reason for your current issue.

In this answer, as another approach, I would like to propose the following flow.

  1. Create the URL list for exporting the values from Spreadsheets.
    • In the current stage, when Sheets API is used in a loop, an error occurs. So, in this workaround, I use the URL for exporting Spreadsheet as CSV data. In this case, it seems that even when this URL is accessed with a loop, no error occurs.
  2. Retrieve CSV values from the URLs using UrlFetchApp.fetchAll.
  3. Merge the retrieved values by parsing CSV data as an array.
  4. Put the values to the master Spreadsheet using Sheets API.

By this flow, I thought that the process cost can be reduced. When this flow is reflected in a sample script, how about the following sample script?

Sample script:

Please set masterID and ssId. And, please enable Sheets API at Advanced Google services. And, please run myFunction.

function myFunction() {
  const masterID = "###"; // Please set the master Spreadsheet ID.
  const ssId = "###"; // Please set the Spreadsheet ID including the Spreadsheet IDs you want to retrieve in column "B".

  // Retrieve Spreadsheet IDs.
  const sheet = SpreadsheetApp.openById(ssId).getSheets()[0];
  const ssIds = sheet.getRange("B2:B"   sheet.getLastRow()).getDisplayValues().reduce((ar, [b]) => {
    if (b) ar.push(b);
    return ar;
  }, []);

  // Retrieve values from all Spreadsheets.
  const workers = 50; // Please adjust this value.
  const headers = { authorization: "Bearer "   ScriptApp.getOAuthToken() };
  const reqs = [...Array(Math.ceil(ssIds.length / workers))].map(_ => ssIds.splice(0, workers).map(id => ({ url: `https://docs.google.com/spreadsheets/export?exportFormat=csv&id=${id}`, headers, muteHttpExceptions: true })));
  const values = reqs.flatMap(r =>
    UrlFetchApp.fetchAll(r).flatMap(rr => {
      if (rr.getResponseCode() == 200) {
        const [, ...val] = Utilities.parseCsv(rr.getContentText());
        return val;
      }
      return [];
    })
  );

  // Put values to the master sheet.
  const masterSheet = SpreadsheetApp.openById(masterID).getSheets()[0];
  Sheets.Spreadsheets.Values.update({ values }, masterID, `'${masterSheet.getSheetName()}'!A${masterSheet.getLastRow()   1}`, { valueInputOption: "USER_ENTERED" });

  // DriveApp.getFiles(); // This comment line is used for automatically detecting the scope for Drive API. So, please don't remove this line.
}
  • When this script is run,

    1. Spreadsheet IDs are retrieved from column "B" of the 1st sheet in the Spreadsheet of ssId.
    2. Values are retrieved from all Spreadsheets.
      • In this script, the values are retrieved from every 50 Spreadsheets with the asynchronous process. If you increase const workers = 50; to const workers = 100;, the values are retrieved from every 100 Spreadsheets. But, if an error occurs when this value is increased, please adjust the value.
    3. Put values using Sheets API.
  • When I tested this script for 50 Spreadsheet, the processing time was about 20 seconds. But, I'm not sure about your actual situation. So, please test this script.

Note:

  • In your script, listOfID is not declared. Please be careful about this.

  • Unfortunately, I cannot know your all Spreadsheets. So, if all values are more than 10,000,000 cells, an error occurs because of the maximum number of cells in a Spreadsheet. Please be careful about this.

  • If the number of values is large, an error might occur. At that time, please check my report.

References:

  • Related