Compare two tables and display the difference using Google App scripts

well, I'm trying to do what is described in the header. Both tables have only one sheet, which I am comparing. One spreadsheet is and updates another, so I am trying to get only new content. (if it were a fc (dos command) function, how easy it would be ...)

After doing the search, I have the following script, which should work in most cases, which uses arrays for each sheet.

function test() { var Folder = DriveApp.getFoldersByName('theFolder').next(); var FolderId =Folder.getId(); //call old_spreadsheet var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var old_file = files.next(); var old_spreadsheet = SpreadsheetApp.openById(old_file.getId()); var old_sheet = old_spreadsheet.getSheets()[0]; var old_sheetname = old_sheet.getName(); var old_array = old_sheet.getDataRange().getValues(); Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length); //call spreadsheet var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var file = files.next(); var spreadsheet = SpreadsheetApp.openById(file.getId()); var sheet = spreadsheet.getSheets()[0]; var sheetname = sheet.getName(); var array = sheet.getDataRange().getValues(); Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length); var newarray = getNewData(array,old_array); Logger.log('there are ' + newarray.length + 'different rows'); } function getNewData(array1,array2){ var diff =array2; for (var i = 0; i<array1.length; i++){ var duplicate = false; for (var j = 0;j<diff.length;j++){ if (array1[i].join() == diff[j].join()){ Logger.log('duplicated line found on rows ' + i + ':' + j); diff.splice(j,1); var duplicate= true; break; } } if (duplicate==false) { Logger.log('not duplicated line found on row ' + i); diff.push(array1[i]); } } return diff; } 

The fact is that the files are too large, almost 30,000 lines, so the scripts exceed the limit of 5 minutes to execute.

Is there a way to improve this, for example, to eliminate the inner loop? Or is there a way to do this piecemeal? like the first first 5000 lines, etc.

Hello,

EDIT: After a little analysis of the table, I found out that for each row there is an identifier, so now I can concentrate the search in only one column of each table. So here is my new implementation:

 function test(){ var Folder = DriveApp.getFoldersByName('theFolder').next(); var FolderId =Folder.getId(); //call old_spreadsheet var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var old_file = files.next(); var old_spreadsheet = SpreadsheetApp.openById(old_file.getId()); var old_sheet = old_spreadsheet.getSheets()[0]; var old_sheetname = old_sheet.getName(); var old_array = old_sheet.getDataRange().getValues(); Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length); //call spreadsheet var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var file = files.next(); var spreadsheet = SpreadsheetApp.openById(file.getId()); var sheet = spreadsheet.getSheets()[0]; var sheetname = sheet.getName(); var array = sheet.getDataRange().getValues(); Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length); //The COlumn has an indicator, so i search for that. I don't control the formatting of the files, so i search in both spreadsheet for the indicator var searchString = 'NAME'; for (var i = 0; i < old_array.length; i++) { for (var j = 0; j < old_array[i].length; j++) { if (old_array[i][j] == searchString) { var Row_old = i+1; var Column_old = j; break; } } if (Row_old != undefined){ break; } } for (var i = 0; i < array.length; i++) { for (var j = 0; j < array[i].length; j++) { if (array[i][j] == searchString) { var Row = i+1; var Column = j; break; } } if (Row != undefined){ break; } } Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column); var diff_index =[]; var row_ind = 0; for (var i=Row;i<array.length;i++){ Logger.log(i); var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]); if (existe==-1){ Logger.log(row_ind+'!!!'); diff_index[row_ind]=i; row_ind++; } } Logger.log(diff_index); } 

This still does not have enough time ... Now I will try to include your comments.

+5
source share
4 answers

Finally, I decided to go for the Cache service option, here is the code, and I am testing it to see if I support it.

 function getNewData() { //deleting triggers var triggers = ScriptApp.getProjectTriggers(); for (var i = 0; i < triggers.length; i++) { if (triggers[i].getHandlerFunction()=='getNewData'){ ScriptApp.deleteTrigger(triggers[i]); } } //max running time = 5.5 min var MAX_RUNNING_TIME = 330000; var startTime= (new Date()).getTime(); //get cache var cache = CacheService.getUserCache(); var downloaded =JSON.parse(cache.get('downloaded')); var compared =JSON.parse(cache.get('compared')); //start if (downloaded==1 && compared!=1){ //folder var Folder = DriveApp.getFoldersByName('theFolder').next(); var FolderId = licitacionesFolder.getId(); //call old_spreadsheet var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var old_file = files.next(); var old_spreadsheet = SpreadsheetApp.openById(old_file.getId()); var old_sheet = old_spreadsheet.getSheets()[0]; var old_array = old_sheet.getDataRange().getValues(); //call spreadsheet var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var file = files.next(); var spreadsheet = SpreadsheetApp.openById(old_file.getId()); var sheet = spreadsheet.getSheets()[0]; var array = sheet.getDataRange().getValues(); Logger.log(array.length+'::'+old_array.length); // Column var searchString = 'NAME'; var RC = getColumn(array,searchString); var Row = RC.Row; var Column = RC.Column; var RC = getColumn(old_array,searchString); var Row_old = RC.Row; var Column_old = RC.Column; Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column); //compare var diff_index =JSON.parse(cache.get('diff_index')); var row_ind =JSON.parse(cache.get('row_ind')); var Roww =JSON.parse(cache.get('Row')); if (diff_index==null){var diff_index = [];} if (row_ind==null){var row_ind = 0;} if (Roww==null){var Roww = Row;} Logger.log(row_ind+'\n'+Roww); for (var i=Roww;i<array.length;i++){ var currTime = (new Date()).getTime(); if(currTime - startTime >= MAX_RUNNING_TIME){ Logger.log((currTime - startTime)/(1000*60)); Logger.log(i+'::'+row_ind); cache.putAll({'diff_index': JSON.stringify(diff_index),'row_ind': JSON.stringify(row_ind),'Row': JSON.stringify(i-1)},21600); ScriptApp.newTrigger('getNewData').timeBased().after(2 * 60 * 1000).create(); return; } else { Logger.log(i); var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]); if (existe==-1){ Logger.log(row_ind+'!!!'); diff_index[row_ind]=i; row_ind++; } } } cache.putAll({'diff_index': JSON.stringify(diff_index),'Row': JSON.stringify(Row),'compared': JSON.stringify(1)},21600); } else { Logger.log('file not downloaded yet or already compared'); } } function getColumn(array,searchString){ for (var i = 0; i < array.length; i++) { for (var j = 0; j < array[i].length; j++) { if (array[i][j] == searchString) { var Row = i+1; var Column = j; break; } } if (Row != undefined){ break; } } return {Row: Row, Column: Column}; } 
0
source

Your script has several major bottlenecks that slow it down massively:

  • Running both cycles at 0 each time causes it to explode
  • splicing every time you find a duplicate, you need to move the array around
  • string concatenating an array at each iteration

We can get around these problems:

  • sort the second range of times
  • I am sure that there is something clever that should be performed by iteratively binary searching through each column, but we had to resort every time, so we will binary search for the first column and then perform a linear search.

We will use ArrayLib for sorting (I hope this is a quick sorting algorithm).

We start with the function to find the first row where the first column corresponds to the value (first column of the current row):

 function firstRowMatchingCol1(target, lookupRange) { var min = 0; var max = lookupRange.length - 1; var guess; var guessVal; while(min <= max) { guess = (min + max) / 2 | 0; guessVal = lookupRange[guess][0]; if (guessVal < target) { min = guess + 1; } else if (guessVal > target) { max = guess - 1; } else { while (guess > 0 && lookupRange[guess - 1][0] === target) { guess -= 1; } return guess; } } return -1; } 

Now we can go linearly through each row and check if the columns match until the first column no longer matches.

 function matchExists(row, lookupRange) { var index = firstRowMatchingCol1(row[0], lookupRange); if (index === -1) {return false;} while (index < lookupRange.length && lookupRange[index][0] === row[0]) { for (var col = 1; col < row.length; col++) { if (row[col] !== lookupRange[index][col]) {break;} if (col === row.length - 1) {return true;} // This only works if the ranges are at least two columns wide but if they are one column wide you can just check if index > -1 } index += 1; } return false; } 

And finally, we can get duplicates as follows:

 function getNonDuplicates(r1, r2) { r2 = ArrayLib.sort(r2, 0, true); return r1.filter(function(row) {return !matchExists(row, r2);}); } 

Like mTorres code, this is not verified

+1
source

The solution I propose is to β€œcrack” in time. But if you want to use a cleaner solution, you could, if possible, reorganize and make your code more efficient if you somehow ordered arrays.

You do not specify the data inside array1 and array2 if there was some kind of identifier field in the lines that you could order by this identifier and check the line I on array1 and the line I on array2 instead of comparing each line in array 1 with each line in array2 (which is extremely inefficient with 30,000 lines).

If your data does not have an identifier field for ordering strings, then what can you do based on my proposed solution: add a track for each compared string in the array. When the run reaches the time limit, you run the function again, but starting from the last line to be compared (you would know what happened because you will keep track of the lines being compared), and when you repeat the second time, and so on.

Each time you run a comparison, you ask if it is executed first (or use a boolean - I prefer to ask the user, so you will not forget to change the boolean), if this is the first run, you delete the tracking column, if it is not the first start, you will start with the next last tracked line, basically continuing your script where it will end. I use this technique with good results.

In the code (untested, so check it before running with real data):

 /** * Only checks if it the first run and calls the real work function */ function test() { var firstRun = "yes" === Browser.msgBox("Question", "Is this the first run?", Browser.Buttons.YES_NO); doTest(firstRun); } /** * Gets the data of the 2 spreadsheets and also the starting * row */ function doTest(firstRun) { var Folder = DriveApp.getFoldersByName('theFolder').next(); var FolderId = Folder.getId(); //call old_spreadsheet var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var old_file = files.next(); var old_spreadsheet = SpreadsheetApp.openById(old_file.getId()); var old_sheet = old_spreadsheet.getSheets()[0]; var old_sheetname = old_sheet.getName(); var old_array = old_sheet.getDataRange().getValues(); /** * Here is the code to create the tracking hability */ var strartFromRow = 0; // 0 because row 1 is array 0 index when you getValues(); var trackSheet = old_spreadsheet.getSheetByName("Tracking"); if (trackSheet === null) { trackSheet = old_spreadsheet.insertSheet("Tracking"); } if (firstRun) { trackSheet.getRange("A:A").clearContent(); // make sure there no row is tracked yet } else { // we have to continue from the previous row, keep in mind you're making the comparison // with array which is 0 based, but sheet is 1 based, but you want the next one so getLasRow() // should be the first item to compare on your array strartFromRow = trackSheet.getLastRow(); } Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length); //call spreadsheet var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents"; var files = DriveApp.searchFiles(searchFor); var file = files.next(); var spreadsheet = SpreadsheetApp.openById(file.getId()); var sheet = spreadsheet.getSheets()[0]; var sheetname = sheet.getName(); var array = sheet.getDataRange().getValues(); Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length); // when you call the DIFF function, pass the tracking sheet and the start Row var newarray = getNewData(array,old_array, trackSheet, startFromRow); Logger.log('there are ' + newarray.length + 'different rows'); } /** * Creates a diff array using array1 and array2 * It marks each element on array1 once it has checked if it in array2 */ function getNewData(array1, array2, trackingSheet, startFromRow){ var logRow = trackingSheet.getLastRow(); var diff = array2; for (var i = startFromRow; i < array1.length; i++){ var duplicate = false; for (var j = 0; j < diff.length;j++){ if (array1[i].join() == diff[j].join()){ Logger.log('duplicated line found on rows ' + i + ':' + j); diff.splice(j,1); duplicate = true; break; } } if (duplicate === false) { Logger.log('not duplicated line found on row ' + i); diff.push(array1[i]); } trackingSheet.getRange(logRow++, 1).setValue("Checked!"); // Mark i row as checked } return diff; } 
0
source

Here is an alternative solution that is approaching the deadline. Create a new highlighted spreadsheet along with a custom sidebar. In the sidebar, you will need to create HTML code that will ultimately be embedded and displayed in an iframe on the client. You can embed pure javascript in HTML through script tags.

The beauty of this approach is that these scripts will not work on the server side, and on the client, regardless of the server environment of the Google Apps script, they do not fall under the 6-minute limit. In addition, they can also call functions in your Google Script. Thus, one approach would be to have client side scripts call the Google script function to get the required data, do all the heavy processing on client scripts, and then send the results to the server side of the script to update the sheet.

Here's a link to setting up a custom sidebar to get you started: https://developers.google.com/apps-script/guides/dialogs#custom_sidebars

0
source

Source: https://habr.com/ru/post/1268594/


All Articles