The document I am working with is an 11.5 x 16 PDF. The page height I get back from pdf2json is 51.75, and when I examine the Texts' locations (x, y), assuming they are also expressed in page units (PU), the y values seem to be correct. However, the x values seem to be off for elements located on the right half of the document. For instance, I placed text ("BottomRight") in the bottom-right corner and got back the following coordinates: { x: 193.45312500000003, y: 50.918749999999996 }. Given that the document is 11.5 x 16 and the height is 51.75 PU, the width should technically be 74.25 PU. How is it possible for a text to have an x position of 193.45... when the maximum is 74.25 PU?
define(function(require,exports,modules){
    var fs = require('fs'),
        _ = require('underscore-node'),
        PDFParser = require('pdf2json/pdfparser'),
        pdfParser = new PDFParser(),
        pdfutils = require('pdfutils').pdfutils;

    // Matches tokens that mix upper- and lower-case letters, e.g. "BottomRight".
    // TODO :: fix regex to only accept camelcase without spacing...
    var CAMEL_CASE = /[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*/g;

    // Only the first page is parsed for now; raise this limit to process more pages.
    var PAGE_LIMIT = 1;

    /**
     * Parses a PDF with pdf2json and collects the camel-case text runs
     * ("adors") found on its pages. Parsing starts as soon as the object
     * is constructed; results are appended to `pages` asynchronously.
     *
     * @param {string} base - Directory path, concatenated directly with `file`.
     * @param {string} file - File name of the PDF inside `base`.
     */
    var PDF = function(base,file){
        var pdf = this;

        pdf.base = null;
        pdf.file = null;
        pdf.adors = [];
        pdf.pages = [];

        /**
         * Stores the file location, subscribes to the parser events and
         * starts loading the PDF.
         *
         * @param {string} base - Directory path of the PDF.
         * @param {string} file - File name of the PDF.
         */
        pdf.init = function(base,file){
            console.log('starting pdf parsing');

            // set base path + file name
            pdf.file = file;
            pdf.base = base;

            // set the bindings
            pdfParser.on("pdfParser_dataReady", _.bind(pdf.initParse, this));
            pdfParser.on("pdfParser_dataError", _.bind(pdf.parseDataError, this));

            // start parsing
            pdfParser.loadPDF(base + file);
        };

        /**
         * Handler for the parser's "pdfParser_dataReady" event. Opens the same
         * file with pdfutils and parses up to PAGE_LIMIT pages, pairing each
         * pdf2json page with the pdfutils page at the same index.
         *
         * @param {Object} data - Parser output; `data.PDFJS.pages` is read here.
         */
        pdf.initParse = function(data){
            pdfutils(pdf.base + pdf.file, function(err,doc){
                // Without this guard a pdfutils failure would surface as a
                // TypeError on `doc` instead of being reported.
                if (err) {
                    pdf.parseDataError(err);
                    return;
                }

                var pages = data.PDFJS.pages;
                var count = Math.min(PAGE_LIMIT, pages.length);
                for (var i = 0; i < count; i++) {
                    pdf.pages.push(pdf.parsePage(pages[i], doc[i]));
                }
            });
        };

        /**
         * Builds the parsed representation of one page.
         *
         * @param {Object} page - pdf2json page; `Height` and `Texts` are read.
         * @param {Object} doc - pdfutils page; `width` and `height` are read.
         * @returns {{adors: Array, ratio: number, width: number, height: number}}
         *     `ratio` is `doc.height / page.Height`.
         */
        pdf.parsePage = function(page,doc){
            var parsedPage = {
                adors: [],
                ratio: doc.height / page.Height,
                width: doc.width,
                height: doc.height
            };

            var texts = page.Texts || [];
            for (var i = 0; i < texts.length; i++) {
                // Only the first run of each text element is inspected.
                var run = texts[i].R && texts[i].R[0];
                if (!run) {
                    continue;
                }
                pdf.findCamelCase(run.T, texts[i], run.TS, parsedPage, parsedPage.ratio);
            }

            // TODO:: find solution for this xml parsing (grabbing pictures)...
            return parsedPage;
        };

        /**
         * Appends one entry to `parsedPage.adors` for every camel-case match
         * found in `text`. Each entry carries the full `text`, not just the
         * matched token.
         *
         * @param {string} text - Text of the run to scan.
         * @param {Object} textLocation - Object whose `x` and `y` are copied as-is.
         * @param {Array} textData - Style array: index 1 is stored as size,
         *     index 2 as the bold flag, index 3 as the italics flag.
         * @param {Object} parsedPage - Page object whose `adors` array is appended to.
         * @param {number} ratio - Currently unused; kept for interface compatibility.
         */
        pdf.findCamelCase = function(text,textLocation,textData,parsedPage,ratio){
            if (typeof text !== 'string') {
                return;
            }

            var matches = text.match(CAMEL_CASE);
            if (!matches) {
                return;
            }

            for (var i = 0; i < matches.length; i++) {
                var t = {
                    text: text,
                    size: textData[1],
                    bold: Number(textData[2]) === 1,
                    italics: Number(textData[3]) === 1,
                    position: {
                        x: textLocation.x,
                        y: textLocation.y
                    }
                };

                console.log(t.text, t.position);
                parsedPage.adors.push(t);
            }
        };

        /**
         * Logs a parsing failure reported by the parser or by pdfutils.
         *
         * @param {*} err - Error value to log.
         */
        pdf.parseDataError = function(err){
            console.log('pdf parse error...',err);
        };

        pdf.init(base,file);
    };

    return new PDF('/Users/dayne/sites/wl/server/utils/','test.pdf');
});
The document I am working with is an 11.5 x 16 PDF. The page height I get back from pdf2json is 51.75, and when I examine the Texts' locations (x, y), assuming they are also expressed in page units (PU), the y values seem to be correct. However, the x values seem to be off for elements located on the right half of the document. For instance, I placed text ("BottomRight") in the bottom-right corner and got back the following coordinates: { x: 193.45312500000003, y: 50.918749999999996 }. Given that the document is 11.5 x 16 and the height is 51.75 PU, the width should technically be 74.25 PU. How is it possible for a text to have an x position of 193.45... when the maximum is 74.25 PU?
define(function(require,exports,modules){
    var fs = require('fs'),
        _ = require('underscore-node'),
        PDFParser = require('pdf2json/pdfparser'),
        pdfParser = new PDFParser(),
        pdfutils = require('pdfutils').pdfutils;

    // Matches tokens that mix upper- and lower-case letters, e.g. "BottomRight".
    // TODO :: fix regex to only accept camelcase without spacing...
    var CAMEL_CASE = /[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*/g;

    // Only the first page is parsed for now; raise this limit to process more pages.
    var PAGE_LIMIT = 1;

    /**
     * Parses a PDF with pdf2json and collects the camel-case text runs
     * ("adors") found on its pages. Parsing starts as soon as the object
     * is constructed; results are appended to `pages` asynchronously.
     *
     * @param {string} base - Directory path, concatenated directly with `file`.
     * @param {string} file - File name of the PDF inside `base`.
     */
    var PDF = function(base,file){
        var pdf = this;

        pdf.base = null;
        pdf.file = null;
        pdf.adors = [];
        pdf.pages = [];

        /**
         * Stores the file location, subscribes to the parser events and
         * starts loading the PDF.
         *
         * @param {string} base - Directory path of the PDF.
         * @param {string} file - File name of the PDF.
         */
        pdf.init = function(base,file){
            console.log('starting pdf parsing');

            // set base path + file name
            pdf.file = file;
            pdf.base = base;

            // set the bindings
            pdfParser.on("pdfParser_dataReady", _.bind(pdf.initParse, this));
            pdfParser.on("pdfParser_dataError", _.bind(pdf.parseDataError, this));

            // start parsing
            pdfParser.loadPDF(base + file);
        };

        /**
         * Handler for the parser's "pdfParser_dataReady" event. Opens the same
         * file with pdfutils and parses up to PAGE_LIMIT pages, pairing each
         * pdf2json page with the pdfutils page at the same index.
         *
         * @param {Object} data - Parser output; `data.PDFJS.pages` is read here.
         */
        pdf.initParse = function(data){
            pdfutils(pdf.base + pdf.file, function(err,doc){
                // Without this guard a pdfutils failure would surface as a
                // TypeError on `doc` instead of being reported.
                if (err) {
                    pdf.parseDataError(err);
                    return;
                }

                var pages = data.PDFJS.pages;
                var count = Math.min(PAGE_LIMIT, pages.length);
                for (var i = 0; i < count; i++) {
                    pdf.pages.push(pdf.parsePage(pages[i], doc[i]));
                }
            });
        };

        /**
         * Builds the parsed representation of one page.
         *
         * @param {Object} page - pdf2json page; `Height` and `Texts` are read.
         * @param {Object} doc - pdfutils page; `width` and `height` are read.
         * @returns {{adors: Array, ratio: number, width: number, height: number}}
         *     `ratio` is `doc.height / page.Height`.
         */
        pdf.parsePage = function(page,doc){
            var parsedPage = {
                adors: [],
                ratio: doc.height / page.Height,
                width: doc.width,
                height: doc.height
            };

            var texts = page.Texts || [];
            for (var i = 0; i < texts.length; i++) {
                // Only the first run of each text element is inspected.
                var run = texts[i].R && texts[i].R[0];
                if (!run) {
                    continue;
                }
                pdf.findCamelCase(run.T, texts[i], run.TS, parsedPage, parsedPage.ratio);
            }

            // TODO:: find solution for this xml parsing (grabbing pictures)...
            return parsedPage;
        };

        /**
         * Appends one entry to `parsedPage.adors` for every camel-case match
         * found in `text`. Each entry carries the full `text`, not just the
         * matched token.
         *
         * @param {string} text - Text of the run to scan.
         * @param {Object} textLocation - Object whose `x` and `y` are copied as-is.
         * @param {Array} textData - Style array: index 1 is stored as size,
         *     index 2 as the bold flag, index 3 as the italics flag.
         * @param {Object} parsedPage - Page object whose `adors` array is appended to.
         * @param {number} ratio - Currently unused; kept for interface compatibility.
         */
        pdf.findCamelCase = function(text,textLocation,textData,parsedPage,ratio){
            if (typeof text !== 'string') {
                return;
            }

            var matches = text.match(CAMEL_CASE);
            if (!matches) {
                return;
            }

            for (var i = 0; i < matches.length; i++) {
                var t = {
                    text: text,
                    size: textData[1],
                    bold: Number(textData[2]) === 1,
                    italics: Number(textData[3]) === 1,
                    position: {
                        x: textLocation.x,
                        y: textLocation.y
                    }
                };

                console.log(t.text, t.position);
                parsedPage.adors.push(t);
            }
        };

        /**
         * Logs a parsing failure reported by the parser or by pdfutils.
         *
         * @param {*} err - Error value to log.
         */
        pdf.parseDataError = function(err){
            console.log('pdf parse error...',err);
        };

        pdf.init(base,file);
    };

    return new PDF('/Users/dayne/sites/wl/server/utils/','test.pdf');
});