-
Notifications
You must be signed in to change notification settings - Fork 0
/
ms_doc_file.rb
614 lines (572 loc) · 16.8 KB
/
ms_doc_file.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
require 'ms_comp_file'
require 'ms_fib'
# One piece descriptor of the piece table, as read by MsDocFile#read_clx
# from 8 bytes of the Clx: two flag bytes, a 32-bit +fc+ and a 16-bit +prm+.
class APcd < BinData::Record
  endian :little
  uint8 :fNoParaLast # only first bit matters (0x01)
  uint8 :fR2
  uint32 :fc
  uint16 :prm

  # True when bit 30 (0x40000000) of +fc+ is set, i.e. the piece is stored
  # as 8-bit characters instead of 16-bit ones.
  def fCompressed
    (fc & 0x40000000) == 0x40000000
  end

  # Byte offset of the piece text inside the WordDocument stream.
  #
  # The offset occupies the 30 bits below the fCompressed flag. The previous
  # mask (0x3ffffff, 26 bits) silently dropped bits 26..29, which corrupted
  # offsets at or above 64 MiB. For compressed pieces the stored value is
  # twice the real byte offset, hence the division.
  def offset
    raw = fc & 0x3fffffff
    fCompressed ? raw / 2 : raw
  end
end
# Two-byte data item of the field PLC that MsDocFile#parse loads through
# read_plex (item size 2). After loading, parse masks +fldch+ down to its
# low five bits (0x1f).
class Fld < BinData::Record
  endian :little

  uint8 :fldch  # field character byte
  uint8 :grffld # second byte of the item; not interpreted in this file
end
# Four-byte data item of the bookmark-start PLC that MsDocFile#parse loads
# through read_plex (item size 4): a 16-bit +ibkl+ followed by 16 bits of
# packed +bkc+ flags.
class FBKF < BinData::Record
  endian :little

  uint16 :ibkl

  # 7 + 1 + 6 + 1 + 1 = 16 bits
  struct :bkc do
    bit7 :itcFirst
    bit1 :fPub
    bit6 :itcLim
    bit1 :fNative
    bit1 :fCol
  end
end
# Decoded form of a 16-bit sprm opcode (9 + 1 + 3 + 3 bits).
# MsDocFile#parseSprm assigns the four fields by hand from the raw opcode
# instead of calling #read on this record.
#
# Example: 6A03 => 0110 1010 0000 0011 | ispmd=3 | fSpec=1 | sType=2 | spra=3
class SprmOpcode < BinData::Record
  bit9 :ispmd # opcode bits 0..8
  bit1 :fSpec # opcode bit 9
  bit3 :sType # opcode bits 10..12
  bit3 :spra  # opcode bits 13..15, selects the operand size
end
# Length-prefixed string of 16-bit characters followed by a 16-bit +zero+
# field: +cch+ gives the number of 16-bit units stored in +rgtchar+.
class Xstz < BinData::Record
  endian :little
  uint16 :cch
  array :rgtchar, :type => :uint16, :initial_length => :cch
  uint16 :zero

  # Returns the string content as UTF-8.
  #
  # The units are re-packed as UTF-16LE and transcoded, so a surrogate pair
  # becomes one character. The previous pack('U*') treated every 16-bit unit
  # as a code point and produced invalid UTF-8 for characters outside the
  # BMP. Unpaired surrogates are replaced rather than raising.
  def get
    utf16 = rgtchar.to_a.pack('v*').force_encoding('UTF-16LE')
    utf16.encode('UTF-8', :invalid => :replace, :undef => :replace)
  end
end
# Extended string table: a header of three 16-bit fields followed by
# +cData+ entries. FormFieldData uses it for the drop-down entry list.
class STTBx < BinData::Record
  endian :little

  uint16 :fExtend, asserted_value: 0xffff # must be 0xffff
  uint16 :cData   # number of entries in +sttb+
  uint16 :cbExtra # number of extra bytes stored after each entry

  array :sttb, initial_length: :cData do
    uint16 :cchData # number of 16-bit units in +data+
    array :data, type: :uint16, initial_length: :cchData
    array :extraData, type: :uint8, initial_length: :cbExtra
  end
end
# Sixteen packed flag bits of a form field, declared as two groups of
# eight bits. MsDocFile#parse reads +iType+ (0, 1 or 2 select the three
# handled field kinds) and +iRes+ (25 is treated as "no result").
class FFDataBits < BinData::Record
  endian :little

  # low byte
  bit1 :fOwnHelp
  bit5 :iRes
  bit2 :iType

  # high byte
  bit1 :fHasListBox
  bit1 :fRecalc
  bit3 :iTypeTxt
  bit1 :iSize
  bit1 :fProt
  bit1 :fOwnStat
end
# Binary description of one form field. MsDocFile#parse reads it from the
# :binData slice returned by getNPABD. Which optional members are present
# depends on +ffdb.iType+.
class FormFieldData < BinData::Record
  endian :little

  uint32 :version
  FFDataBits :ffdb
  uint16 :cch
  uint16 :hps
  Xstz :xstzName
  # default text: only present when iType is 0
  Xstz :xstzTextDef, onlyif: -> { ffdb.iType == 0 }
  # default value: only present when iType is 1 or 2
  uint16 :wDef, onlyif: -> { ffdb.iType == 1 || ffdb.iType == 2 }
  Xstz :xstzTextFormat
  Xstz :xstzHelpText
  Xstz :xstzStatText
  Xstz :xstzEntryMcr
  Xstz :xstzExitMcr
  # list entries: only present when iType is 2
  STTBx :hsttbDropList, onlyif: -> { ffdb.iType == 2 }
end
class MsDocFile
attr_reader :fib
attr_reader :compfile
attr_reader :table
attr_reader :clx
attr_reader :acp
attr_reader :apcd
attr_reader :formfields
attr_reader :maindoc
# Opens the compound file +fname+, creates an empty FIB and parses the
# document immediately.
def initialize(fname)
  @compfile = MsCompFile.new(fname)
  @fib = MsFIB.new
  parse
end
# Plain-text rendering of the main document, as assembled by #parse.
def text
  @text
end
# Looks up the result of the form field named +finame+.
#
# @formfields is a list of single-pair hashes (name => result); the first
# entry whose key equals +finame+ wins.
# Raises RuntimeError when no entry carries that name.
def fields(finame)
  entry = @formfields.find { |pair| pair.keys[0] == finame }
  raise "Unknown formfield name given! [#{finame}]" unless entry

  entry.values[0]
end
# Reads the piece table out of @clx.
#
# @clx is walked structure by structure. Every structure starts with a
# one-byte marker:
# * 0x02 - the Pcdt: a 4-byte length (lcb) followed by the PlcPcd, i.e.
#   n+1 character positions (4 bytes each) and n piece descriptors
#   (8 bytes each). They are stored in @acp and @apcd; scanning then stops.
# * anything else - treated as a Prc: a 2-byte size followed by that many
#   bytes of data, all of which is skipped.
#
# Fixes compared to the previous version:
# * the Prc skip now also steps over the 2-byte size field itself; before,
#   the scan resumed two bytes early, inside the Prc data, so any Clx that
#   contained a Prc was misparsed;
# * integers are decoded explicitly as little-endian instead of using the
#   host byte order;
# * a truncated or negative Prc size raises instead of looping or failing
#   with an unrelated error.
#
# @acp / @apcd are left untouched when no Pcdt marker is found.
def read_clx
  pos = 0
  while pos < @clx.bytesize
    clxt = @clx.getbyte(pos)
    pos += 1
    if clxt == 0x02
      # found Pcdt: read 4 bytes lcb (length of PlcPcd)
      lcb = @clx.byteslice(pos, 4).unpack('l<')[0]
      pos += 4
      # lcb = 4 * (n + 1) + 8 * n  =>  n = (lcb - 4) / 12
      piece_count = (lcb - 4) / (8 + 4)
      # aCP array: n + 1 character positions
      @acp = []
      (piece_count + 1).times do
        @acp << @clx.byteslice(pos, 4).unpack('l<')[0]
        pos += 4
      end
      # aPcd array: n piece descriptors
      @apcd = []
      piece_count.times do
        @apcd << APcd.read(@clx.byteslice(pos, 8))
        pos += 8
      end
      break # we've done
    else
      # found Prc structure: 2-byte size, then that many bytes of data
      cb_grpprl = @clx.byteslice(pos, 2).unpack('s<')[0]
      if cb_grpprl.nil? || cb_grpprl < 0
        raise 'Malformed Clx: truncated or negative Prc size'
      end
      pos += 2 + cb_grpprl
    end
  end
end
# Reads a PLC (array of character positions followed by fixed-size items).
#
# stream    - string holding the PLC
# fc        - index of the first PLC byte inside +stream+
# lcb       - total PLC length in bytes
# cbdata    - size of one data item in bytes (0 when the PLC carries none)
# dataclass - class of a data item; instantiated with +new+ and filled
#             through +read+ (unused when +cbdata+ is 0)
#
# With n = (lcb - 4) / (4 + cbdata) the PLC holds n + 1 positions and
# n items. Returns the pair [positions, items].
def read_plex(stream, fc, lcb, cbdata, dataclass)
  count = (lcb - 4) / (4 + cbdata)
  pos = fc

  cps = []
  (count + 1).times do
    cps << stream[pos..pos + 3].unpack('l')[0]
    pos += 4
  end

  items = []
  if cbdata > 0
    count.times do
      record = dataclass.new
      record.read(stream[pos..pos + cbdata - 1])
      pos += cbdata
      items << record
    end
  end

  return cps, items
end
# Reads a string table (STTB).
#
# stream    - string holding the table
# fc        - index of the first table byte inside +stream+
# lcb       - table length in bytes; 0 means "no table present"
# dataclass - optional class for the entries; when given, every entry is
#             created with +new+ and filled through +read+ instead of
#             being returned as a string
#
# Layout as decoded here:
# * extended table: the first 16 bits are 0xffff, followed by a 16-bit
#   entry count; every entry has a 16-bit length counted in 16-bit
#   characters and is converted from UTF-16LE to UTF-8;
# * simple table: a 32-bit entry count; every entry has an 8-bit length in
#   bytes and is tagged as UTF-8 without conversion.
#   NOTE(review): the 32-bit count for simple tables is kept from the
#   original code - confirm against the format specification.
# * in both cases a 16-bit cbExtra follows the count, and cbExtra bytes are
#   skipped after every entry.
#
# Fixes compared to the previous version: +lcb+ was ignored, so an absent
# table (lcb == 0) made the method decode whatever bytes happen to sit at
# +fc+; it now returns an empty list. Integers are decoded explicitly as
# little-endian instead of using the host byte order.
#
# Returns the array of entries.
def read_sttb(stream, fc, lcb, dataclass = nil)
  return [] if lcb == 0

  idx = fc
  ext = stream[idx, 2].unpack('v')[0] == 0xffff
  if ext
    # extended Sttb: skip the marker, then 16-bit count
    idx += 2
    cData = stream[idx, 2].unpack('v')[0]
    idx += 2
  else
    # simple Sttb
    cData = stream[idx, 4].unpack('V')[0]
    idx += 4
  end
  cbExtra = stream[idx, 2].unpack('v')[0]
  idx += 2

  items = []
  cData.times do
    if ext
      cbData = stream[idx, 2].unpack('v')[0] * 2
      idx += 2
    else
      cbData = stream[idx, 1].unpack('C')[0]
      idx += 1
    end
    raw = stream[idx, cbData]
    if dataclass.nil?
      if ext
        # a fresh converter per entry, so no state leaks between entries
        item = Encoding::Converter.new('UTF-16LE', 'UTF-8').convert(raw.force_encoding('UTF-16LE'))
      else
        item = raw.force_encoding('UTF-8')
      end
    else
      item = dataclass.new
      item.read(raw)
    end
    items << item
    idx += cbData + cbExtra # skip entry data and Extra bytes
  end
  items
end
# Reads the named +stream+ from the compound file and returns the +lcb+
# bytes that start at index +ptr+ (inclusive range ptr..ptr+lcb-1).
def read_fibptrbuf(stream, ptr, lcb)
  content = @compfile.read_from(stream)
  last = ptr + lcb - 1
  content[ptr..last]
end
# Returns the +lcb+ elements of +stream+ that start at index +fc+
# (inclusive range fc..fc+lcb-1).
def read_bytes(stream, fc, lcb)
  last = fc + lcb - 1
  stream[fc..last]
end
# Parses the document and fills the instance state:
# @worddoc / @data / @table - raw streams of the compound file
# @clx, @acp, @apcd         - piece table (via read_clx)
# @fldcps, @fldflds         - field PLC of the main document
# @mombkmk, @plcfbkf_*, @plcfbkl_cp - bookmark names and positions
# @maindoc, @fc_of_cp, @cp_of_fc    - piece texts and cp <-> offset maps
# @fkpchpx                  - character property runs (CHPX FKPs)
# @formfields, @text        - form field results and plain text
#
# Raises RuntimeError when the FIB identifier is not 0xA5EC.
#
# Changes compared to the previous single-method version:
# * a FORM field without a picture mark (search result -1) is no longer
#   mistaken for a valid one, and a missing CHPX or picture location makes
#   the field fall back to plain text instead of crashing on nil;
# * drop-down entries are packed as code points ('U*') instead of bytes
#   ('C*'), which broke every non-ASCII entry;
# * only the low 22 bits of a CHPX page number are used;
# * 32-bit values are decoded explicitly as little-endian;
# * the work is split into private helpers.
def parse
  # read WordDocument stream
  @worddoc = @compfile.read_from('WordDocument')
  # read Data stream (form field data is looked up in it, see getNPABD)
  @data = @compfile.read_from('Data')
  # read FIB structures
  @fib.read(@worddoc)
  # check for wIdent
  raise 'WordDocument stream format unknown!' if @fib.fibBase.wIdent != 0xA5EC
  # read 1/0Table
  @table = @compfile.read_from "#{@fib.fibBase.bfA.fWhichTblStm.to_i.to_s}Table"
  blob = @fib.fibRgFcLcbBlob
  # read Clx array
  @clx = @table[blob.fcClx..blob.fcClx + blob.lcbClx - 1]
  read_clx
  # read FldMom - fields of main document
  @fldcps, @fldflds = read_plex @table, blob.fcPlcfFldMom, blob.lcbPlcfFldMom, 2, Fld
  @fldflds.each { |f| f.fldch = f.fldch & 0x1f }
  # bookmarks [fcSttbfBkmk]
  @mombkmk = read_sttb @table, blob.fcSttbfBkmk, blob.lcbSttbfBkmk
  # bookmarks' plexes
  @plcfbkf_cp, @plcfbkf_fbkf = read_plex @table, blob.fcPlcfBkf, blob.lcbPlcfBkf, 4, FBKF
  @plcfbkl_cp, _unused = read_plex @table, blob.fcPlcfBkl, blob.lcbPlcfBkl, 0, nil

  load_piece_text
  load_chpx_fkps
  scan_main_text
end

# Builds @maindoc (one string per piece) and the @fc_of_cp / @cp_of_fc maps.
def load_piece_text
  @maindoc = []
  @fc_of_cp = {} # file_offsets of each character position in maindoc
  @cp_of_fc = {} # character positions of each file offset in maindoc
  ec = Encoding::Converter.new('UTF-16LE', 'UTF-8')
  @apcd.each_index do |i|
    txln = @acp[i + 1] - @acp[i]
    base = @apcd[i].offset
    if @apcd[i].fCompressed
      width = 1 # one byte per character
      @maindoc << @worddoc[base..base + txln - 1].force_encoding('UTF-8')
    else
      width = 2 # two bytes per character
      @maindoc << ec.convert(@worddoc[base..base + txln * 2 - 1].force_encoding('UTF-16LE'))
    end
    # the range is inclusive, so the position right after the piece is mapped too
    (0..txln).each do |cpi|
      @fc_of_cp[@acp[i] + cpi] = base + cpi * width
      @cp_of_fc[base + cpi * width] = @acp[i] + cpi
    end
  end
end

# Loads every CHPX page referenced by the PlcfBteChpx into @fkpchpx.
def load_chpx_fkps
  @fkpchpx = []
  lcb = @fib.fibRgFcLcbBlob.lcbPlcfBteChpx
  btChpx = read_bytes @table, @fib.fibRgFcLcbBlob.fcPlcfBteChpx, lcb
  # the page numbers follow the array of offsets
  pos = (((lcb - 4) / 8) + 1) * 4
  while pos < lcb
    # only the low 22 bits are the page number
    pn = btChpx[pos..pos + 3].unpack('V')[0] & 0x3fffff
    pos += 4
    @fkpchpx << read_chpx_fkp(pn)
  end
end

# Decodes the 512-byte CHPX page number +pn+ of the WordDocument stream.
# Returns { :rgfc => offsets, :rgb => word offsets, :grpchpx => sprm lists }.
def read_chpx_fkp(pn)
  sector = read_bytes @worddoc, pn * 512, 512
  crun = sector[511].unpack('C')[0] # run count (crun) is the last byte
  rgfc = (0..crun).map { |k| sector[k * 4..k * 4 + 3].unpack('V')[0] }
  rgb = []
  grpchpx = []
  rgb_start = 4 * (crun + 1)
  crun.times do |k|
    wo = sector[rgb_start + k].unpack('C')[0]
    rgb << wo
    if wo != 0
      # the CHPX sits at byte wo*2: a length byte followed by the sprms
      cb = sector[wo * 2].unpack('C')[0]
      grpchpx << parseChpxs(sector[wo * 2 + 1..wo * 2 + 1 + cb - 1])
    else
      grpchpx << [] # empty CHPX (not "modifying")
    end
  end
  { :rgfc => rgfc, :rgb => rgb, :grpchpx => grpchpx }
end

# Walks the main document text and fills @text and @formfields.
def scan_main_text
  @formfields = []
  @text = ""
  cp = 0
  while cp < @fib.fibRgLw97.ccpText
    c = getCharAtCp cp # returned char will be UTF-8
    ci = c.unpack('C')[0]
    if ci == 19
      # FieldBegin mark: a handled form field moves cp to its end mark
      cp = consume_form_field(cp)
    else
      append_plain_char(c, ci)
    end
    cp += 1
  end
end

# Handles the field that starts at +cpfs+. When it is a FORM field with
# readable data, its result is appended to @text / @formfields and the cp
# of the field end mark is returned; otherwise +cpfs+ is returned unchanged.
def consume_form_field(cpfs)
  cpfp = findNextCpWith cpfs, 20 # find separator's Cp
  cpfe = findNextCpWith cpfs, 21 # find end mark's Cp
  fcode = "" # whole field def in worddoc
  (cpfs..cpfe).each { |cpf| fcode << getCharAtCp(cpf) }
  fcode = fcode.force_encoding('UTF-8')
  return cpfs unless fcode['FORM'] # check for FORM* instrText

  cpPic = findNextCpWith cpfs, 1 # search for Picture textmark
  # -1 means "not found" and must not pass the range check
  return cpfs unless cpPic >= 0 && cpPic < cpfe

  fcPic = @fc_of_cp[cpPic]
  chpxpic = getChpxs(fcPic, fcPic + 1)[0]
  return cpfs if chpxpic.nil?

  npabd = getNPABD(chpxpic, @data) # retrieve NilPicfAndBinData
  return cpfs if npabd.nil?

  ffdata = FormFieldData.new
  ffdata.read(npabd[:binData])
  if ffdata.ffdb.iType == 0
    finame = "FT_#{ffdata.xstzName.get}"
    # result text sits between the separator and the end mark
    if cpfp >= 0 && cpfp < cpfe
      firesult = fcode[cpfp - cpfs + 1..fcode.length - 2].to_s
    else
      firesult = ""
    end
    @formfields << { finame.encode('UTF-8') => firesult.encode('UTF-8') }
    @text << firesult.encode('UTF-8')
  elsif ffdata.ffdb.iType == 1
    finame = "FC_#{ffdata.xstzName.get}"
    if ffdata.ffdb.iRes != 25
      firesult = ffdata.ffdb.iRes.to_s
      @text << (firesult == '1' ? '[X]' : '[ ]')
    else
      firesult = ""
      @text << '[ ]'
    end
    @formfields << { finame.encode('UTF-8') => firesult.encode('UTF-8') }
  elsif ffdata.ffdb.iType == 2
    finame = "FL_#{ffdata.xstzName.get}"
    if ffdata.ffdb.iRes != 25 # undef entry
      # entries are 16-bit characters: pack them as code points, not bytes
      firesult = ffdata.hsttbDropList.sttb[ffdata.ffdb.iRes].data.to_a.pack('U*')
      @text << "[#{firesult}]"
    else
      firesult = ""
      @text << "[ ]"
    end
    @formfields << { finame.encode('UTF-8') => firesult.encode('UTF-8') }
  end
  cpfe
end

# Appends the plain character +c+ (first byte value +ci+) to @text,
# recoding special characters into something more readable.
def append_plain_char(c, ci)
  if c == "\b" || ci == 1
    @text << ' [pic] '
  elsif c == "\a" || c == "\r" || c == "\v"
    @text << "\n"
  elsif c == "\t"
    @text << ' '
  else
    begin
      @text << c
    rescue StandardError
      # a character that cannot be appended is replaced by a blank
      @text << ' '
    end
  end
end

private :load_piece_text, :load_chpx_fkps, :read_chpx_fkp,
        :scan_main_text, :consume_form_field, :append_plain_char
# Splits the property exceptions in +buf+ into single sprms and returns the
# list of parseSprm results.
#
# Each sprm starts with a 16-bit little-endian opcode whose top three bits
# (spra) select the operand size. For spra 6 the size is variable:
# * 0xd608 / 0xd606 (sprmTDefTable / sprmTDefTable10): a 16-bit count that
#   includes one extra byte, so the operand is count - 1 bytes long;
# * 0xc615 (sprmPChgTabs): an 8-bit count, where 255 means the size has to
#   be derived from the two tab counters inside the operand;
# * anything else: an 8-bit count.
#
# Parsing stops silently at the first sprm that does not fit into +buf+.
#
# Fixes compared to the previous version: the sprmPChgTabs branch kept the
# count as a one-character String, so the size computation raised
# TypeError and the 255 check could never match. The opcode and the
# 16-bit count are decoded explicitly as little-endian, and truncated
# counts end the parse instead of raising on nil.
def parseChpxs(buf)
  return [] if buf.length == 0

  grpprl = []
  pos = 0
  # every sprm is at least 3 bytes long (opcode + one operand byte)
  while pos + 2 < buf.bytesize
    opcode = buf.byteslice(pos, 2).unpack('v')[0]
    opsize = [1, 1, 2, 4, 2, 2, 255, 3, 0][opcode >> 13]
    len = 0 # number of bytes taken by an explicit size prefix
    if opsize == 255
      if opcode == 0xd608 || opcode == 0xd606 # sprmTDefTable || sprmTDefTable10
        len = 2
        cb = buf.byteslice(pos + 2, 2).unpack('v')[0]
        break if cb.nil?
        opsize = cb - 1
      elsif opcode == 0xc615 # sprmPChgTabs
        len = 1
        opsize = buf.getbyte(pos + 2)
        if opsize == 255
          # NOTE(review): positions and formula are kept from the original
          # code - confirm against the format specification
          itbdDelMax = buf.getbyte(pos + 3)
          break if itbdDelMax.nil?
          itbdAddMax = buf.getbyte(pos + 3 + 2 * itbdDelMax)
          break if itbdAddMax.nil?
          opsize = (itbdDelMax * 4 + itbdAddMax * 3) - 1
        end
      else
        len = 1
        opsize = buf.getbyte(pos + 2)
      end
    end
    sprm_len = 2 + len + opsize
    break if buf.bytesize < pos + sprm_len

    # parse sprm & add to grpprl
    grpprl << parseSprm(buf.byteslice(pos, sprm_len))
    pos += sprm_len
  end
  grpprl
end
# Decodes one sprm from +sbuf+ (16-bit opcode followed by its operand).
#
# Returns a hash with
#   :opcode - SprmOpcode filled with ispmd / fSpec / sType / spra
#   :opc    - the raw 16-bit opcode
#   :opsize - operand size; stays 255 for sprmTDefTable, sprmTDefTable10
#             and sprmPChgTabs, whose real length is stored in the operand
#   :args   - the operand bytes
def parseSprm(sbuf)
  raw = sbuf[0..1].unpack('S')[0]

  op = SprmOpcode.new
  op.ispmd = raw & 0x1ff    # bits 0..8
  op.fSpec = (raw >> 9) & 1 # bit 9
  op.sType = (raw >> 10) & 7 # bits 10..12
  op.spra = (raw >> 13) & 7  # bits 13..15

  opsize = [1,1,2,4,2,2,255,3,0][op.spra]
  args =
    if opsize != 255
      # fixed-size operand directly after the opcode
      sbuf[2..2 + opsize - 1]
    elsif raw == 0xd608 || raw == 0xd606 # sprmTDefTable || sprmTDefTable10
      tbl_len = sbuf[2..3].unpack('S')[0]
      sbuf[4..4 + tbl_len - 1 - 1] # args are tbl_len-1
    elsif raw == 0xc615 # sprmPChgTabs
      tab_len = sbuf[2].unpack('C')[0]
      sbuf[3..3 + tab_len - 1]
    else
      # generic variable-size operand: one length byte, then the data
      opsize = sbuf[2].unpack('C')[0]
      sbuf[3..3 + opsize - 1]
    end

  { :opcode => op, :opc => raw, :opsize => opsize, :args => args }
end
# Locates the picture data block referenced by the sprm list +grpprl+ and
# cuts its binary payload out of +dataStream+.
#
# The location is the 32-bit operand of a sprm with opcode 0x6A03
# (sprmCPicLocation) or 0x6A12 (labelled sprmCHsp in this file); when
# several are present the last one wins. At that location the stream
# holds a 32-bit total length (lcb), a 16-bit header length (cbHeader)
# and 62 further bytes, followed by lcb - cbHeader payload bytes.
#
# Returns { :lcb, :cbHdr, :binData }, or nil when no location sprm exists.
def getNPABD(grpprl, dataStream)
  locations = []
  grpprl.each do |sprm|
    #puts "|| #{sprm.inspect}"
    next unless sprm[:opc] == 0x6A03 || sprm[:opc] == 0x6A12

    locations << sprm[:args][0..3].unpack('L')[0]
  end
  return nil if locations.empty?

  head = locations.last
  lcb = dataStream[head..head + 3].unpack('L')[0]
  cbHeader = dataStream[head + 4..head + 5].unpack('S')[0]
  # 4 bytes lcb + 2 bytes cbHeader + 62 skipped bytes
  start = head + 4 + 2 + 62
  binData = dataStream[start..start + lcb - cbHeader - 1]
  { :lcb => lcb, :cbHdr => cbHeader, :binData => binData }
end
# Collects the CHPXs of all runs that touch the file offset range
# fcmin...fcmax.
#
# A run j of a page spans rgfc[j]...rgfc[j+1]. It is selected when it
# starts before +fcmin+ and ends after it, or when it starts inside
# fcmin...fcmax. Returns the matching :grpchpx entries in page order.
def getChpxs(fcmin, fcmax)
  @fkpchpx.each_with_object([]) do |fkp, hits|
    starts = fkp[:rgfc]
    fkp[:grpchpx].each_with_index do |chpx, j|
      if (starts[j] < fcmin && starts[j + 1] > fcmin) ||
         (starts[j] >= fcmin && starts[j] < fcmax)
        hits << chpx
      end
    end
  end
end
# Finds the first character position at or after +startCp+ whose
# character has the byte value +char+.
#
# @param startCp - where to start with search
# @param char - byte value (Integer) of the character to look for
# @return the matching Cp, or -1 (invalid Cp) when the main document text
#   (positions below ccpText) does not contain it
def findNextCpWith(startCp, char)
  last = @fib.fibRgLw97.ccpText - 1
  startCp.upto(last) do |cp|
    return cp if getCharAtCp(cp).unpack('C')[0] == char
  end
  # character not found - return invalid CP
  -1
end
# Returns the piece descriptor whose character range contains +cp+.
#
# Piece i covers the positions @acp[i] (inclusive) up to @acp[i+1]
# (exclusive). Returns nil when no piece covers +cp+; the previous version
# fell through +each_index+ and returned the whole @apcd array in that
# case, which callers then mistook for a descriptor.
def findApcd(cp)
  idx = @apcd.each_index.find { |i| cp >= @acp[i] && cp < @acp[i + 1] }
  idx.nil? ? nil : @apcd[idx]
end
# Returns the character at character position +cp+ as a UTF-8 string.
#
# The piece covering +cp+ decides the storage format: compressed pieces
# hold one byte per character, read here as ISO-8859-2; other pieces hold
# one 16-bit little-endian unit per character.
# NOTE(review): ISO-8859-2 is kept from the original code - confirm that
# it is the intended code page for compressed text.
#
# Changes compared to the previous version:
# * a 16-bit unit that is half of a surrogate pair is returned as U+FFFD.
#   Before, a low surrogate raised Encoding::InvalidByteSequenceError and a
#   high surrogate produced an empty string, which broke the
#   "one character per cp" assumption of the callers;
# * the UTF-16 converter is only created when it is needed, instead of on
#   every call;
# * a position that no piece covers raises a descriptive RuntimeError.
def getCharAtCp(cp)
  # find apcd for cp
  piece = findApcd cp
  raise "No piece table entry covers cp=#{cp}" unless piece.respond_to?(:fCompressed)

  fc = @fc_of_cp[cp]
  if piece.fCompressed
    @worddoc[fc].force_encoding('ISO-8859-2').encode('UTF-8')
  else
    unit = @worddoc[fc..fc + 1]
    code = unit.unpack('v')[0]
    # a lone surrogate half cannot be converted on its own
    return [0xFFFD].pack('U') if code && code >= 0xD800 && code <= 0xDFFF

    Encoding::Converter.new('UTF-16LE', 'UTF-8').convert(unit.force_encoding('UTF-16LE'))
  end
end
end