Menu

[f7b098]: / htmlparser.py  Maximize  Restore  History

Download this file

372 lines (324 with data), 10.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
from urllib import quote, quote_plus
import re
def unescape(s):
    """Return *s* with HTML entities such as ``&amp;`` decoded.

    On Python 2 the text is fed through ``htmllib.HTMLParser`` in "save"
    mode and the collected character data is returned.  ``htmllib`` was
    removed in Python 3, so when it cannot be imported the function falls
    back to ``html.unescape``.

    NOTE(review): the fallback collapses runs of whitespace to mimic what
    htmllib's save_end() is understood to do; unlike htmllib it does not
    interpret markup, so tags in *s* are returned unchanged.  Confirm this
    is acceptable for callers that pass whole HTML fragments.
    """
    try:
        from htmllib import HTMLParser
    except ImportError:
        # Python 3: htmllib is gone, use the stdlib replacement.
        from html import unescape as html_unescape
        return ' '.join(html_unescape(s).split())
    parser = HTMLParser(None)  # no formatter: only the saved text is of interest
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end()
def find_closing(element, page, start=0):
    """Return the index of the closing tag that ends an already-open element.

    element -- tag name without brackets, e.g. 'div' (case-insensitive)
    page    -- the HTML text to search (matched case-insensitively)
    start   -- index inside the open element; the search begins at start+1,
               so an opening tag located exactly at *start* is not counted

    Children of the same tag name are skipped together with their own
    closing tags.  Returns -1 when the page ends before the element is
    closed.
    """
    element = element.lower()
    page = page.lower()  # lower-cased once, not on every nesting level
    # A child may be written '<tag attr=...>' or plain '<tag>'.  The lookahead
    # keeps longer names such as '<divider>' from counting as '<div'.
    opening_tag = re.compile('<' + re.escape(element) + r'(?=[\s>])')
    closing_tag = '</' + element + '>'
    depth = 1  # number of elements of this name that are still open
    position = start + 1
    while True:
        closing = page.find(closing_tag, position)
        if closing == -1:  # end of page or page corrupted
            return -1
        child = opening_tag.search(page, position)
        if child is None or closing < child.start():
            # the next tag of interest closes the innermost open element
            depth -= 1
            if depth == 0:
                return closing
            position = closing + 1
        else:
            # another element of the same name opens first
            depth += 1
            position = child.start() + 1
def straddle(page, startkey):
    """Split *page* around the content of the element opened by *startkey*.

    startkey -- text of the opening tag (or its beginning), e.g. '<div id="x">'

    Returns a tuple (head, tail): head is the page up to and including
    *startkey*, tail starts at the matching closing tag.  When *startkey*
    does not occur the result is ('', page).  When the element is never
    closed, tail is the empty string.
    """
    p = page.find(startkey)
    if p < 0:
        return '', page
    p += len(startkey)
    tag = startkey.strip()
    end = tag.find(' ')
    if end == -1:
        end = len(tag)
    # '<div>' must yield 'div', not 'div>'
    element = tag[1:end].rstrip('/>')
    # find_closing() starts looking at start+1; passing p-1 makes it look at
    # p itself, so an empty element or a child that begins directly after
    # the start key is not skipped.
    q = find_closing(element, page, p - 1)
    if q < 0:
        # no closing tag: page[-1:] would wrongly return the last character
        return page[:p], ''
    return page[:p], page[q:]
def between(hay, before, after, occurence=1, skip=None, include=False, include_before=False, include_after=False):
    """Return the substring of *hay* found between *before* and *after*.

    hay            -- haystack; converted with str() before searching
    before         -- marker that precedes the wanted text
    after          -- marker that follows it; when it is missing, the text
                      up to the end of the haystack is returned
    occurence      -- 1-based number of the *before* marker to use
    skip           -- when given, overrides occurence with skip+1
    include        -- keep both markers in the result
    include_before -- keep only the leading marker
    include_after  -- keep only the trailing marker

    Returns '' when the requested occurrence of *before* does not exist.
    """
    haystack = str(hay)
    if skip is not None:
        occurence = skip + 1
    start = 0
    for _ in range(1, occurence):
        hit = haystack.find(before, start)
        if hit < 0:
            # Fewer occurrences than requested.  Without this check the
            # search restarted at the beginning and returned an earlier match.
            return ''
        start = hit + 1
    p = haystack.find(before, start)
    if p < 0:
        return ''
    content = p + len(before)
    # Always look for the end marker behind the start marker, even when the
    # start marker is kept in the result.
    q = haystack.find(after, content)
    if q < 0:
        q = len(haystack)
    elif include or include_after:
        q += len(after)
    if include or include_before:
        return haystack[p:q]
    return haystack[content:q]
def betweens(hay, before, after, include=False, include_before=False, include_after=False):
    """Return a list of the substrings of *hay* between *before* and *after*.

    between() is called for occurrence 1, 2, 3, ... with the same include
    flags.  Collection stops at the first empty result, which is returned
    by between() when no further match exists.
    """
    found = []
    number = 1
    while True:
        piece = between(hay, before, after, number, include=include, include_before=include_before, include_after=include_after)
        if piece == '':
            return found
        found.append(piece)
        number += 1
def getflashvar(haystack, key):
    """Return the double-quoted value assigned to ``flashvars.<key>`` in *haystack*."""
    marker = 'flashvars.' + key + '="'
    return between(haystack, marker, '"')
def getvalue(haystack, field):
    """Return the value of attribute *field* in *haystack* ('... field=value ...').

    haystack -- HTML text, typically a single tag
    field    -- attribute name; it is compared against the lower-cased
                haystack, so it should be given in lower case

    The value may be delimited by double quotes, by single quotes, or be
    unquoted (then it ends at the next space or '>').  A value that is not
    terminated runs to the end of the haystack.  Returns '' when the
    attribute is absent.
    """
    lower = haystack.lower().replace(">", " ")
    key = field + "="
    p = lower.find(key)
    # Skip hits that are only the tail of a longer name ('id=' in 'valid=').
    while p > 0 and (lower[p - 1].isalnum() or lower[p - 1] in "-_:"):
        p = lower.find(key, p + 1)
    if p < 0:
        return ""
    p += len(key)
    if p >= len(haystack):
        # 'field=' is the very end of the text: there is no value
        return ""
    delimiter = haystack[p]
    if delimiter == '"' or delimiter == "'":
        # a quote character is the delimiter
        p += 1
        q = lower.find(delimiter, p)
    else:
        # no quotes: a space separates the value from the next field name
        q = lower.find(" ", p)
    if q < 0:
        # unterminated value: slicing with -1 would drop the last character
        q = len(haystack)
    return haystack[p:q]
def convert_to_base(number, base):
    """Return the integer *number* written in radix *base* (2..36).

    Digits above 9 are the lower-case letters a-z.  A negative number gets
    a leading '-'.

    Raises ValueError when *base* is outside 2..36.
    """
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    if base < 2 or base > len(digits):
        raise ValueError("base must be between 2 and 36, got " + str(base))
    if number < 0:
        return '-' + convert_to_base(-number, base)
    collected = []
    while True:
        # divmod keeps the arithmetic in integers on Python 2 and 3 alike
        number, digit = divmod(number, base)
        collected.append(digits[digit])
        if number == 0:
            break
    collected.reverse()  # digits were produced least significant first
    return ''.join(collected)
def toString(number, base):
    """Return *number* written in radix *base*, like JavaScript's toString(radix).

    For base 2..36 the digits are 0-9 followed by a-z.  Base 54 uses a
    separate character mapping.  For any other base above 36 the function
    returns None.

    Raises ValueError when *base* is below 2.
    """
    if base <= 36:  # dictionary size: 36 characters
        if base < 2:
            raise ValueError("base must be at least 2, got " + str(base))
        if number >= base:
            # More than one digit is needed.  The split has to depend on the
            # radix: comparing against 35 turned 12 in base 10 into 'c'.
            return toString(number // base, base) + toString(number % base, base)
        if number < 10:
            return str(number)  # 0-9
        return chr(ord('a') + number - 10)  # a-z
    elif base == 54:
        # NOTE(review): the purpose of this character mapping is not evident
        # from this file; it is kept as it was (only the division is made
        # explicit integer division).
        if number < 59-38+3:
            return chr(number+29)
        if number > 54:
            return toString(number // base, base) + toString(number % base, base)
        return chr(ord('a')+number-(59-38+3))
    return None
def e(c, base):
    """Return the packer key for word number *c* in radix *base*.

    Python version of the JavaScript helper
        e = function(c) {
            return (c < a ? '' : e(parseInt(c / a))) +
                   ((c = c % a) > 35 ? String.fromCharCode(c + 29)
                                     : c.toString(36)) }
    The last digit is always emitted; the recursion only supplies the
    leading digits.  Digit values 0..35 map to 0-9 and a-z, larger digit
    values map to the character with code digit+29 (36 is 'A').
    """
    if c < base:
        prefix = ''
    else:
        prefix = e(c // base, base)
    digit = c % base
    if digit > 35:
        # beyond 0-9 and a-z: take the character with code digit+29
        return prefix + chr(digit + 29)
    # 0-9
    # a-z
    return prefix + '0123456789abcdefghijklmnopqrstuvwxyz'[digit]
# Tag names that are collected while parsing a form, each mapped to the
# attribute names that are read from such a tag.
ElementClassesOfInterest = {
    "input": ["type", "name", "id", "value"],
    "img": ["name", "id", "src"],
}
class Element:  # Form method parse stores its elements as array of Element classes
    """A single HTML tag found inside a form.

    Attributes:
      HTML  -- the tag text the element was built from
      Class -- lower-cased tag name ('' when the text contains no '<')
    For tag names listed in ElementClassesOfInterest, every listed
    attribute that has a non-empty value in the tag is additionally stored
    as an instance attribute of the same name (e.g. name, id, value).
    """

    def __init__(self, HTML):
        self.HTML = HTML
        self.Class = ''
        start = HTML.find("<")
        if start == -1:
            # not a tag at all; Class stays '' so it can still be read safely
            return
        # The tag name ends at the first whitespace, '/' or '>', so '<br>'
        # and '<img/>' are handled as well as '<input type=...>'.
        self.Class = re.match(r'[^\s/>]*', HTML[start + 1:]).group(0).lower()
        if self.Class in ElementClassesOfInterest:
            for field in ElementClassesOfInterest[self.Class]:
                value = getvalue(HTML, field)
                if value != '':
                    setattr(self, field, value)

    def __str__(self):
        return self.HTML
class Form:  # Robot method parse() stores results as array of Form classes
    """An HTML form: its own attributes plus the elements found inside it.

    Attributes:
      name, id, action, method -- attribute values of the <form> tag
                                  ('' when absent)
      one dictionary per key of ElementClassesOfInterest (e.g. input, img),
      mapping each element's name attribute to its Element object
    """

    def __init__(self, HTML=None):
        self.name = self.id = self.action = self.method = ""
        # Make sure the element dictionaries exist even for an empty form,
        # so that POSTdict(), POSTline() and str() work on Form().
        for Class in ElementClassesOfInterest:
            setattr(self, Class, {})
        if HTML is not None:
            self.parse(HTML)

    def __str__(self):
        return self.POSTline()

    def parse(self, HTML):
        """Read the form attributes and the named elements from *HTML*."""
        lower = HTML.lower()  # still contains ">"
        # The form's own attributes are taken from its opening tag only;
        # otherwise the name or id of a child element would be reported for
        # a form that has none.
        start = lower.find("<form")
        if start == -1:
            opening = HTML
        else:
            end = lower.find(">", start)
            if end == -1:
                opening = HTML[start:]
            else:
                opening = HTML[start:end + 1]
        self.name = getvalue(opening, "name")
        self.id = getvalue(opening, "id")
        self.action = getvalue(opening, "action")
        self.method = getvalue(opening, "method")
        for Class in ElementClassesOfInterest:
            found = {}
            setattr(self, Class, found)
            starter = "<" + Class + " "
            p = lower.find(starter)
            while p > -1:
                q = lower.find(">", p) + 1
                if q == 0:
                    # Unterminated tag: take the rest of the text.  Restarting
                    # the search at index 0 would loop forever.
                    q = len(HTML)
                element = Element(HTML[p:q])
                if 'name' in element.__dict__:
                    if 'value' not in element.__dict__:
                        element.value = ''
                    found[element.name] = element
                p = lower.find(starter, q)

    def POSTdict(self):
        """Return a dict mapping each input name to its URL-quoted value."""
        result = {}
        for i in self.input.values():
            result[i.name] = quote_plus(i.value)
        return result

    def POSTline(self):
        """Return the inputs as a 'name=value&name=value' string with quoted values."""
        return '&'.join([i.name + "=" + quote_plus(i.value) for i in self.input.values()])
class Eval:
    """A packed ``eval(function(p,a,c,k,e,d){...}(...))`` JavaScript snippet.

    Algorithm:
    Six variables are handed to a JavaScript decoding function: P, A, C, K,
    E and D.
      P is the cipher text
      A is the size of the key dictionary
      C is the number of the first key that is converted to plain text
      K is the plain-text dictionary
      E and D have no meaning
    """

    def __init__(self, source, debug=False):
        self.source = source
        self.debug = debug

    def __str__(self):
        return self.source

    def _locate(self, needle, start):
        """Return the index of *needle* at or after *start*; raise ValueError if absent."""
        position = self.source.find(needle, start)
        if position == -1:
            raise ValueError("packed script is malformed: %r not found" % needle)
        return position

    def unpack(self):
        """Extract the JS function's arguments and return them as (P, A, C, K).

        Raises ValueError when the source does not have the expected layout.
        """
        source = self.source
        p = self._locate("('", self._locate("return p", 0)) + 2
        q = self._locate("',", p)
        while source[q-1] == '\\':  # it's "\'," instead of "',", so we are still in the middle of P
            q = self._locate("',", q+1)
        P = source[p:q]  # input
        if self.debug:
            print("cipher: "+P)
        q += 2
        r = self._locate(",", q)
        A = int(source[q:r])  # base to convert to
        if self.debug:
            print("base: "+str(A))
        if A > 36:
            # JavaScript toString radix cannot be > 36
            # NOTE(review): scripts packed with a larger base would need a
            # different key encoding -- confirm whether they must be supported.
            A = 36
        r += 1
        s = self._locate(",'", r)
        C = int(source[r:s])  # magic number
        if self.debug:
            print("number: "+str(C))
        s += 2
        t = self._locate("'.split", s)
        K = source[s:t].split("|")  # replacement dictionary
        if self.debug:
            print("dict: "+str(K))
        return P, A, C, K

    def deobfuscate(self):
        """Return the cipher text with every key replaced by its dictionary word.

        Mirrors the JavaScript decoder
            function(p,a,c,k,e,d) {
                while(c--)
                    if(k[c])
                        p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);
                return p
            }
        """
        code, base, number, dictionary = self.unpack()  # p,a,c,k
        # "while(c--)" visits c-1 down to 0; never index past the dictionary
        for index in range(min(number, len(dictionary)) - 1, -1, -1):
            after = dictionary[index]
            if not after:
                # "if(k[c])": an empty entry means the key stands for itself,
                # so it must not be replaced by the empty string
                continue
            before = toString(index, base)
            if self.debug:
                print("Replacing "+str(before)+" by "+str(after)+" ...")
            # A function as replacement inserts the word literally; a plain
            # string would have its backslashes treated as escapes.
            code = re.sub(r'\b' + re.escape(before) + r'\b', lambda match, word=after: word, code)
            if self.debug:
                print(code)
        return code
class EnterCodeBelow:
    """Reads the four-digit code shown below the text 'Enter code below'."""

    def __init__(self, page):
        self.page = str(page)

    def extractCode(self):
        """Return the four digits as a string, ordered by their left padding.

        The section between 'Enter code below' and '</table>' is examined.
        For each of the first four <span> elements the 'padding-left' pixel
        value selects the position of the digit, and the matching '&#NN;'
        character reference (code minus 48) supplies the digit itself.
        """
        section = between(self.page, 'Enter code below', '</table>')
        digits = [0, 0, 0, 0]
        for index in range(4):
            # deobfuscate order
            span = between(section, '<span ', '</span>', skip=index)
            padding = int(between(span, 'padding-left:', 'px'))
            if 0 < padding < 20:
                slot = 0
            elif 20 < padding < 40:
                slot = 1
            elif 40 < padding < 55:
                slot = 2
            else:
                slot = 3
            # deobfuscate value
            reference = between(section, '>&#', ';', skip=index)
            digits[slot] = str(int(reference) - 48)
        return ''.join(digits)
class HTML:
    """An HTML document held as a string, with helpers to find its forms.

    Attributes:
      string -- the document text
      forms  -- list of Form objects, or None until parse() has run
      debug  -- when true, log() prints its messages
    """

    def __init__(self, HTML, debug=False):
        self.string = HTML
        self.forms = None
        self.debug = debug

    def log(self, msg):
        """Print *msg* when debugging is enabled."""
        if self.debug:
            print(msg)

    def __str__(self):
        return self.string

    def lower(self):
        """Return the document text in lower case."""
        return self.string.lower()

    def find(self, needle, start_at=0):
        """Return the index of *needle* at or after *start_at*, or -1."""
        return self.string.find(needle, start_at)

    def parse(self):
        """Collect every <form> of the document into self.forms."""
        self.forms = []
        self.log("Searching document for forms ...")
        lower = self.lower()
        p = lower.find("<form")  # find form
        while p > -1:
            # Slicing yields '' at the end of the document, where indexing
            # would raise IndexError.
            following = lower[p+5:p+6]
            if following == ">" or following.isspace():  # is it "<form " or "<form>" ?
                end = lower.find("</form>", p+5)  # find end of form
                if end == -1:
                    # if end not found, copy until end of file
                    q = len(lower)
                else:
                    q = end + 7
                form = self.string[p:q].strip()
                if len(form) > 0:
                    self.forms.append(Form(form))
            else:
                # some other tag that merely starts with "<form"
                q = p+5
            p = lower.find("<form", q)
        self.log("Found "+str(len(self.forms))+" forms.")

    def findForm(self, name=None, ID=None, action=None, number=None):  # return specified form
        """Return the form selected by the first criterion given, or None.

        Only the first of name, ID and action that is not None is used.
        Without any of them, *number* (1-based) selects the form: None
        gives the first form, a number beyond the end gives the last one.
        """
        if self.forms is None:
            self.parse()  # parse, if not parsed already
        if name is not None:
            for form in self.forms:
                if form.name == name:  # get form by name
                    return form
        elif ID is not None:
            for form in self.forms:
                if form.id == ID:  # get form by ID
                    return form
        elif action is not None:
            for form in self.forms:
                if form.action == action:  # get form by action
                    return form
        elif len(self.forms) > 0:  # if no parameters are given, return the first form
            if number is None:
                # return the first <form>
                return self.forms[0]
            if number > len(self.forms):
                # return last <form>
                return self.forms[-1]
            # return requested <form>
            return self.forms[number-1]
        return None

    def findEval(self, occurence=1):
        """Return an Eval for the *occurence*-th 'eval(' script, or None."""
        o = between(self.string, "eval(", "</script>", occurence).strip()
        if o != '':
            return Eval("eval("+o, debug=False)
        return None

    def save(self, filename=None):
        """Write the document to *filename* (default 'robot.html')."""
        if filename is None:
            filename = "robot.html"
        # the with-block closes the file even when writing fails
        with open(filename, "w") as handle:
            handle.write(self.string)