forked from EFForg/https-everywhere
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmetrics.py
More file actions
123 lines (94 loc) · 3.92 KB
/
metrics.py
File metadata and controls
123 lines (94 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#Metrics for measuring similarity of two strings, HTML/XML DOM trees, etc.
#They are not (yet) guarranteed to be "proper metrics" in calculus sense.
from lxml import etree
from cStringIO import StringIO
import struct
import bsdiff4 as bsdiff
import Levenshtein
class Metric(object):
"""Abstract interface for Metric objects."""
def __init__(self):
pass
def distanceNormed(self, s1, s2):
"""Return float distance metric of two strings s1 and s2 in
range [0, 1].
"""
raise NotImlementedError()
class BSDiffMetric(Metric):
"""String similarity metric based on BSDiff."""
def __init__(self):
Metric.__init__(self)
def distanceNormed(self, s1, s2):
if len(s1) == 0 and len(s2) == 0:
return 0
#bsdiff is not symmetric, so take max from both diffs
control, diffBlock, extra = bsdiff.Diff(s1, s2)
extraRatio1 = float(len(extra))/float(max(len(s1), len(s2)))
control, diffBlock, extra = bsdiff.Diff(s2, s1)
extraRatio2 = float(len(extra))/float(max(len(s1), len(s2)))
return max(extraRatio1, extraRatio2)
class MarkupMetric(Metric):
"""Metric for tree-like hierarchical languages - XML, HTML."""
def __init__(self):
Metric.__init__(self)
def tagNameToCharMap(self, doc1, doc2, minIndex=0):
"""Returns a dict that maps element names to unicode characters uniquely.
@param doc1: html/xml tree string as lxml Element or ElementTree
@param doc2: html/xml tree string as lxml Element or ElementTree
@param minIndex: start numbering with this unicode value
"""
tags = set((elem.tag for elem in doc1.xpath("//*")))
tags.update((elem.tag for elem in doc2.xpath("//*")))
#Number them consistently among those two documents.
#Hackish way to map custom alphabet onto unicode chars, but works for
#up to >= 55000 element names which should be more than enough.
unicodeAlphabet = (struct.pack("<H", index).decode('utf-16') \
for index in range(minIndex, minIndex+len(tags)))
numberedTags = zip(tags, unicodeAlphabet)
return dict(numberedTags)
def mapTree(self, elem, tagToCharMap):
"""Map element to unicode character. If it has no children, it'll be mapped
to a single char, otherwise mapped as "(X + Y + Z)" where X, Y, Z is
mapping of its children (+ is concatenation).
@param elem: lxml Element
@param tagToCharMap: dict from tag name to unicode char
"""
#this isinstance test is in lxml tutorial, dunno how else to skip comments
children = [child for child in list(elem) if isinstance(child.tag, basestring)]
thisElem = tagToCharMap[elem.tag]
if children:
childrenMap = [self.mapTree(child, tagToCharMap) for child in children]
return thisElem + u'(' + "".join(childrenMap) + u')'
else:
return thisElem
def mappedTrees(self, doc1, doc2):
"""Returns unicode string that represents the tree structure of
the HTML/XML documents. Only tag names are considered.
@returns: tuple of two unicode strings
"""
#The 42 is first char after parentheses in unicode encoding
tagToCharMap = self.tagNameToCharMap(doc1, doc2, 42)
return (self.mapTree(doc1, tagToCharMap), self.mapTree(doc2, tagToCharMap))
def distanceNormed(self, s1, s2):
"""
"""
#Empty strings are not proper HTML/XML, but we can consider them
#same for our purpose.
if len(s1) == 0 and len(s2) == 0:
return 0
if s1 == s2:
return 0
try:
doc1 = etree.parse(StringIO(s1), etree.HTMLParser())
doc2 = etree.parse(StringIO(s2), etree.HTMLParser())
except:
# Some documents don't parse as XML. In that case, punt and return 0
# distance.
return 0
# If we failed to parse either document, return max Levenshtein distance.
# Note this will happen for non-HTML documents like favicons.
# Identical documents should hit the equality check before parsing.
if not doc1 or not doc2 or doc1.getroot() is None or doc2.getroot() is None:
return 1
mapped1, mapped2 = self.mappedTrees(doc1.getroot(), doc2.getroot())
return 1.0-Levenshtein.ratio(mapped1, mapped2)