+
+
+
+Parsing untrusted XML files with a weakly configured XML parser may lead to attacks such as XML External Entity (XXE),
+Billion Laughs, Quadratic Blowup and DTD retrieval.
+This type of attack uses external entity references to access arbitrary files on a system, carry out denial of
+service, or server side request forgery. Even when the result of parsing is not returned to the user, out-of-band
+data retrieval techniques may allow attackers to steal sensitive data. Denial of service can also be carried out
+in this situation.
+
+
+
+
+
+Use defusedxml, a Python package aimed
+at preventing potentially malicious operations.
+
+
+
+
+
+The following example calls xml.etree.ElementTree.fromstring on untrusted data using a parser (lxml.etree.XMLParser)
+that is not safely configured, and is therefore inherently unsafe.
+
+
+
+If an input (xml_content) like the following XML content is sent to /bad, the response will contain the contents of
+/etc/passwd.
+
+
+
+
+
+Python 3 XML Vulnerabilities.
+Python 2 XML Vulnerabilities.
+Python XML Parsing.
+OWASP vulnerability description: XML External Entity (XXE) Processing.
+OWASP guidance on parsing xml files: XXE Prevention Cheat Sheet.
+Paper by Timothy Morgan: XML Schema, DTD, and Entity Attacks
+Out-of-band data retrieval: Timur Yunusov & Alexey Osipov, Black hat EU 2013: XML Out-Of-Band Data Retrieval.
+Denial of service attack (Billion laughs): Billion Laughs.
+
+
+
diff --git a/python/ql/src/experimental/Security/CWE-611/XmlEntityInjection.ql b/python/ql/src/experimental/Security/CWE-611/XmlEntityInjection.ql
new file mode 100644
index 000000000000..922ca346b173
--- /dev/null
+++ b/python/ql/src/experimental/Security/CWE-611/XmlEntityInjection.ql
@@ -0,0 +1,31 @@
+/**
+ * @name XML Entity injection
+ * @description User input should not be parsed allowing the injection of entities.
+ * @kind path-problem
+ * @problem.severity error
+ * @id py/xml-entity-injection
+ * @tags security
+ * external/cwe/cwe-611
+ * external/cwe/cwe-776
+ * external/cwe/cwe-827
+ */
+
+// TODO: determine the precision of this query and add it to the metadata above
+import python
+import experimental.semmle.python.security.dataflow.XmlEntityInjection
+import DataFlow::PathGraph
+// NOTE: `strictconcat` has no result for an empty set, so a sink is only reported if it has at least one vulnerability kind.
+from
+ XmlEntityInjection::XmlEntityInjectionConfiguration config, DataFlow::PathNode source,
+ DataFlow::PathNode sink, string kinds
+where
+ config.hasFlowPath(source, sink) and
+ kinds =
+ strictconcat(string kind |
+ kind = sink.getNode().(XmlEntityInjection::Sink).getVulnerableKind()
+ |
+ kind, ", "
+ )
+select sink.getNode(), source, sink,
+ "$@ XML input is constructed from a $@ and is vulnerable to: " + kinds + ".", sink.getNode(),
+ "This", source.getNode(), "user-provided value"
diff --git a/python/ql/src/experimental/semmle/python/Concepts.qll b/python/ql/src/experimental/semmle/python/Concepts.qll
index bb83fbda5838..ce5617071845 100644
--- a/python/ql/src/experimental/semmle/python/Concepts.qll
+++ b/python/ql/src/experimental/semmle/python/Concepts.qll
@@ -14,6 +14,74 @@ private import semmle.python.dataflow.new.RemoteFlowSources
private import semmle.python.dataflow.new.TaintTracking
private import experimental.semmle.python.Frameworks
+/**
+ * Since there is an `XML` module in both the normal and the experimental Concepts,
+ * the experimental module is named `ExperimentalXML` to avoid a name clash.
+ */
+module ExperimentalXML {
+ /**
+ * A kind of XML vulnerability.
+ *
+ * See https://pypi.org/project/defusedxml/#python-xml-libraries
+ */
+ class XMLVulnerabilityKind extends string {
+ XMLVulnerabilityKind() {
+ this in ["Billion Laughs", "Quadratic Blowup", "XXE", "DTD retrieval"]
+ }
+
+ /** Holds for Billion Laughs vulnerability kind. */
+ predicate isBillionLaughs() { this = "Billion Laughs" }
+
+ /** Holds for Quadratic Blowup vulnerability kind. */
+ predicate isQuadraticBlowup() { this = "Quadratic Blowup" }
+
+ /** Holds for XXE vulnerability kind. */
+ predicate isXxe() { this = "XXE" }
+
+ /** Holds for DTD retrieval vulnerability kind. */
+ predicate isDtdRetrieval() { this = "DTD retrieval" }
+ }
+
+ /**
+ * A data-flow node that parses XML.
+ *
+ * Extend this class to refine existing API models. If you want to model new APIs,
+ * extend `XMLParsing::Range` instead.
+ */
+ class XMLParsing extends DataFlow::Node instanceof XMLParsing::Range {
+ /**
+ * Gets the argument containing the content to parse.
+ */
+ DataFlow::Node getAnInput() { result = super.getAnInput() }
+
+ /**
+ * Holds if this XML parsing is vulnerable to `kind`.
+ */
+ predicate vulnerableTo(XMLVulnerabilityKind kind) { super.vulnerableTo(kind) }
+ }
+
+ /** Provides classes for modeling XML parsing APIs. */
+ module XMLParsing {
+ /**
+ * A data-flow node that parses XML.
+ *
+ * Extend this class to model new APIs. If you want to refine existing API models,
+ * extend `XMLParsing` instead.
+ */
+ abstract class Range extends DataFlow::Node {
+ /**
+ * Gets the argument containing the content to parse.
+ */
+ abstract DataFlow::Node getAnInput();
+
+ /**
+ * Holds if this XML parsing is vulnerable to `kind`.
+ */
+ abstract predicate vulnerableTo(XMLVulnerabilityKind kind);
+ }
+ }
+}
+
/** Provides classes for modeling LDAP query execution-related APIs. */
module LDAPQuery {
/**
diff --git a/python/ql/src/experimental/semmle/python/Frameworks.qll b/python/ql/src/experimental/semmle/python/Frameworks.qll
index 81b2c1bee23d..edbed61c41c4 100644
--- a/python/ql/src/experimental/semmle/python/Frameworks.qll
+++ b/python/ql/src/experimental/semmle/python/Frameworks.qll
@@ -3,6 +3,7 @@
*/
private import experimental.semmle.python.frameworks.Stdlib
+private import experimental.semmle.python.frameworks.Xml
private import experimental.semmle.python.frameworks.Flask
private import experimental.semmle.python.frameworks.Django
private import experimental.semmle.python.frameworks.Werkzeug
diff --git a/python/ql/src/experimental/semmle/python/frameworks/Xml.qll b/python/ql/src/experimental/semmle/python/frameworks/Xml.qll
new file mode 100644
index 000000000000..a2f36f66f2e3
--- /dev/null
+++ b/python/ql/src/experimental/semmle/python/frameworks/Xml.qll
@@ -0,0 +1,466 @@
+/**
+ * Provides class and predicates to track external data that
+ * may represent malicious XML objects.
+ */
+
+private import python
+private import semmle.python.dataflow.new.DataFlow
+private import experimental.semmle.python.Concepts
+private import semmle.python.ApiGraphs
+
+module XML = ExperimentalXML;
+
+private module XmlEtree {
+  /**
+   * Provides models for the parser classes of `xml.etree.ElementTree`.
+   *
+   * See
+   * - https://docs.python.org/3.10/library/xml.etree.elementtree.html#xml.etree.ElementTree.XMLParser
+   * - https://docs.python.org/3.10/library/xml.etree.elementtree.html#xml.etree.ElementTree.XMLPullParser
+   */
+  module XMLParser {
+    /**
+     * A node where an `xml.etree` parser instance originates. Extend this class to
+     * model new instances.
+     *
+     * Such a node can be an instantiation of the class, the return value of a
+     * function call, or a special parameter that is set when a function is called
+     * by an external library.
+     *
+     * Use the predicate `XMLParser::instance()` to get references to instances of `xml.etree` parsers.
+     */
+    abstract class InstanceSource extends DataFlow::LocalSourceNode { }
+
+    /**
+     * A direct instantiation of an `xml.etree` parser (`XMLParser` or `XMLPullParser`).
+     */
+    private class ClassInstantiation extends InstanceSource, DataFlow::CallCfgNode {
+      ClassInstantiation() {
+        exists(API::Node elementTree |
+          elementTree = API::moduleImport("xml").getMember("etree").getMember("ElementTree")
+        |
+          this = elementTree.getMember(["XMLParser", "XMLPullParser"]).getACall()
+        )
+      }
+    }
+
+    /** Gets a reference to an `xml.etree` parser instance, tracked with `t`. */
+    private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t) {
+      t.start() and
+      result instanceof InstanceSource
+      or
+      exists(DataFlow::TypeTracker prev | result = instance(prev).track(prev, t))
+    }
+
+    /** Gets a reference to an `xml.etree` parser instance. */
+    DataFlow::Node instance() { instance(DataFlow::TypeTracker::end()).flowsTo(result) }
+
+    /**
+     * A call to `feed` on a tracked `xml.etree` parser instance.
+     */
+    private class XMLEtreeParserFeedCall extends DataFlow::MethodCallNode, XML::XMLParsing::Range {
+      XMLEtreeParserFeedCall() { this.calls(instance(), "feed") }
+
+      override DataFlow::Node getAnInput() {
+        result = this.getArg(0) or result = this.getArgByName("data")
+      }
+
+      override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+        kind.isQuadraticBlowup() or kind.isBillionLaughs()
+      }
+    }
+  }
+
+  /**
+   * A call to one of the module-level parsing functions of `xml.etree.ElementTree`:
+   * - `xml.etree.ElementTree.fromstring`
+   * - `xml.etree.ElementTree.fromstringlist`
+   * - `xml.etree.ElementTree.XML`
+   * - `xml.etree.ElementTree.XMLID`
+   * - `xml.etree.ElementTree.parse`
+   * - `xml.etree.ElementTree.iterparse`
+   */
+  private class XMLEtreeParsing extends DataFlow::CallCfgNode, XML::XMLParsing::Range {
+    XMLEtreeParsing() {
+      exists(API::Node elementTree |
+        elementTree = API::moduleImport("xml").getMember("etree").getMember("ElementTree")
+      |
+        this =
+          elementTree
+              .getMember(["fromstring", "fromstringlist", "XML", "XMLID", "parse", "iterparse"])
+              .getACall()
+      )
+    }
+
+    override DataFlow::Node getAnInput() {
+      result = this.getArg(0)
+      or
+      // keyword name used by fromstring / XML / XMLID
+      result = this.getArgByName("text")
+      or
+      // keyword name used by fromstringlist
+      result = this.getArgByName("sequence")
+      or
+      // keyword name used by parse / iterparse
+      result = this.getArgByName("source")
+    }
+
+    override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+      // note: it does not matter what `xml.etree` parser you are using, you cannot
+      // change the security features anyway :|
+      kind.isQuadraticBlowup() or kind.isBillionLaughs()
+    }
+  }
+}
+
+private module SaxBasedParsing {
+ /**
+ * A call to the `setFeature` method on an XML SAX parser obtained from `xml.sax.make_parser`.
+ *
+ * See https://docs.python.org/3.10/library/xml.sax.reader.html#xml.sax.xmlreader.XMLReader.setFeature
+ */
+ class SaxParserSetFeatureCall extends DataFlow::MethodCallNode {
+ SaxParserSetFeatureCall() {
+ this =
+ API::moduleImport("xml")
+ .getMember("sax")
+ .getMember("make_parser")
+ .getReturn()
+ .getMember("setFeature")
+ .getACall()
+ }
+
+ // The keyword argument names do not match the documentation. It was checked (with Python
+ // 3.9.5) that the names used here actually work.
+ DataFlow::Node getFeatureArg() { result in [this.getArg(0), this.getArgByName("name")] }
+ /** Gets the argument that holds the state the feature is set to (second positional, or `state`). */
+ DataFlow::Node getStateArg() { result in [this.getArg(1), this.getArgByName("state")] }
+ }
+
+  /** Gets a node that, back-tracked with `t`, may reach the `setFeature` state argument `arg`. */
+  private DataFlow::TypeTrackingNode saxParserSetFeatureStateArgBacktracker(
+    DataFlow::TypeBackTracker t, DataFlow::Node arg
+  ) {
+    t.start() and
+    result = arg.getALocalSource() and
+    arg = any(SaxParserSetFeatureCall setFeature).getStateArg()
+    or
+    exists(DataFlow::TypeBackTracker later |
+      result = saxParserSetFeatureStateArgBacktracker(later, arg).backtrack(later, t)
+    )
+  }
+
+  /** Gets a local source node whose value may end up as the `setFeature` state argument `arg`. */
+  DataFlow::LocalSourceNode saxParserSetFeatureStateArgBacktracker(DataFlow::Node arg) {
+    result = saxParserSetFeatureStateArgBacktracker(DataFlow::TypeBackTracker::end(), arg)
+  }
+
+ /**
+ * Gets a reference, tracked with `t`, to an XML SAX parser that has `feature_external_ges` turned on.
+ *
+ * See https://docs.python.org/3/library/xml.sax.handler.html#xml.sax.handler.feature_external_ges
+ */
+ private DataFlow::Node saxParserWithFeatureExternalGesTurnedOn(DataFlow::TypeTracker t) {
+ t.start() and
+ exists(SaxParserSetFeatureCall call |
+ call.getFeatureArg() =
+ API::moduleImport("xml")
+ .getMember("sax")
+ .getMember("handler")
+ .getMember("feature_external_ges")
+ .getAUse() and
+ saxParserSetFeatureStateArgBacktracker(call.getStateArg())
+ .asExpr()
+ .(BooleanLiteral)
+ .booleanValue() = true and
+ result = call.getObject()
+ )
+ or
+ exists(DataFlow::TypeTracker t2 |
+ t = t2.smallstep(saxParserWithFeatureExternalGesTurnedOn(t2), result)
+ ) and
+ // step case only (`and` binds tighter than `or`): the feature can be set to False again, which makes the parser safe
+ not exists(SaxParserSetFeatureCall call |
+ call.getObject() = result and
+ call.getFeatureArg() =
+ API::moduleImport("xml")
+ .getMember("sax")
+ .getMember("handler")
+ .getMember("feature_external_ges")
+ .getAUse() and
+ saxParserSetFeatureStateArgBacktracker(call.getStateArg())
+ .asExpr()
+ .(BooleanLiteral)
+ .booleanValue() = false
+ )
+ }
+
+ /**
+ * Gets a reference to an XML SAX parser that has `feature_external_ges` turned on.
+ *
+ * See https://docs.python.org/3/library/xml.sax.handler.html#xml.sax.handler.feature_external_ges
+ */
+ DataFlow::Node saxParserWithFeatureExternalGesTurnedOn() {
+ result = saxParserWithFeatureExternalGesTurnedOn(DataFlow::TypeTracker::end())
+ }
+
+  /**
+   * A call to the `parse` method of a parser returned by `xml.sax.make_parser`.
+   */
+  private class XMLSaxInstanceParsing extends DataFlow::MethodCallNode, XML::XMLParsing::Range {
+    XMLSaxInstanceParsing() {
+      exists(API::Node saxParser |
+        saxParser = API::moduleImport("xml").getMember("sax").getMember("make_parser").getReturn()
+      |
+        this = saxParser.getMember("parse").getACall()
+      )
+    }
+
+    override DataFlow::Node getAnInput() {
+      result = this.getArg(0) or result = this.getArgByName("source")
+    }
+
+    override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+      // entity expansion attacks are always possible
+      kind.isBillionLaughs() or kind.isQuadraticBlowup()
+      or
+      // the remaining kinds only apply once `feature_external_ges` has been turned on
+      (kind.isXxe() or kind.isDtdRetrieval()) and
+      this.getObject() = saxParserWithFeatureExternalGesTurnedOn()
+    }
+  }
+
+  /**
+   * A call to either `parse` or `parseString` from `xml.sax` module.
+   *
+   * Both are module-level functions, so they are modeled as plain calls rather than
+   * method calls; this also covers `from xml.sax import parse; parse(...)`.
+   *
+   * See:
+   * - https://docs.python.org/3.10/library/xml.sax.html#xml.sax.parse
+   * - https://docs.python.org/3.10/library/xml.sax.html#xml.sax.parseString
+   */
+  private class XMLSaxParsing extends DataFlow::CallCfgNode, XML::XMLParsing::Range {
+    XMLSaxParsing() {
+      this =
+        API::moduleImport("xml").getMember("sax").getMember(["parse", "parseString"]).getACall()
+    }
+
+    override DataFlow::Node getAnInput() {
+      result in [
+          this.getArg(0),
+          // parseString
+          this.getArgByName("string"),
+          // parse
+          this.getArgByName("source"),
+        ]
+    }
+
+    override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+      // Always vulnerable to entity expansion. Neither function takes a parser argument,
+      // so there is no parser object whose `setFeature` state could be consulted here.
+      kind.isBillionLaughs() or kind.isQuadraticBlowup()
+    }
+  }
+
+  /**
+   * A call to `parse` or `parseString` from `xml.dom.minidom` or `xml.dom.pulldom`.
+   *
+   * Both of these modules are based on SAX parsers.
+   */
+  private class XMLDomParsing extends DataFlow::CallCfgNode, XML::XMLParsing::Range {
+    XMLDomParsing() {
+      exists(API::Node domModule |
+        domModule = API::moduleImport("xml").getMember("dom").getMember(["minidom", "pulldom"])
+      |
+        this = domModule.getMember(["parse", "parseString"]).getACall()
+      )
+    }
+
+    override DataFlow::Node getAnInput() {
+      result = this.getArg(0)
+      or
+      // keyword names: `string` (parseString), `file` (minidom.parse),
+      // `stream_or_string` (pulldom.parse)
+      result = this.getArgByName(["string", "file", "stream_or_string"])
+    }
+
+    /** Gets the argument that supplies the SAX parser to use, if any. */
+    DataFlow::Node getParserArg() { result in [this.getArg(1), this.getArgByName("parser")] }
+
+    override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+      // entity expansion attacks are always possible
+      kind.isBillionLaughs()
+      or
+      kind.isQuadraticBlowup()
+      or
+      // the remaining kinds need a parser argument with `feature_external_ges` turned on
+      (kind.isXxe() or kind.isDtdRetrieval()) and
+      this.getParserArg() = saxParserWithFeatureExternalGesTurnedOn()
+    }
+  }
+}
+
+private module Lxml {
+ /**
+ * Provides models for `lxml.etree` parsers.
+ *
+ * See https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.XMLParser
+ */
+ module XMLParser {
+ /**
+ * A source of instances of `lxml.etree` parsers, extend this class to model new instances.
+ *
+ * This can include instantiations of the class, return values from function
+ * calls, or a special parameter that will be set when functions are called by an external
+ * library.
+ *
+ * Use `XMLParser::instance(origin)` or `XMLParser::instanceVulnerableTo(kind)` to get references to instances.
+ */
+ abstract class InstanceSource extends DataFlow::LocalSourceNode {
+ /** Holds if this instance is vulnerable to `kind`. */
+ abstract predicate vulnerableTo(XML::XMLVulnerabilityKind kind);
+ }
+
+ /**
+ * A call to `lxml.etree.XMLParser`.
+ *
+ * See https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.XMLParser
+ */
+ private class LXMLParser extends InstanceSource, DataFlow::CallCfgNode {
+ LXMLParser() {
+ this = API::moduleImport("lxml").getMember("etree").getMember("XMLParser").getACall()
+ }
+
+ // NOTE: it's not possible to change settings of a parser after constructing it
+ override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+ kind.isXxe() and
+ (
+ // resolve_entities has default True
+ not exists(this.getArgByName("resolve_entities"))
+ or
+ this.getArgByName("resolve_entities").getALocalSource().asExpr() = any(True t)
+ )
+ or
+ (kind.isBillionLaughs() or kind.isQuadraticBlowup()) and
+ this.getArgByName("huge_tree").getALocalSource().asExpr() = any(True t) and
+ not this.getArgByName("resolve_entities").getALocalSource().asExpr() = any(False t)
+ or
+ kind.isDtdRetrieval() and
+ this.getArgByName("load_dtd").getALocalSource().asExpr() = any(True t) and
+ this.getArgByName("no_network").getALocalSource().asExpr() = any(False t)
+ }
+ }
+
+ /**
+ * A call to `lxml.etree.get_default_parser`.
+ *
+ * See https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.get_default_parser
+ */
+ private class LXMLDefaultParser extends InstanceSource, DataFlow::CallCfgNode {
+ LXMLDefaultParser() {
+ this =
+ API::moduleImport("lxml").getMember("etree").getMember("get_default_parser").getACall()
+ }
+
+ override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+ // as highlighted by
+ // https://lxml.de/apidoc/lxml.etree.html?highlight=xmlparser#lxml.etree.XMLParser
+ // XXE is allowed by default, so as long as the default parser has not been
+ // overridden, the result is also vulnerable to XXE.
+ kind.isXxe()
+ // TODO: take into account that you can override the default parser with `lxml.etree.set_default_parser`.
+ }
+ }
+
+ /** Gets a reference, tracked with `t`, to an `lxml.etree` parser instance with origin in `origin`. */
+ private DataFlow::TypeTrackingNode instance(DataFlow::TypeTracker t, InstanceSource origin) {
+ t.start() and
+ result = origin
+ or
+ exists(DataFlow::TypeTracker t2 | result = instance(t2, origin).track(t2, t))
+ }
+
+ /** Gets a reference to an `lxml.etree` parser instance, with origin in `origin`. */
+ DataFlow::Node instance(InstanceSource origin) {
+ instance(DataFlow::TypeTracker::end(), origin).flowsTo(result)
+ }
+
+ /** Gets a reference to an `lxml.etree` parser instance, that is vulnerable to `kind`. */
+ DataFlow::Node instanceVulnerableTo(XML::XMLVulnerabilityKind kind) {
+ exists(InstanceSource origin | result = instance(origin) and origin.vulnerableTo(kind))
+ }
+
+ /**
+ * A call to the `feed` method of an `lxml` parser.
+ */
+ private class LXMLParserFeedCall extends DataFlow::MethodCallNode, XML::XMLParsing::Range {
+ LXMLParserFeedCall() { this.calls(instance(_), "feed") }
+
+ override DataFlow::Node getAnInput() { result in [this.getArg(0), this.getArgByName("data")] }
+
+ override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+ this.calls(instanceVulnerableTo(kind), "feed")
+ }
+ }
+ }
+
+  /**
+   * A call to one of the following `lxml.etree` parsing functions:
+   * - `lxml.etree.fromstring`
+   * - `lxml.etree.fromstringlist`
+   * - `lxml.etree.XML`
+   * - `lxml.etree.parse`
+   * - `lxml.etree.parseid`
+   *
+   * See https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstring
+   */
+  private class LXMLParsing extends DataFlow::CallCfgNode, XML::XMLParsing::Range {
+    LXMLParsing() {
+      exists(API::Node etree |
+        etree = API::moduleImport("lxml").getMember("etree")
+      |
+        this =
+          etree.getMember(["fromstring", "fromstringlist", "XML", "parse", "parseid"]).getACall()
+      )
+    }
+
+    override DataFlow::Node getAnInput() {
+      result = this.getArg(0)
+      or
+      // keyword names: `text` (fromstring / XML), `strings` (fromstringlist),
+      // `source` (parse / parseid)
+      result = this.getArgByName(["text", "strings", "source"])
+    }
+
+    /** Gets the argument that supplies the parser to use, if any. */
+    DataFlow::Node getParserArg() { result in [this.getArg(1), this.getArgByName("parser")] }
+
+    override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+      // an explicitly passed parser determines what the call is vulnerable to
+      XMLParser::instanceVulnerableTo(kind) = this.getParserArg()
+      or
+      // without an explicit parser, the call is treated as vulnerable to XXE
+      not exists(this.getParserArg()) and
+      kind.isXxe()
+    }
+  }
+}
+
+private module Xmltodict {
+  /**
+   * A call to `xmltodict.parse`; only flagged when `disable_entities=False` is passed by keyword.
+   */
+  private class XMLtoDictParsing extends DataFlow::CallCfgNode, XML::XMLParsing::Range {
+    XMLtoDictParsing() { this = API::moduleImport("xmltodict").getMember("parse").getACall() }
+
+    override DataFlow::Node getAnInput() {
+      result = this.getArg(0) or result = this.getArgByName("xml_input")
+    }
+
+    override predicate vulnerableTo(XML::XMLVulnerabilityKind kind) {
+      this.getArgByName("disable_entities").getALocalSource().asExpr() instanceof False and
+      (kind.isQuadraticBlowup() or kind.isBillionLaughs())
+    }
+  }
+}
diff --git a/python/ql/src/experimental/semmle/python/security/dataflow/XmlEntityInjection.qll b/python/ql/src/experimental/semmle/python/security/dataflow/XmlEntityInjection.qll
new file mode 100644
index 000000000000..35220e153d12
--- /dev/null
+++ b/python/ql/src/experimental/semmle/python/security/dataflow/XmlEntityInjection.qll
@@ -0,0 +1,28 @@
+import python
+import experimental.semmle.python.Concepts
+import semmle.python.dataflow.new.DataFlow
+import semmle.python.dataflow.new.TaintTracking
+import semmle.python.dataflow.new.RemoteFlowSources
+import semmle.python.dataflow.new.BarrierGuards
+
+module XmlEntityInjection {
+ import XmlEntityInjectionCustomizations::XmlEntityInjection
+ /** A taint-tracking configuration for detecting XML entity injection vulnerabilities. */
+ class XmlEntityInjectionConfiguration extends TaintTracking::Configuration {
+ XmlEntityInjectionConfiguration() { this = "XmlEntityInjectionConfiguration" }
+
+ override predicate isSource(DataFlow::Node source) {
+ source instanceof RemoteFlowSourceAsSource
+ }
+
+ override predicate isSink(DataFlow::Node sink) { sink instanceof Sink }
+
+ override predicate isSanitizerGuard(DataFlow::BarrierGuard guard) {
+ guard instanceof SanitizerGuard
+ }
+
+ override predicate isAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
+ any(AdditionalTaintStep s).step(nodeFrom, nodeTo)
+ }
+ }
+}
diff --git a/python/ql/src/experimental/semmle/python/security/dataflow/XmlEntityInjectionCustomizations.qll b/python/ql/src/experimental/semmle/python/security/dataflow/XmlEntityInjectionCustomizations.qll
new file mode 100644
index 000000000000..e420c738a978
--- /dev/null
+++ b/python/ql/src/experimental/semmle/python/security/dataflow/XmlEntityInjectionCustomizations.qll
@@ -0,0 +1,86 @@
+/**
+ * Provides default sources, sinks and sanitizers for detecting
+ * "xml injection"
+ * vulnerabilities, as well as extension points for adding your own.
+ */
+
+private import python
+private import semmle.python.dataflow.new.DataFlow
+private import experimental.semmle.python.Concepts
+private import semmle.python.dataflow.new.RemoteFlowSources
+private import semmle.python.dataflow.new.BarrierGuards
+private import semmle.python.ApiGraphs
+
+/**
+ * Provides default sources, sinks and sanitizers for detecting "xml injection"
+ * vulnerabilities, as well as extension points for adding your own.
+ */
+module XmlEntityInjection {
+  /**
+   * A data flow source for "xml injection" vulnerabilities.
+   */
+  abstract class Source extends DataFlow::Node { }
+
+  /**
+   * A data flow sink for "xml injection" vulnerabilities.
+   */
+  abstract class Sink extends DataFlow::Node {
+    /** Gets the kind of XML injection that this sink is vulnerable to. */
+    abstract string getVulnerableKind();
+  }
+
+  /**
+   * A sanitizer guard for "xml injection" vulnerabilities.
+   */
+  abstract class SanitizerGuard extends DataFlow::BarrierGuard { }
+
+  /**
+   * A unit class for adding additional taint steps.
+   *
+   * Extend this class to add additional taint steps that should apply to `XmlEntityInjection`
+   * taint configuration.
+   */
+  class AdditionalTaintStep extends Unit {
+    /**
+     * Holds if the step from `nodeFrom` to `nodeTo` should be considered a taint
+     * step for `XmlEntityInjection` configuration.
+     */
+    abstract predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo);
+  }
+
+  /**
+   * An input to a direct XML parsing function, considered as a flow sink.
+   *
+   * See `ExperimentalXML::XMLParsing`.
+   */
+  class XMLParsingInputAsSink extends Sink {
+    ExperimentalXML::XMLParsing xmlParsing;
+
+    XMLParsingInputAsSink() { this = xmlParsing.getAnInput() }
+
+    override string getVulnerableKind() { xmlParsing.vulnerableTo(result) }
+  }
+
+  /**
+   * A source of remote user input, considered as a flow source.
+   */
+  class RemoteFlowSourceAsSource extends Source, RemoteFlowSource { }
+
+  /**
+   * A comparison with a constant string, considered as a sanitizer-guard.
+   */
+  class StringConstCompareAsSanitizerGuard extends SanitizerGuard, StringConstCompare { }
+
+  /**
+   * A taint step from the initial content of an `io.StringIO` / `io.BytesIO` call to the created object.
+   */
+  class IoAdditionalTaintStep extends AdditionalTaintStep {
+    override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
+      exists(DataFlow::CallCfgNode ioCall | nodeTo = ioCall |
+        ioCall = API::moduleImport("io").getMember(["StringIO", "BytesIO"]).getACall() and
+        // first positional argument, or the `initial_value` (StringIO) / `initial_bytes` (BytesIO) keyword
+        nodeFrom in [ioCall.getArg(0), ioCall.getArgByName(["initial_value", "initial_bytes"])]
+      )
+    }
+  }
+}
diff --git a/python/ql/test/experimental/library-tests/frameworks/XML/ExperimentalXmlConceptsTests.expected b/python/ql/test/experimental/library-tests/frameworks/XML/ExperimentalXmlConceptsTests.expected
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/ql/test/experimental/library-tests/frameworks/XML/ExperimentalXmlConceptsTests.ql b/python/ql/test/experimental/library-tests/frameworks/XML/ExperimentalXmlConceptsTests.ql
new file mode 100644
index 000000000000..81bc391d0e55
--- /dev/null
+++ b/python/ql/test/experimental/library-tests/frameworks/XML/ExperimentalXmlConceptsTests.ql
@@ -0,0 +1,33 @@
+import python
+import experimental.semmle.python.Concepts
+import experimental.semmle.python.frameworks.Xml
+import semmle.python.dataflow.new.DataFlow
+import TestUtilities.InlineExpectationsTest
+private import semmle.python.dataflow.new.internal.PrintNode
+
+class XmlParsingTest extends InlineExpectationsTest {
+  XmlParsingTest() { this = "XmlParsingTest" }
+
+  override string getARelevantTag() { result = "input" or result = "vuln" }
+
+  override predicate hasActualResult(Location location, string element, string tag, string value) {
+    exists(location.getFile().getRelativePath()) and
+    exists(XML::XMLParsing parsing |
+      // `input`: one result per node that the parsing call takes as XML content
+      tag = "input" and
+      exists(DataFlow::Node input | input = parsing.getAnInput() |
+        location = input.getLocation() and
+        element = input.toString() and
+        value = prettyNodeForInlineTest(input)
+      )
+      or
+      // `vuln`: one result per vulnerability kind of the parsing call itself
+      tag = "vuln" and
+      exists(XML::XMLVulnerabilityKind kind | parsing.vulnerableTo(kind) |
+        location = parsing.getLocation() and
+        element = parsing.toString() and
+        value = "'" + kind + "'"
+      )
+    )
+  }
+}
diff --git a/python/ql/test/experimental/library-tests/frameworks/XML/lxml_etree.py b/python/ql/test/experimental/library-tests/frameworks/XML/lxml_etree.py
new file mode 100644
index 000000000000..22930a58af37
--- /dev/null
+++ b/python/ql/test/experimental/library-tests/frameworks/XML/lxml_etree.py
@@ -0,0 +1,54 @@
+from io import StringIO
+import lxml.etree
+
+x = "some xml"
+
+# different parsing methods
+lxml.etree.fromstring(x) # $ input=x vuln='XXE'
+lxml.etree.fromstring(text=x) # $ input=x vuln='XXE'
+
+lxml.etree.fromstringlist([x]) # $ input=List vuln='XXE'
+lxml.etree.fromstringlist(strings=[x]) # $ input=List vuln='XXE'
+
+lxml.etree.XML(x) # $ input=x vuln='XXE'
+lxml.etree.XML(text=x) # $ input=x vuln='XXE'
+
+lxml.etree.parse(StringIO(x)) # $ input=StringIO(..) vuln='XXE'
+lxml.etree.parse(source=StringIO(x)) # $ input=StringIO(..) vuln='XXE'
+
+lxml.etree.parseid(StringIO(x)) # $ input=StringIO(..) vuln='XXE'
+lxml.etree.parseid(source=StringIO(x)) # $ input=StringIO(..) vuln='XXE'
+
+# With default parsers (nothing changed)
+parser = lxml.etree.XMLParser()
+lxml.etree.fromstring(x, parser=parser) # $ input=x vuln='XXE'
+
+parser = lxml.etree.get_default_parser()
+lxml.etree.fromstring(x, parser=parser) # $ input=x vuln='XXE'
+
+# manual use of feed method
+parser = lxml.etree.XMLParser()
+parser.feed(x) # $ input=x vuln='XXE'
+parser.feed(data=x) # $ input=x vuln='XXE'
+parser.close()
+
+# XXE-safe
+parser = lxml.etree.XMLParser(resolve_entities=False)
+lxml.etree.fromstring(x, parser) # $ input=x
+lxml.etree.fromstring(x, parser=parser) # $ input=x
+
+# XXE-vuln
+parser = lxml.etree.XMLParser(resolve_entities=True)
+lxml.etree.fromstring(x, parser=parser) # $ input=x vuln='XXE'
+
+# Billion laughs vuln (also XXE)
+parser = lxml.etree.XMLParser(huge_tree=True)
+lxml.etree.fromstring(x, parser=parser) # $ input=x vuln='Billion Laughs' vuln='Quadratic Blowup' vuln='XXE'
+
+# Safe for both Billion laughs and XXE
+parser = lxml.etree.XMLParser(resolve_entities=False, huge_tree=True)
+lxml.etree.fromstring(x, parser=parser) # $ input=x
+
+# DTD retrieval vuln (also XXE)
+parser = lxml.etree.XMLParser(load_dtd=True, no_network=False)
+lxml.etree.fromstring(x, parser=parser) # $ input=x vuln='DTD retrieval' vuln='XXE'
diff --git a/python/ql/test/experimental/library-tests/frameworks/XML/poc/PoC.py b/python/ql/test/experimental/library-tests/frameworks/XML/poc/PoC.py
new file mode 100644
index 000000000000..adcace1aa0a6
--- /dev/null
+++ b/python/ql/test/experimental/library-tests/frameworks/XML/poc/PoC.py
@@ -0,0 +1,677 @@
+#!/usr/bin/env python3
+
+# this file doesn't have a .py extension so the extractor doesn't pick it up, so it
+# doesn't have to be annotated
+
+# This file shows ways to exploit vulnerable XML parsing
+# see
+# https://pypi.org/project/defusedxml/#python-xml-libraries
+# https://docs.python.org/3.10/library/xml.html#xml-vulnerabilities
+
+import pathlib
+from flask import Flask
+import threading
+import multiprocessing
+import time
+from io import StringIO
+import pytest
+
+HOST = "localhost"
+PORT = 8080
+
+
+FLAG_PATH = pathlib.Path(__file__).with_name("flag")
+
+# ==============================================================================
+# xml samples
+
+ok_xml = f"""
+