abstractParser.py
Source: abstractParser.py
# -*- coding: UTF-8 -*-
'''
abstractParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
'''
import core.controllers.outputManager as om
from core.controllers.w3afException import w3afException
from core.data.parsers.encode_decode import htmldecode
from core.data.parsers.urlParser import url_object

import re
import urllib


class abstractParser(object):
    '''
    This class is an abstract document parser.

    @author: Andres Riancho ( andres.riancho@gmail.com )
    '''
    def __init__( self, httpResponse ):
        # "setBaseUrl"
        url = httpResponse.getURL()
        redirURL = httpResponse.getRedirURL()
        if redirURL:
            url = redirURL

        self._baseUrl = url
        self._baseDomain = url.getDomain()
        self._rootDomain = url.getRootDomain()

        # A nice default
        self._encoding = 'utf-8'

        # To store results
        self._emails = []
        self._re_URLs = []

    def findEmails( self , documentString ):
        '''
        @return: A list with all mail users that are present in the documentString.

        Init,
        >>> from core.data.url.httpResponse import httpResponse as httpResponse
        >>> u = url_object('http://www.w3af.com/')
        >>> response = httpResponse( 200, '', {}, u, u )
        >>> a = abstractParser(response)

        First test, no emails.
        >>> a.findEmails( '' )
        []

        >>> a = abstractParser(response)
        >>> a.findEmails( ' abc@w3af.com ' )
        ['abc@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( '<a href="mailto:abc@w3af.com">test</a>' )
        ['abc@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( '<a href="mailto:abc@w3af.com">abc@w3af.com</a>' )
        ['abc@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( '<a href="mailto:abc@w3af.com">abc_def@w3af.com</a>' )
        ['abc@w3af.com', 'abc_def@w3af.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( 'header abc@w3af-scanner.com footer' )
        ['abc@w3af-scanner.com']

        >>> a = abstractParser(response)
        >>> a.findEmails( 'header abc4def@w3af.com footer' )
        ['abc4def@w3af.com']
        '''
        # First, we decode all chars. I have found some strange sites where they encode the @...
        # some other sites where they encode the email, or add some %20 padding... strange stuff...
        # so better be safe...
        documentString = urllib.unquote_plus( documentString )

        # Now we decode the HTML special characters...
        documentString = htmldecode( documentString )

        # Perform a fast search for the @. In w3af, if we don't have an @ we don't have an email.
        # We don't support mails like myself <at> gmail !dot! com
        if documentString.find('@') != -1:
            documentString = re.sub( '[^\w@\-\\.]', ' ', documentString )
            # NOTE: emailRegex is also used in pks search engine.
            # Now we have a clean documentString; and we can match the mail addresses!
            emailRegex = '([A-Z0-9\._%-]{1,45}@([A-Z0-9\.-]{1,45}\.){1,10}[A-Z]{2,4})'
            for email, domain in re.findall(emailRegex, documentString, re.IGNORECASE):
                if email not in self._emails:
                    self._emails.append( email )

        return self._emails
    def _regex_url_parse(self, httpResponse):
        '''
        Use regular expressions to find new URLs.

        @parameter httpResponse: The http response object that stores the response body and the URL.
        @return: None. The findings are stored in self._re_URLs as url_objects.

        Init,
        >>> from core.data.url.httpResponse import httpResponse as httpResponse
        >>> u = url_object('http://www.w3af.com/')
        >>> response = httpResponse( 200, '', {}, u, u )
        >>> a = abstractParser(response)

        Simple, empty result
        >>> a = abstractParser(response)
        >>> response = httpResponse( 200, '', {}, u, u )
        >>> a._regex_url_parse( response )
        >>> a._re_URLs
        []

        Full URL
        >>> a = abstractParser(response)
        >>> response = httpResponse( 200, 'header http://www.w3af.com/foo/bar/index.html footer', {}, u, u )
        >>> a._regex_url_parse( response )
        >>> a._re_URLs[0].url_string
        'http://www.w3af.com/foo/bar/index.html'

        One relative URL
        >>> a = abstractParser(response)
        >>> response = httpResponse( 200, 'header /foo/bar/index.html footer', {}, u, u )
        >>> a._regex_url_parse( response )
        >>> a._re_URLs[0].url_string
        'http://www.w3af.com/foo/bar/index.html'

        Relative with initial "/" , inside an href
        >>> a = abstractParser(response)
        >>> response = httpResponse( 200, 'header <a href="/foo/bar/index.html">foo</a> footer', {}, u, u )
        >>> a._regex_url_parse( response )
        >>> a._re_URLs[0].url_string
        'http://www.w3af.com/foo/bar/index.html'

        Simple index relative URL
        >>> a = abstractParser(response)
        >>> response = httpResponse( 200, 'header <a href="index">foo</a> footer', {}, u, u )
        >>> a._regex_url_parse( response )
        >>> len( a._re_URLs )
        0
        '''
        #url_regex = '((http|https):[A-Za-z0-9/](([A-Za-z0-9$_.+!*(),;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:@&~=%-]*))?)'
        url_regex = '((http|https)://([a-zA-Z0-9_:@\-\./]*?)/[^ \n\r\t"\'<>]*)'

        for url in re.findall(url_regex, httpResponse.getBody() ):
            # This try is here because the _decode_URL method raises an exception
            # whenever it fails to decode a url.
            try:
                decoded_url = self._decode_URL( url_object(url[0]) , self._encoding)
            except w3afException:
                pass
            else:
                self._re_URLs.append(decoded_url)

        #
        # Now detect some relative URL's ( also using regexs )
        #
        def find_relative( doc ):
            res = []

            # TODO: Also matches //foo/bar.txt and http://host.tld/foo/bar.txt
            # I'm removing those matches manually below
            regex = '((:?[/]{1,2}[A-Z0-9a-z%_\-~\.]+)+\.[A-Za-z0-9]{2,4}(((\?)([a-zA-Z0-9]*=\w*)){1}((&)([a-zA-Z0-9]*=\w*))*)?)'
            relative_regex = re.compile( regex )

            for match_tuple in relative_regex.findall(doc):

                match_string = match_tuple[0]

                #
                #   And now I filter out some of the common false positives
                #
                if match_string.startswith('//'):
                    continue

                if match_string.startswith('://'):
                    continue

                if re.match('HTTP/\d\.\d', match_string):
                    continue

                # Matches "PHP/5.2.4-2ubuntu5.7" , "Apache/2.2.8", and "mod_python/3.3.1"
                if re.match('.*?/\d\.\d\.\d', match_string):
                    continue
                #
                #   Filter finished.
                #

                url = httpResponse.getURL().urlJoin( match_string )
                url = self._decode_URL( url , self._encoding)
                res.append( url )

            return res

        relative_URLs = find_relative( httpResponse.getBody() )
        self._re_URLs.extend( relative_URLs )
        [ i.normalizeURL() for i in self._re_URLs ]
        self._re_URLs = list(set(self._re_URLs))
    def getEmails( self, domain=None ):
        '''
        @parameter domain: Indicates what email addresses I want to retrieve:   "*@domain".
        @return: A list of email accounts that are inside the document.

        >>> from core.data.url.httpResponse import httpResponse as httpResponse
        >>> u = url_object('http://www.w3af.com/')
        >>> response = httpResponse( 200, '', {}, u, u )
        >>> a = abstractParser(response)
        >>> a._emails = ['a@w3af.com', 'foo@not-w3af.com']

        >>> a.getEmails()
        ['a@w3af.com', 'foo@not-w3af.com']
        >>> a.getEmails( domain='w3af.com')
        ['a@w3af.com']
        >>> a.getEmails( domain='not-w3af.com')
        ['foo@not-w3af.com']
        '''
        if domain:
            return [ i for i in self._emails if domain == i.split('@')[1] ]
        else:
            return self._emails

    def getForms( self ):
        '''
        @return: A list of forms.
        '''
        raise Exception('You should create your own parser class and implement the getForms() method.')

    def getReferences( self ):
        '''
        Searches for references on a page. w3af searches references in every html tag, including:
            - a
            - forms
            - images
            - frames
            - etc.

        @return: Two sets, one with the parsed URLs, and one with the URLs that came out of a
        regular expression. The second list is less trustworthy.
        '''
        raise Exception('You should create your own parser class and implement the getReferences() method.')

    def getComments( self ):
        '''
        @return: A list of comments.
        '''
        raise Exception('You should create your own parser class and implement the getComments() method.')

    def getScripts( self ):
        '''
        @return: A list of scripts (like javascript).
        '''
        raise Exception('You should create your own parser class and implement the getScripts() method.')

    def getMetaRedir( self ):
        '''
        @return: Returns list of meta redirections.
        '''
        raise Exception('You should create your own parser class and implement the getMetaRedir() method.')

    def getMetaTags( self ):
        '''
        @return: Returns list of all meta tags.
        '''
        raise Exception('You should create your own parser class and implement the getMetaTags() method.')
    def _decode_URL(self, url_object_to_decode, encoding):
        '''
        This is one of the most important methods, because it will decode any URL
        and return an utf-8 encoded string. In other words, this method does c14n (Canonicalization)
        (http://en.wikipedia.org/wiki/Canonicalization) and allows all layers of w3af to simply ignore the
        encoding of the HTTP body (if that's what they want).

        This method is closely related to httpResponse._charset_handling(), which decodes the HTTP
        body of the response. The "problem" is that the body of the response is decoded as expected,
        but URLs aren't... why? Let's see an example:

        - HTTP Body: <a href="http://host.tld/%05%44">Click m\x05\x44!</a>
        - HTTP response header indicated encoding: xyz
        - After running _charset_handling() and supposing that "\x05\x44" decodes to "é" in xyz,
          the response is: <a href="http://host.tld/%05%44">Click mé!</a>

        As you may have noticed, the %05%44 (which in URL means "\x05\x44") wasn't decoded
        (as expected, because the decoding method doesn't handle URL encoding AND xyz encoding at the
        same time!).

        So, when we use _decode_URL() we take as input "http://host.tld/%05%44", we decode the
        URL encoding to get "http://host.tld/\x05\x44" and finally we decode that with the xyz encoding
        to get "http://host.tld/é".

        Something small to remember:
        >>> urllib.unquote('ind%c3%a9x.html').decode('utf-8').encode('utf-8') == 'ind\xc3\xa9x.html'
        True

        Init,
        >>> from core.data.url.httpResponse import httpResponse as httpResponse
        >>> u = url_object('http://www.w3af.com/')
        >>> response = httpResponse( 200, '', {}, u, u )
        >>> a = abstractParser(response)

        Simple, no strange encoding
        >>> u = url_object('http://www.w3af.com/index.html')
        >>> print a._decode_URL( u , a._encoding ).url_string
        http://www.w3af.com/index.html

        Encoded
        >>> u = url_object('http://www.w3af.com/ind%c3%a9x.html')
        >>> print a._decode_URL( u , a._encoding ).url_string
        http://www.w3af.com/indéx.html

        Wrong parameter
        >>> print a._decode_URL( 'http://www.w3af.com/' , a._encoding )
        Traceback (most recent call last):
          File "<stdin>", line 1, in ?
        ValueError: The "url_object_to_decode" parameter @ _decode_URL of an abstractParser must be of urlParser.url_object type.
        '''
        if not isinstance(url_object_to_decode, url_object):
            msg = 'The "url_object_to_decode" parameter @ _decode_URL of an abstractParser'
            msg += ' must be of urlParser.url_object type.'
            raise ValueError( msg )

        # Avoid the double decoding performed by httpResponse._charset_handling() and
        # by this function in cases like this link:
        #
        #   http://host.tld/é.html
        #
        # which is written without URL encoding.
        url_string = url_object_to_decode.url_string
        if urllib.unquote(url_string) == url_string:
            return url_object_to_decode

        try:
            decoded = urllib.unquote(url_string).decode(encoding).encode('utf-8')
            return url_object(decoded)
        except UnicodeDecodeError, ude:
            # This error could have been produced by the buggy choice of encoding
            # done by the user when calling _decode_URL with two parameters,
            # or "selected by default". So, now we are going to test something different.
            if encoding == 'utf-8':
                # Test an encoding that only uses one byte:
                decoded = urllib.unquote(url_string).decode('iso-8859-1').encode('utf-8')
                return url_object(decoded)
            elif encoding != 'utf-8':
                # Sometimes the web app developers, their editors, or some other component
                # makes a mistake: they are really encoding it with utf-8 but they say they are
                # doing it with some other encoding; this is why I perform this last test:
                try:
                    decoded = urllib.unquote(url_string).decode('utf-8').encode('utf-8')
                    return url_object(decoded)
                except UnicodeDecodeError, ude:
                    msg = 'Failed to _decode_URL: "' + str(url_object_to_decode) + '" using encoding: "' + encoding + '".'
                    om.out.error(msg)
...
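The doctests above exercise these patterns through w3af's httpResponse and url_object plumbing. To poke at the two extraction regexes on their own, a standalone sketch such as the following is enough (Python 3, unlike the Python 2 source above; the patterns are copied from findEmails and _regex_url_parse, while the helper names extract_emails and extract_absolute_urls are ours):

import re
from urllib.parse import unquote

# Patterns copied verbatim from the w3af snippet above (as raw strings).
EMAIL_REGEX = r'([A-Z0-9\._%-]{1,45}@([A-Z0-9\.-]{1,45}\.){1,10}[A-Z]{2,4})'
URL_REGEX = r'((http|https)://([a-zA-Z0-9_:@\-\./]*?)/[^ \n\r\t"\'<>]*)'

def extract_emails(text):
    # findall returns (full_match, last_domain_group) tuples; keep the full match.
    return [email for email, _ in re.findall(EMAIL_REGEX, text, re.IGNORECASE)]

def extract_absolute_urls(text):
    # The first group of URL_REGEX is the complete absolute URL.
    return [groups[0] for groups in re.findall(URL_REGEX, text)]

print(extract_emails('header abc@w3af.com and abc4def@w3af.com footer'))
# ['abc@w3af.com', 'abc4def@w3af.com']
print(extract_absolute_urls('header http://www.w3af.com/foo/bar/index.html footer'))
# ['http://www.w3af.com/foo/bar/index.html']

# Python 3 analogue of the "something small to remember" doctest in _decode_URL:
# urllib.parse.unquote() decodes percent-escapes as UTF-8 by default.
print(unquote('ind%c3%a9x.html'))
# indéx.html

Note that findEmails first runs urllib.unquote_plus() and htmldecode() over the document, so percent-encoded addresses (e.g. an encoded "@") are normalized before the regex runs; the sketch above skips that step.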
git-clone
Source: git-clone
...
#    },
#  }
_HOST_REWRITE = {
}


def _decode_url(url):
  '''Determine the key parts of a git clone url.

  >>> pprint(_decode_url('https://github.com/tsukasa-au/micropython.git'))
  {'compound_project': 'tsukasa-au/micropython', 'domain': 'github.com', 'project': 'micropython', 'user': 'tsukasa-au'}
  >>> pprint(_decode_url('https://github.com/makarandtapaswi/BallClustering_ICCV2019.git'))
  {'compound_project': 'makarandtapaswi/BallClustering_ICCV2019', 'domain': 'github.com', 'project': 'BallClustering_ICCV2019', 'user': 'makarandtapaswi'}
  >>> pprint(_decode_url('git@github.com:tsukasa-au/micropython.git'))
  {'compound_project': 'tsukasa-au/micropython', 'domain': 'github.com', 'project': 'micropython', 'user': 'tsukasa-au'}
  >>> pprint(_decode_url('https://gist.github.com/50b6cca61dd1c3f88f41.git'))
  {'compound_project': '50b6cca61dd1c3f88f41', 'domain': 'gist.github.com', 'project': '50b6cca61dd1c3f88f41', 'user': None}
  >>> pprint(_decode_url('git@gist.github.com:50b6cca61dd1c3f88f41.git'))
  {'compound_project': '50b6cca61dd1c3f88f41', 'domain': 'gist.github.com', 'project': '50b6cca61dd1c3f88f41', 'user': None}
  >>> pprint(_decode_url('https://git.code.sf.net/p/mcomix/git'))
  {'compound_project': 'mcomix', 'domain': 'sf.net', 'project': 'mcomix', 'user': None}
  >>> pprint(_decode_url('git://git@github.com:nickyringland/ncss.life.git'))
  {'compound_project': 'nickyringland/ncss.life', 'domain': 'github.com', 'project': 'ncss.life', 'user': 'nickyringland'}
  >>> pprint(_decode_url('git://git.code.sf.net/p/mcomix/git'))
  {'compound_project': 'mcomix', 'domain': 'sf.net', 'project': 'mcomix', 'user': None}
  >>> pprint(_decode_url('http://repo.or.cz/fast-export.git'))
  {'compound_project': 'fast-export', 'domain': 'repo.or.cz', 'project': 'fast-export', 'user': None}
  >>> pprint(_decode_url('git://git.videolan.org/libdvdnav.git'))
  {'compound_project': 'libdvdnav', 'domain': 'videolan.org', 'project': 'libdvdnav', 'user': None}
  >>> pprint(_decode_url('https://code.videolan.org/videolan/libdvdread.git'))
  {'compound_project': 'videolan/libdvdread', 'domain': 'videolan.org', 'project': 'libdvdread', 'user': 'videolan'}
  >>> pprint(_decode_url('https://gitlab.freedesktop.org/xorg/app/xrandr.git'))
  {'compound_project': 'xorg/app/xrandr', 'domain': 'gitlab.freedesktop.org', 'project': 'app/xrandr', 'user': 'xorg'}
  >>> pprint(_decode_url('https://chromium.googlesource.com/apps/libapps'))
  {'compound_project': 'chromium/apps/libapps', 'domain': 'googlesource.com', 'project': 'apps/libapps', 'user': 'chromium'}
  >>> pprint(_decode_url('https://git.nomology.id.au/preserve-modules'))
  {'compound_project': 'preserve-modules', 'domain': 'nomology.id.au', 'project': 'preserve-modules', 'user': None}
  >>> pprint(_decode_url('https://code.delx.net.au/webdl'))
  {'compound_project': 'webdl', 'domain': 'delx.net.au', 'project': 'webdl', 'user': None}
  >>> pprint(_decode_url('https://lore.kernel.org/linux-nfs/0'))
  {'compound_project': 'linux-nfs/0', 'domain': 'lore.kernel.org', 'project': '0', 'user': 'linux-nfs'}
  >>> pprint(_decode_url('http://localhost/abc.git'))
  Traceback (most recent call last):
    ...
  KeyError: 'Unknown host'
  '''
  RES = [
      # Sourceforge
      r'(?P<scheme>https?|git)://(?:git\.)?(?:code\.)?(?P<domain>sf\.net)/p/(?P<project>[^/]+)/git',
      # Google Source
      r'(?P<scheme>https?)://(?P<user>[^.]+)[.](?P<domain>googlesource\.com)/(?P<project>.+)',
      # Gist
      r'(?P<scheme>https?)://(?P<domain>gist\.github\.com)/(?P<project>[^.]+).git',
      # Generic case.
      r'(?P<scheme>https?)://(?P<domain>(?:git|code)[.][^/]+\.[^/]+)/(?:(?P<user>[a-zA-Z0-9_-]+)/)?(?P<project>(?:[a-zA-Z0-9_-]+/)?[a-zA-Z0-9_.-]+?)(?:\.git)?$',
      r'(?P<scheme>https?)://(?P<domain>[^/]+\.[^/]+)/(?:(?P<user>[a-zA-Z0-9_-]+)/)?(?P<project>(?:[a-zA-Z0-9_-]+/)?[a-zA-Z0-9_.-]+)\.git',
      r'(?P<scheme>git://)?(?:[a-zA-Z0-9_-]+@)?(?P<domain>[^.:]+\.[^:]+)[:/](?:(?P<user>[a-zA-Z0-9_-]+)/)?(?P<project>[a-zA-Z0-9_.-]+)\.git',
      # Linux kernel
      r'(?P<scheme>https?)://(?P<domain>(?:.+[.])?kernel[.]org)/(?:(?P<user>[a-zA-Z0-9_-]+)/)?(?P<project>(?:[a-zA-Z0-9_-]+/)?[a-zA-Z0-9_.-]+?)(?:\.git)?$',
  ]
  DOMAIN_PREFIXES_TO_STRIP = {'git', 'code'}
  for regexp in RES:
    m = re.match(regexp, url)
    if not m:
      continue
    d = m.groupdict()
    # Cleanup the domain.
    # NOTE: We strip off the subdomains 'git' and 'code', though ensure there
    # is at least some part of the domain name left (there must be at least 1
    # dot).
    domain_parts = d['domain'].split('.')
    domain_parts.reverse()
    while len(domain_parts) > 2 and domain_parts[-1] in DOMAIN_PREFIXES_TO_STRIP:
      domain_parts.pop()
    d['domain'] = '.'.join(reversed(domain_parts))
    # Rewrite the short hostname to the full hostname for our directory structure.
    if d['scheme'] in _HOST_REWRITE:
      _rewrite_rules = _HOST_REWRITE[d['scheme']]
      if d['domain'] in _rewrite_rules:
        d['domain'] = _rewrite_rules[d['domain']]
    return {
        'domain': d['domain'],
        'user': d.get('user'),
        'project': d['project'],
        'compound_project': '{}/{}'.format(d['user'], d['project']) if d.get('user') else d['project'],
    }
  raise KeyError('Unknown host')


def _ensure_dir_exists(dirname):
  def _walk_path(dirname):
    parts = dirname.split('/')
    parts.reverse()
    output_parts = [parts.pop()]
    while parts:
      output_parts.append(parts.pop())
      yield '/'.join(output_parts)
  if not os.path.exists(dirname):
    for partial_dirname in _walk_path(dirname):
      if not os.path.exists(partial_dirname):
        os.mkdir(partial_dirname)


def get_parser():
  parser = optparse.OptionParser()
  parser.add_option(
      '--mirror',
      action='store_true', dest='mirror', default=False,
      help='Mirror all refs/branches from the remote to the local')
  return parser


def main():
  opt_parser = get_parser()
  options, args = opt_parser.parse_args()
  if len(args) != 1:
    opt_parser.error(f'Must provide exactly 1 argument, the git repo to clone. Got {len(args)}: {args!r}')
  clone_url, = args
  url_parts = _decode_url(clone_url)
  # Ensure that the directory up to the domain exists before we call git.
  base_dir = os.path.join(
      os.environ['HOME'], 'Projects', 'src')
  _ensure_dir_exists(base_dir)
  src_base_dir = os.path.join(base_dir, url_parts['domain'])
  _ensure_dir_exists(src_base_dir)
  # Git will make sure that the final directory exists (in case
  # compound_project has a '/' in it).
  cmd = ['git', 'clone']
  # Check if we need any additional flags to git.
  if options.mirror:
    cmd += ['--mirror']
  # Now that we have (possibly) added our flags, specify the src repo and dest
  # directory.
...
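Reading _decode_url together with main(): clones always land under ~/Projects/src/<domain>/, and the truncated tail of main() implies (but does not show) that compound_project becomes the final path component. A small, hypothetical sanity check along those lines:

import os

# Assumes _decode_url from the git-clone script above is in scope, and that the
# destination is <base>/<domain>/<compound_project>; the second assumption is
# only implied by the truncated tail of main().
parts = _decode_url('https://github.com/tsukasa-au/micropython.git')
dest = os.path.join(os.environ['HOME'], 'Projects', 'src',
                    parts['domain'], parts['compound_project'])
print(dest)
# e.g. /home/you/Projects/src/github.com/tsukasa-au/micropython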
vencode
Source: vencode
...
    b = base64.b64decode(b)
    print("%s => %s" % (s, b.decode("UTF-8")))


def _encode_url(s):
    print("%s => %s" % (s, parse.quote(s)))


def _decode_url(s):
    print("%s => %s" % (s, parse.unquote(s)))


if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "e:d:h",
            ["encode=", "decode=", "help"],
        )
        for opt in opts:
            if opt[0] in ("-h", "--help"):
                _help()
        for opt in opts:
            if opt[0] in ("-e", "--encode"):
                if opt[1] == "base64":
                    for a in args:
                        _encode_base64(a)
                    exit(0)
                elif opt[1] == "url":
                    for a in args:
                        _encode_url(a)
                    exit(0)
            elif opt[0] in ("-d", "--decode"):
                if opt[1] == "base64":
                    for a in args:
                        _decode_base64(a)
                    exit(0)
                elif opt[1] == "url":
                    for a in args:
                        _decode_url(a)
                    exit(0)
        _help()
    except getopt.GetoptError as e:
        print(e)
        _help()
    except Exception as e:
...
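The truncated header is not shown, but the visible calls require base64, getopt, and sys, plus urllib.parse bound as parse, along with the elided _help() and _encode_base64() helpers. Under those assumptions, typical runs look like this (the _encode_base64 output format is inferred from its _decode_base64 counterpart):

$ python vencode -e url "a b"
a b => a%20b
$ python vencode -d url "a%20b"
a%20b => a b
$ python vencode -e base64 hello
hello => aGVsbG8=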
