How to use the cache_dir parameter in localstack

Best Python code snippet using localstack_python

file_utils.py

Source:file_utils.py Github

copy

Full Screen

1"""2Utilities for working with the local dataset cache.3This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp4Copyright by the AllenNLP authors.5"""6from __future__ import (absolute_import, division, print_function, unicode_literals)7import sys8import json9import logging10import os11import shutil12import tempfile13import fnmatch14from functools import wraps15from hashlib import sha25616import sys17from io import open18import boto319import requests20from botocore.exceptions import ClientError21from tqdm import tqdm22try:23 from urllib.parse import urlparse24except ImportError:25 from urlparse import urlparse26try:27 from pathlib import Path28 PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',29 Path.home() / '.pytorch_pretrained_bert'))30except (AttributeError, ImportError):31 PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',32 os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))33CONFIG_NAME = "config.json"34WEIGHTS_NAME = "pytorch_model.bin"35logger = logging.getLogger(__name__) # pylint: disable=invalid-name36def url_to_filename(url, etag=None):37 """38 Convert `url` into a hashed filename in a repeatable way.39 If `etag` is specified, append its hash to the url's, delimited40 by a period.41 """42 url_bytes = url.encode('utf-8')43 url_hash = sha256(url_bytes)44 filename = url_hash.hexdigest()45 if etag:46 etag_bytes = etag.encode('utf-8')47 etag_hash = sha256(etag_bytes)48 filename += '.' 
+ etag_hash.hexdigest()49 return filename50def filename_to_url(filename, cache_dir=None):51 """52 Return the url and etag (which may be ``None``) stored for `filename`.53 Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.54 """55 if cache_dir is None:56 cache_dir = PYTORCH_PRETRAINED_BERT_CACHE57 if sys.version_info[0] == 3 and isinstance(cache_dir, Path):58 cache_dir = str(cache_dir)59 cache_path = os.path.join(cache_dir, filename)60 if not os.path.exists(cache_path):61 raise EnvironmentError("file {} not found".format(cache_path))62 meta_path = cache_path + '.json'63 if not os.path.exists(meta_path):64 raise EnvironmentError("file {} not found".format(meta_path))65 with open(meta_path, encoding="utf-8") as meta_file:66 metadata = json.load(meta_file)67 url = metadata['url']68 etag = metadata['etag']69 return url, etag70def cached_path(url_or_filename, cache_dir=None):71 """72 Given something that might be a URL (or might be a local path),73 determine which. If it's a URL, download the file and cache it, and74 return the path to the cached file. 
If it's already a local path,75 make sure the file exists and then return the path.76 """77 if cache_dir is None:78 cache_dir = PYTORCH_PRETRAINED_BERT_CACHE79 if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):80 url_or_filename = str(url_or_filename)81 if sys.version_info[0] == 3 and isinstance(cache_dir, Path):82 cache_dir = str(cache_dir)83 parsed = urlparse(url_or_filename)84 if parsed.scheme in ('http', 'https', 's3'):85 # URL, so get it from the cache (downloading if necessary)86 return get_from_cache(url_or_filename, cache_dir)87 elif os.path.exists(url_or_filename):88 # File, and it exists.89 return url_or_filename90 elif parsed.scheme == '':91 # File, but it doesn't exist.92 raise EnvironmentError("file {} not found".format(url_or_filename))93 else:94 # Something unknown95 raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))96def split_s3_path(url):97 """Split a full s3 path into the bucket name and path."""98 parsed = urlparse(url)99 if not parsed.netloc or not parsed.path:100 raise ValueError("bad s3 path {}".format(url))101 bucket_name = parsed.netloc102 s3_path = parsed.path103 # Remove '/' at beginning of path.104 if s3_path.startswith("/"):105 s3_path = s3_path[1:]106 return bucket_name, s3_path107def s3_request(func):108 """109 Wrapper function for s3 requests in order to create more helpful error110 messages.111 """112 @wraps(func)113 def wrapper(url, *args, **kwargs):114 try:115 return func(url, *args, **kwargs)116 except ClientError as exc:117 if int(exc.response["Error"]["Code"]) == 404:118 raise EnvironmentError("file {} not found".format(url))119 else:120 raise121 return wrapper122@s3_request123def s3_etag(url):124 """Check ETag on S3 object."""125 s3_resource = boto3.resource("s3")126 bucket_name, s3_path = split_s3_path(url)127 s3_object = s3_resource.Object(bucket_name, s3_path)128 return s3_object.e_tag129@s3_request130def s3_get(url, temp_file):131 """Pull a file directly from 
S3."""132 s3_resource = boto3.resource("s3")133 bucket_name, s3_path = split_s3_path(url)134 s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)135def http_get(url, temp_file):136 req = requests.get(url, stream=True)137 content_length = req.headers.get('Content-Length')138 total = int(content_length) if content_length is not None else None139 progress = tqdm(unit="B", total=total)140 for chunk in req.iter_content(chunk_size=1024):141 if chunk: # filter out keep-alive new chunks142 progress.update(len(chunk))143 temp_file.write(chunk)144 progress.close()145def get_from_cache(url, cache_dir=None):146 """147 Given a URL, look for the corresponding dataset in the local cache.148 If it's not there, download it. Then return the path to the cached file.149 """150 if cache_dir is None:151 cache_dir = PYTORCH_PRETRAINED_BERT_CACHE152 if sys.version_info[0] == 3 and isinstance(cache_dir, Path):153 cache_dir = str(cache_dir)154 if not os.path.exists(cache_dir):155 os.makedirs(cache_dir)156 # Get eTag to add to filename, if it exists.157 if url.startswith("s3://"):158 etag = s3_etag(url)159 else:160 try:161 response = requests.head(url, allow_redirects=True)162 if response.status_code != 200:163 etag = None164 else:165 etag = response.headers.get("ETag")166 except EnvironmentError:167 etag = None168 if sys.version_info[0] == 2 and etag is not None:169 etag = etag.decode('utf-8')170 filename = url_to_filename(url, etag)171 # get cache path to put the file172 cache_path = os.path.join(cache_dir, filename)173 # If we don't have a connection (etag is None) and can't identify the file174 # try to get the last downloaded one175 if not os.path.exists(cache_path) and etag is None:176 matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')177 matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))178 if matching_files:179 cache_path = os.path.join(cache_dir, matching_files[-1])180 if not os.path.exists(cache_path):181 # Download 
to temporary file, then copy to cache dir once finished.182 # Otherwise you get corrupt cache entries if the download gets interrupted.183 with tempfile.NamedTemporaryFile() as temp_file:184 logger.info("%s not found in cache, downloading to %s", url, temp_file.name)185 # GET file object186 if url.startswith("s3://"):187 s3_get(url, temp_file)188 else:189 http_get(url, temp_file)190 # we are copying the file before closing it, so flush to avoid truncation191 temp_file.flush()192 # shutil.copyfileobj() starts at the current position, so go to the start193 temp_file.seek(0)194 logger.info("copying %s to cache at %s", temp_file.name, cache_path)195 with open(cache_path, 'wb') as cache_file:196 shutil.copyfileobj(temp_file, cache_file)197 logger.info("creating metadata file for %s", cache_path)198 meta = {'url': url, 'etag': etag}199 meta_path = cache_path + '.json'200 with open(meta_path, 'w') as meta_file:201 output_string = json.dumps(meta)202 if sys.version_info[0] == 2 and isinstance(output_string, str):203 output_string = unicode(output_string, 'utf-8') # The beauty of python 2204 meta_file.write(output_string)205 logger.info("removing temp file %s", temp_file.name)206 return cache_path207def read_set_from_file(filename):208 '''209 Extract a de-duped collection (set) of text from a file.210 Expected file format is one item per line.211 '''212 collection = set()213 with open(filename, 'r', encoding='utf-8') as file_:214 for line in file_:215 collection.add(line.rstrip())216 return collection217def get_file_extension(path, dot=True, lower=True):218 ext = os.path.splitext(path)[1]219 ext = ext if dot else ext[1:]...

Full Screen

Full Screen

MmapWordEmbeddings.py

Source:MmapWordEmbeddings.py Github

copy

Full Screen

from pathlib import Path
import re
from typing import List

import gensim
import numpy as np
import torch

import flair
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, TokenEmbeddings
# BUGFIX: cached_path is called throughout __init__ but was never imported in
# the original snippet (only instance_lru_cache was), which would raise
# NameError on first use; flair exposes it from flair.file_utils.
from flair.file_utils import cached_path, instance_lru_cache


class MmapWordEmbeddings(WordEmbeddings):
    """
    Adapted from https://github.com/flairNLP/flair/blob/v0.8/flair/embeddings/token.py
    to use gensim with mmap
    """

    def __init__(self, embeddings: str, field: str = None):
        """
        Initializes classic word embeddings. Constructor downloads required files if not there.
        :param embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or custom
        If you want to use a custom embedding file, just pass the path to the embeddings as embeddings variable.
        """
        self.embeddings = embeddings

        self.instance_parameters = self.get_instance_parameters(locals=locals())

        hu_path: str = "https://flair.informatik.hu-berlin.de/resources/embeddings/token"
        cache_dir = Path("embeddings")

        # GLOVE embeddings
        if embeddings.lower() == "glove" or embeddings.lower() == "en-glove":
            cached_path(f"{hu_path}/glove.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/glove.gensim", cache_dir=cache_dir)
        # TURIAN embeddings
        elif embeddings.lower() == "turian" or embeddings.lower() == "en-turian":
            cached_path(f"{hu_path}/turian.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/turian", cache_dir=cache_dir)
        # KOMNINOS embeddings
        elif embeddings.lower() == "extvec" or embeddings.lower() == "en-extvec":
            cached_path(f"{hu_path}/extvec.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/extvec.gensim", cache_dir=cache_dir)
        # pubmed embeddings
        elif embeddings.lower() == "pubmed" or embeddings.lower() == "en-pubmed":
            cached_path(f"{hu_path}/pubmed_pmc_wiki_sg_1M.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/pubmed_pmc_wiki_sg_1M.gensim", cache_dir=cache_dir)
        # FT-CRAWL embeddings
        elif embeddings.lower() == "crawl" or embeddings.lower() == "en-crawl":
            cached_path(f"{hu_path}/en-fasttext-crawl-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/en-fasttext-crawl-300d-1M", cache_dir=cache_dir)
        # FT-CRAWL embeddings
        elif embeddings.lower() in ["news", "en-news", "en"]:
            cached_path(f"{hu_path}/en-fasttext-news-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/en-fasttext-news-300d-1M", cache_dir=cache_dir)
        # twitter embeddings
        elif embeddings.lower() in ["twitter", "en-twitter"]:
            cached_path(f"{hu_path}/twitter.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/twitter.gensim", cache_dir=cache_dir)
        # two-letter language code wiki embeddings
        elif len(embeddings.lower()) == 2:
            cached_path(f"{hu_path}/{embeddings}-wiki-fasttext-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/{embeddings}-wiki-fasttext-300d-1M", cache_dir=cache_dir)
        # two-letter language code wiki embeddings
        elif len(embeddings.lower()) == 7 and embeddings.endswith("-wiki"):
            cached_path(f"{hu_path}/{embeddings[:2]}-wiki-fasttext-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/{embeddings[:2]}-wiki-fasttext-300d-1M", cache_dir=cache_dir)
        # two-letter language code crawl embeddings
        elif len(embeddings.lower()) == 8 and embeddings.endswith("-crawl"):
            cached_path(f"{hu_path}/{embeddings[:2]}-crawl-fasttext-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/{embeddings[:2]}-crawl-fasttext-300d-1M", cache_dir=cache_dir)
        elif not Path(embeddings).exists():
            raise ValueError(
                f'The given embeddings "{embeddings}" is not available or is not a valid path.'
            )

        self.name: str = str(embeddings)
        self.static_embeddings = True

        # .bin files are word2vec binary format; everything else is a gensim
        # native save, which supports memory-mapping the vector matrix.
        if str(embeddings).endswith(".bin"):
            self.precomputed_word_embeddings = gensim.models.KeyedVectors.load_word2vec_format(
                str(embeddings), binary=True
            )
        else:
            self.precomputed_word_embeddings = gensim.models.KeyedVectors.load(
                str(embeddings),
                mmap='r'
            )

        self.field = field

        self.__embedding_length: int = self.precomputed_word_embeddings.vector_size

        TokenEmbeddings.__init__(self)

    @property
    def embedding_length(self) -> int:
        """Dimensionality of a single word vector."""
        return self.__embedding_length

    @instance_lru_cache(maxsize=10000, typed=False)
    def get_cached_vec(self, word: str) -> torch.Tensor:
        """
        Look up `word` in the precomputed embeddings, trying progressively
        normalized forms (lowercase, digits replaced by '#', digits replaced
        by '0'); unknown words map to a zero vector.
        """
        if word in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[word]
        elif word.lower() in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[word.lower()]
        elif re.sub(r"\d", "#", word.lower()) in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[
                re.sub(r"\d", "#", word.lower())
            ]
        elif re.sub(r"\d", "0", word.lower()) in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[
                re.sub(r"\d", "0", word.lower())
            ]
        else:
            word_embedding = np.zeros(self.embedding_length, dtype="float")

        word_embedding = torch.tensor(
            word_embedding.tolist(), device=flair.device, dtype=torch.float
        )
        return word_embedding

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        """Attach an embedding for every token of every sentence in place."""
        for i, sentence in enumerate(sentences):
            for token, token_idx in zip(sentence.tokens, range(len(sentence.tokens))):
                # When a field is set, embed the value of that tag instead of
                # the surface form of the token.
                if "field" not in self.__dict__ or self.field is None:
                    word = token.text
                else:
                    word = token.get_tag(self.field).value
                word_embedding = self.get_cached_vec(word=word)
                token.set_embedding(self.name, word_embedding)
        return sentences

    def __str__(self):
        return self.name

    def extra_repr(self):
        # fix serialized models
        if "embeddings" not in self.__dict__:
            self.embeddings = self.name
        # NOTE(review): the scraped source was truncated here; the return below
        # reconstructs the upstream flair WordEmbeddings.extra_repr — confirm
        # against flair v0.8.
        return f"'{self.embeddings}'"

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub. It covers everything from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles a list of step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.

Run localstack automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful