How to use the cache_dir parameter in localstack

Best Python code snippet using localstack_python

file_utils.py

Source:file_utils.py Github

copy

Full Screen

1"""2Utilities for working with the local dataset cache.3This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp4Copyright by the AllenNLP authors.5"""6from __future__ import (absolute_import, division, print_function, unicode_literals)7import sys8import json9import logging10import os11import shutil12import tempfile13import fnmatch14from functools import wraps15from hashlib import sha25616import sys17from io import open18import boto319import requests20from botocore.exceptions import ClientError21from tqdm import tqdm22try:23 from urllib.parse import urlparse24except ImportError:25 from urlparse import urlparse26try:27 from pathlib import Path28 PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',29 Path.home() / '.pytorch_pretrained_bert'))30except (AttributeError, ImportError):31 PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',32 os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))33CONFIG_NAME = "config.json"34WEIGHTS_NAME = "pytorch_model.bin"35logger = logging.getLogger(__name__) # pylint: disable=invalid-name36def url_to_filename(url, etag=None):37 """38 Convert `url` into a hashed filename in a repeatable way.39 If `etag` is specified, append its hash to the url's, delimited40 by a period.41 """42 url_bytes = url.encode('utf-8')43 url_hash = sha256(url_bytes)44 filename = url_hash.hexdigest()45 if etag:46 etag_bytes = etag.encode('utf-8')47 etag_hash = sha256(etag_bytes)48 filename += '.' 
+ etag_hash.hexdigest()49 return filename50def filename_to_url(filename, cache_dir=None):51 """52 Return the url and etag (which may be ``None``) stored for `filename`.53 Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.54 """55 if cache_dir is None:56 cache_dir = PYTORCH_PRETRAINED_BERT_CACHE57 if sys.version_info[0] == 3 and isinstance(cache_dir, Path):58 cache_dir = str(cache_dir)59 cache_path = os.path.join(cache_dir, filename)60 if not os.path.exists(cache_path):61 raise EnvironmentError("file {} not found".format(cache_path))62 meta_path = cache_path + '.json'63 if not os.path.exists(meta_path):64 raise EnvironmentError("file {} not found".format(meta_path))65 with open(meta_path, encoding="utf-8") as meta_file:66 metadata = json.load(meta_file)67 url = metadata['url']68 etag = metadata['etag']69 return url, etag70def cached_path(url_or_filename, cache_dir=None):71 """72 Given something that might be a URL (or might be a local path),73 determine which. If it's a URL, download the file and cache it, and74 return the path to the cached file. 
If it's already a local path,75 make sure the file exists and then return the path.76 """77 if cache_dir is None:78 cache_dir = PYTORCH_PRETRAINED_BERT_CACHE79 if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):80 url_or_filename = str(url_or_filename)81 if sys.version_info[0] == 3 and isinstance(cache_dir, Path):82 cache_dir = str(cache_dir)83 parsed = urlparse(url_or_filename)84 if parsed.scheme in ('http', 'https', 's3'):85 # URL, so get it from the cache (downloading if necessary)86 return get_from_cache(url_or_filename, cache_dir)87 elif os.path.exists(url_or_filename):88 # File, and it exists.89 return url_or_filename90 elif parsed.scheme == '':91 # File, but it doesn't exist.92 raise EnvironmentError("file {} not found".format(url_or_filename))93 else:94 # Something unknown95 raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))96def split_s3_path(url):97 """Split a full s3 path into the bucket name and path."""98 parsed = urlparse(url)99 if not parsed.netloc or not parsed.path:100 raise ValueError("bad s3 path {}".format(url))101 bucket_name = parsed.netloc102 s3_path = parsed.path103 # Remove '/' at beginning of path.104 if s3_path.startswith("/"):105 s3_path = s3_path[1:]106 return bucket_name, s3_path107def s3_request(func):108 """109 Wrapper function for s3 requests in order to create more helpful error110 messages.111 """112 @wraps(func)113 def wrapper(url, *args, **kwargs):114 try:115 return func(url, *args, **kwargs)116 except ClientError as exc:117 if int(exc.response["Error"]["Code"]) == 404:118 raise EnvironmentError("file {} not found".format(url))119 else:120 raise121 return wrapper122@s3_request123def s3_etag(url):124 """Check ETag on S3 object."""125 s3_resource = boto3.resource("s3")126 bucket_name, s3_path = split_s3_path(url)127 s3_object = s3_resource.Object(bucket_name, s3_path)128 return s3_object.e_tag129@s3_request130def s3_get(url, temp_file):131 """Pull a file directly from 
S3."""132 s3_resource = boto3.resource("s3")133 bucket_name, s3_path = split_s3_path(url)134 s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)135def http_get(url, temp_file):136 req = requests.get(url, stream=True)137 content_length = req.headers.get('Content-Length')138 total = int(content_length) if content_length is not None else None139 progress = tqdm(unit="B", total=total)140 for chunk in req.iter_content(chunk_size=1024):141 if chunk: # filter out keep-alive new chunks142 progress.update(len(chunk))143 temp_file.write(chunk)144 progress.close()145def get_from_cache(url, cache_dir=None):146 """147 Given a URL, look for the corresponding dataset in the local cache.148 If it's not there, download it. Then return the path to the cached file.149 """150 if cache_dir is None:151 cache_dir = PYTORCH_PRETRAINED_BERT_CACHE152 if sys.version_info[0] == 3 and isinstance(cache_dir, Path):153 cache_dir = str(cache_dir)154 if not os.path.exists(cache_dir):155 os.makedirs(cache_dir)156 # Get eTag to add to filename, if it exists.157 if url.startswith("s3://"):158 etag = s3_etag(url)159 else:160 try:161 response = requests.head(url, allow_redirects=True)162 if response.status_code != 200:163 etag = None164 else:165 etag = response.headers.get("ETag")166 except EnvironmentError:167 etag = None168 if sys.version_info[0] == 2 and etag is not None:169 etag = etag.decode('utf-8')170 filename = url_to_filename(url, etag)171 # get cache path to put the file172 cache_path = os.path.join(cache_dir, filename)173 # If we don't have a connection (etag is None) and can't identify the file174 # try to get the last downloaded one175 if not os.path.exists(cache_path) and etag is None:176 matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')177 matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))178 if matching_files:179 cache_path = os.path.join(cache_dir, matching_files[-1])180 if not os.path.exists(cache_path):181 # Download 
to temporary file, then copy to cache dir once finished.182 # Otherwise you get corrupt cache entries if the download gets interrupted.183 with tempfile.NamedTemporaryFile() as temp_file:184 logger.info("%s not found in cache, downloading to %s", url, temp_file.name)185 # GET file object186 if url.startswith("s3://"):187 s3_get(url, temp_file)188 else:189 http_get(url, temp_file)190 # we are copying the file before closing it, so flush to avoid truncation191 temp_file.flush()192 # shutil.copyfileobj() starts at the current position, so go to the start193 temp_file.seek(0)194 logger.info("copying %s to cache at %s", temp_file.name, cache_path)195 with open(cache_path, 'wb') as cache_file:196 shutil.copyfileobj(temp_file, cache_file)197 logger.info("creating metadata file for %s", cache_path)198 meta = {'url': url, 'etag': etag}199 meta_path = cache_path + '.json'200 with open(meta_path, 'w') as meta_file:201 output_string = json.dumps(meta)202 if sys.version_info[0] == 2 and isinstance(output_string, str):203 output_string = unicode(output_string, 'utf-8') # The beauty of python 2204 meta_file.write(output_string)205 logger.info("removing temp file %s", temp_file.name)206 return cache_path207def read_set_from_file(filename):208 '''209 Extract a de-duped collection (set) of text from a file.210 Expected file format is one item per line.211 '''212 collection = set()213 with open(filename, 'r', encoding='utf-8') as file_:214 for line in file_:215 collection.add(line.rstrip())216 return collection217def get_file_extension(path, dot=True, lower=True):218 ext = os.path.splitext(path)[1]219 ext = ext if dot else ext[1:]...

Full Screen

Full Screen

MmapWordEmbeddings.py

Source:MmapWordEmbeddings.py Github

copy

Full Screen

from pathlib import Path
import re
from typing import List

import gensim
import numpy as np
import torch

import flair
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, TokenEmbeddings
# BUGFIX: cached_path is called throughout __init__ but was never imported in
# the original snippet (only instance_lru_cache was), which would raise
# NameError on first use; flair exposes it from flair.file_utils.
from flair.file_utils import cached_path, instance_lru_cache


class MmapWordEmbeddings(WordEmbeddings):
    """
    Adapted from https://github.com/flairNLP/flair/blob/v0.8/flair/embeddings/token.py
    to use gensim with mmap
    """

    def __init__(self, embeddings: str, field: str = None):
        """
        Initializes classic word embeddings. Constructor downloads required files if not there.
        :param embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or custom
        If you want to use a custom embedding file, just pass the path to the embeddings as embeddings variable.
        """
        self.embeddings = embeddings

        self.instance_parameters = self.get_instance_parameters(locals=locals())

        hu_path: str = "https://flair.informatik.hu-berlin.de/resources/embeddings/token"
        cache_dir = Path("embeddings")

        # GLOVE embeddings
        if embeddings.lower() == "glove" or embeddings.lower() == "en-glove":
            cached_path(f"{hu_path}/glove.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/glove.gensim", cache_dir=cache_dir)
        # TURIAN embeddings
        elif embeddings.lower() == "turian" or embeddings.lower() == "en-turian":
            cached_path(f"{hu_path}/turian.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/turian", cache_dir=cache_dir)
        # KOMNINOS embeddings
        elif embeddings.lower() == "extvec" or embeddings.lower() == "en-extvec":
            cached_path(f"{hu_path}/extvec.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/extvec.gensim", cache_dir=cache_dir)
        # pubmed embeddings
        elif embeddings.lower() == "pubmed" or embeddings.lower() == "en-pubmed":
            cached_path(f"{hu_path}/pubmed_pmc_wiki_sg_1M.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/pubmed_pmc_wiki_sg_1M.gensim", cache_dir=cache_dir)
        # FT-CRAWL embeddings
        elif embeddings.lower() == "crawl" or embeddings.lower() == "en-crawl":
            cached_path(f"{hu_path}/en-fasttext-crawl-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/en-fasttext-crawl-300d-1M", cache_dir=cache_dir)
        # FT-CRAWL embeddings
        elif embeddings.lower() in ["news", "en-news", "en"]:
            cached_path(f"{hu_path}/en-fasttext-news-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/en-fasttext-news-300d-1M", cache_dir=cache_dir)
        # twitter embeddings
        elif embeddings.lower() in ["twitter", "en-twitter"]:
            cached_path(f"{hu_path}/twitter.gensim.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/twitter.gensim", cache_dir=cache_dir)
        # two-letter language code wiki embeddings
        elif len(embeddings.lower()) == 2:
            cached_path(f"{hu_path}/{embeddings}-wiki-fasttext-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/{embeddings}-wiki-fasttext-300d-1M", cache_dir=cache_dir)
        # two-letter language code wiki embeddings
        elif len(embeddings.lower()) == 7 and embeddings.endswith("-wiki"):
            cached_path(f"{hu_path}/{embeddings[:2]}-wiki-fasttext-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/{embeddings[:2]}-wiki-fasttext-300d-1M", cache_dir=cache_dir)
        # two-letter language code crawl embeddings
        elif len(embeddings.lower()) == 8 and embeddings.endswith("-crawl"):
            cached_path(f"{hu_path}/{embeddings[:2]}-crawl-fasttext-300d-1M.vectors.npy", cache_dir=cache_dir)
            embeddings = cached_path(f"{hu_path}/{embeddings[:2]}-crawl-fasttext-300d-1M", cache_dir=cache_dir)
        elif not Path(embeddings).exists():
            raise ValueError(
                f'The given embeddings "{embeddings}" is not available or is not a valid path.'
            )

        self.name: str = str(embeddings)
        self.static_embeddings = True

        # .bin files are word2vec binary format; everything else is a gensim
        # native save, which supports memory-mapping the vector matrix.
        if str(embeddings).endswith(".bin"):
            self.precomputed_word_embeddings = gensim.models.KeyedVectors.load_word2vec_format(
                str(embeddings), binary=True
            )
        else:
            self.precomputed_word_embeddings = gensim.models.KeyedVectors.load(
                str(embeddings),
                mmap='r'
            )

        self.field = field

        self.__embedding_length: int = self.precomputed_word_embeddings.vector_size

        TokenEmbeddings.__init__(self)

    @property
    def embedding_length(self) -> int:
        """Dimensionality of a single word vector."""
        return self.__embedding_length

    @instance_lru_cache(maxsize=10000, typed=False)
    def get_cached_vec(self, word: str) -> torch.Tensor:
        """
        Look up `word` in the precomputed embeddings, trying progressively
        normalized forms (lowercase, digits replaced by '#', digits replaced
        by '0'); unknown words map to a zero vector.
        """
        if word in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[word]
        elif word.lower() in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[word.lower()]
        elif re.sub(r"\d", "#", word.lower()) in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[
                re.sub(r"\d", "#", word.lower())
            ]
        elif re.sub(r"\d", "0", word.lower()) in self.precomputed_word_embeddings:
            word_embedding = self.precomputed_word_embeddings[
                re.sub(r"\d", "0", word.lower())
            ]
        else:
            word_embedding = np.zeros(self.embedding_length, dtype="float")

        word_embedding = torch.tensor(
            word_embedding.tolist(), device=flair.device, dtype=torch.float
        )
        return word_embedding

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
        """Attach an embedding for every token of every sentence in place."""
        for i, sentence in enumerate(sentences):
            for token, token_idx in zip(sentence.tokens, range(len(sentence.tokens))):
                # When a field is set, embed the value of that tag instead of
                # the surface form of the token.
                if "field" not in self.__dict__ or self.field is None:
                    word = token.text
                else:
                    word = token.get_tag(self.field).value
                word_embedding = self.get_cached_vec(word=word)
                token.set_embedding(self.name, word_embedding)
        return sentences

    def __str__(self):
        return self.name

    def extra_repr(self):
        # fix serialized models
        if "embeddings" not in self.__dict__:
            self.embeddings = self.name
        # NOTE(review): the scraped source was truncated here; the return below
        # reconstructs the upstream flair WordEmbeddings.extra_repr — confirm
        # against flair v0.8.
        return f"'{self.embeddings}'"

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub. It covers everything from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles a list of step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.

Run localstack automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful