How to use the safe_getattr method in Pytest

The Python code snippets below show safe_getattr in real-world use; both are taken from the fairseq project on GitHub.
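
safe_getattr is not a pytest API; it is a small utility from fairseq.utils that behaves like the built-in getattr, except that on an OmegaConf config it also treats keys whose value is None as missing and returns the supplied default (its exact definition appears near the end of the model.py snippet below). A minimal pytest-style check of that behavior, assuming fairseq and omegaconf are installed, might look like this sketch:

# test_safe_getattr.py -- a minimal sketch, assuming fairseq and omegaconf are installed
from argparse import Namespace

from omegaconf import OmegaConf
from fairseq.utils import safe_getattr


def test_plain_object_behaves_like_getattr():
    args = Namespace(dropout=0.3)
    # existing attribute wins over the default
    assert safe_getattr(args, "dropout", 0.1) == 0.3
    # missing attribute falls back to the default
    assert safe_getattr(args, "attention_dropout", 0.0) == 0.0


def test_omegaconf_none_is_treated_as_missing():
    cfg = OmegaConf.create({"dropout": None})
    # unlike plain getattr, a key that exists but is None still yields the default
    assert safe_getattr(cfg, "dropout", 0.1) == 0.1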


transformer_lm.py

Source: transformer_lm.py (GitHub)

1# Copyright (c) Facebook, Inc. and its affiliates.
2#
3# This source code is licensed under the MIT license found in the
4# LICENSE file in the root directory of this source tree.
5
6
7from dataclasses import dataclass, field
8from typing import Optional
9
10from fairseq import options, utils
11from fairseq.dataclass import ChoiceEnum, FairseqDataclass
12from fairseq.models import (
13    FairseqLanguageModel,
14    register_model,
15    register_model_architecture,
16)
17from fairseq.models.transformer import (
18    DEFAULT_MIN_PARAMS_TO_WRAP, Embedding, TransformerDecoder
19)
20from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder
21from fairseq.utils import safe_getattr, safe_hasattr
22from omegaconf import II
23
24
25DEFAULT_MAX_TARGET_POSITIONS = 1024
26
27
28@dataclass
29class TransformerLanguageModelConfig(FairseqDataclass):
30    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
31        default="relu", metadata={"help": "activation function to use"}
32    )
33    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
34    attention_dropout: float = field(
35        default=0.0, metadata={"help": "dropout probability for attention weights"}
36    )
37    activation_dropout: float = field(
38        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
39    )
40    relu_dropout: float = field(
41        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
42    )
43    decoder_embed_dim: int = field(
44        default=512, metadata={"help": "decoder embedding dimension"}
45    )
46    decoder_output_dim: int = field(
47        default=512, metadata={"help": "decoder output dimension"}
48    )
49    decoder_input_dim: int = field(
50        default=512, metadata={"help": "decoder input dimension"}
51    )
52    decoder_ffn_embed_dim: int = field(
53        default=2048, metadata={"help": "decoder embedding dimension for FFN"}
54    )
55    decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"})
56    decoder_attention_heads: int = field(
57        default=8, metadata={"help": "num decoder attention heads"}
58    )
59    decoder_normalize_before: bool = field(
60        default=False, metadata={"help": "apply layernorm before each decoder block"}
61    )
62    no_decoder_final_norm: bool = field(
63        default=False,
64        metadata={"help": "don't add an extra layernorm after the last decoder block"},
65    )
66    adaptive_softmax_cutoff: Optional[str] = field(
67        default=None,
68        metadata={
69            "help": "comma separated list of adaptive softmax cutoff points. "
70            "Must be used with adaptive_loss criterion"
71        },
72    )
73    adaptive_softmax_dropout: float = field(
74        default=0,
75        metadata={"help": "sets adaptive softmax dropout for the tail projections"},
76    )
77    adaptive_softmax_factor: float = field(
78        default=4, metadata={"help": "adaptive input factor"}
79    )
80    no_token_positional_embeddings: bool = field(
81        default=False,
82        metadata={
83            "help": "if set, disables positional embeddings (outside self attention)"
84        },
85    )
86    share_decoder_input_output_embed: bool = field(
87        default=False, metadata={"help": "share decoder input and output embeddings"}
88    )
89    character_embeddings: bool = field(
90        default=False,
91        metadata={
92            "help": "if set, uses character embedding convolutions to produce token embeddings"
93        },
94    )
95    character_filters: str = field(
96        default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
97        metadata={"help": "size of character embeddings"},
98    )
99    character_embedding_dim: int = field(
100        default=4, metadata={"help": "size of character embeddings"}
101    )
102    char_embedder_highway_layers: int = field(
103        default=2,
104        metadata={"help": "number of highway layers for character token embeddder"},
105    )
106    adaptive_input: bool = field(
107        default=False, metadata={"help": "if set, uses adaptive input"}
108    )
109    adaptive_input_factor: float = field(
110        default=4, metadata={"help": "adaptive input factor"}
111    )
112    adaptive_input_cutoff: Optional[str] = field(
113        default=None,
114        metadata={"help": "comma separated list of adaptive input cutoff points."},
115    )
116    tie_adaptive_weights: bool = field(
117        default=False,
118        metadata={
119            "help": "if set, ties the weights of adaptive softmax and adaptive input"
120        },
121    )
122    tie_adaptive_proj: bool = field(
123        default=False,
124        metadata={
125            "help": "if set, ties the projection weights of adaptive softmax and adaptive input"
126        },
127    )
128    decoder_learned_pos: bool = field(
129        default=False,
130        metadata={"help": "use learned positional embeddings in the decoder"},
131    )
132    layernorm_embedding: bool = field(
133        default=False, metadata={"help": "add layernorm to embedding"}
134    )
135    no_scale_embedding: bool = field(
136        default=False, metadata={"help": "if True, dont scale embeddings"}
137    )
138    checkpoint_activations: bool = field(
139        default=False, metadata={"help": "checkpoint activations at each layer"}
140    )
141    offload_activations: bool = field(
142        default=False,
143        metadata={"help": "move checkpointed activations to CPU after they are used."},
144    )
145    # config for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
146    decoder_layerdrop: float = field(
147        default=0.0, metadata={"help": "LayerDrop probability for decoder"}
148    )
149    decoder_layers_to_keep: Optional[str] = field(
150        default=None,
151        metadata={
152            "help": "which layers to *keep* when pruning as a comma-separated list"
153        },
154    )
155    # config for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
156    quant_noise_pq: float = field(
157        default=0.0,
158        metadata={"help": "iterative PQ quantization noise at training time"},
159    )
160    quant_noise_pq_block_size: int = field(
161        default=8,
162        metadata={"help": "block size of quantization noise at training time"},
163    )
164    quant_noise_scalar: float = field(
165        default=0.0,
166        metadata={
167            "help": "scalar quantization noise and scalar quantization at training time"
168        },
169    )
170    # config for Fully Sharded Data Parallel (FSDP) training
171    min_params_to_wrap: int = field(
172        default=DEFAULT_MIN_PARAMS_TO_WRAP,
173        metadata={
174            "help": (
175                "minimum number of params for a layer to be wrapped with FSDP() when "
176                "training with --ddp-backend=fully_sharded. Smaller values will "
177                "improve memory efficiency, but may make torch.distributed "
178                "communication less efficient due to smaller input sizes. This option "
179                "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
180                "--offload-activations are passed."
181            )
182        }
183    )
184    # config for "BASE Layers: Simplifying Training of Large, Sparse Models"
185    base_layers: Optional[int] = field(
186        default=0, metadata={"help": "number of BASE layers in total"}
187    )
188    base_sublayers: Optional[int] = field(
189        default=1, metadata={"help": "number of sublayers in each BASE layer"}
190    )
191    base_shuffle: Optional[int] = field(
192        default=1, metadata={"help": "shuffle tokens between workers before computing assignment"}
193    )
194    # options from other parts of the config
195    add_bos_token: bool = II("task.add_bos_token")
196    tokens_per_sample: int = II("task.tokens_per_sample")
197    max_target_positions: Optional[int] = II("task.max_target_positions")
198    tpu: bool = II("common.tpu")
199
200
201@register_model("transformer_lm", dataclass=TransformerLanguageModelConfig)
202class TransformerLanguageModel(FairseqLanguageModel):
203    @classmethod
204    def hub_models(cls):
205        def moses_fastbpe(path):
206            return {"path": path, "tokenizer": "moses", "bpe": "fastbpe"}
207
208        def spm(path):
209            return {"path": path, "tokenizer": "space", "bpe": "sentencepiece"}
210
211        return {
212            "transformer_lm.gbw.adaptive_huge": "https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2",
213            "transformer_lm.wiki103.adaptive": "https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2",
214            "transformer_lm.wmt19.en": moses_fastbpe(
215                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.bz2"
216            ),
217            "transformer_lm.wmt19.de": moses_fastbpe(
218                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.bz2"
219            ),
220            "transformer_lm.wmt19.ru": moses_fastbpe(
221                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2"
222            ),
223            "transformer_lm.wmt20.en": spm(
224                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.en.tar.gz"
225            ),
226            "transformer_lm.wmt20.ta": spm(
227                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.ta.tar.gz"
228            ),
229            "transformer_lm.wmt20.iu.news": spm(
230                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.iu.news.tar.gz"
231            ),
232            "transformer_lm.wmt20.iu.nh": spm(
233                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.iu.nh.tar.gz"
234            ),
235        }
236
237    def __init__(self, decoder):
238        super().__init__(decoder)
239
240    @classmethod
241    def build_model(cls, args, task):
242        """Build a new model instance."""
243
244        if args.decoder_layers_to_keep:
245            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
246
247        if safe_getattr(args, "max_target_positions", None) is None:
248            args.max_target_positions = safe_getattr(
249                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
250            )
251
252        if args.character_embeddings:
253            embed_tokens = CharacterTokenEmbedder(
254                task.source_dictionary,
255                eval(args.character_filters),
256                args.character_embedding_dim,
257                args.decoder_embed_dim,
258                args.char_embedder_highway_layers,
259            )
260        elif args.adaptive_input:
261            embed_tokens = AdaptiveInput(
262                len(task.source_dictionary),
263                task.source_dictionary.pad(),
264                args.decoder_input_dim,
265                args.adaptive_input_factor,
266                args.decoder_embed_dim,
267                options.eval_str_list(args.adaptive_input_cutoff, type=int),
268                args.quant_noise_pq,
269                args.quant_noise_pq_block_size,
270            )
271        else:
272            embed_tokens = cls.build_embedding(
273                args, task.source_dictionary, args.decoder_input_dim
274            )
275
276        if args.tie_adaptive_weights:
277            assert args.adaptive_input
278            assert args.adaptive_input_factor == args.adaptive_softmax_factor
279            assert (
280                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
281            ), "{} != {}".format(
282                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
283            )
284            assert args.decoder_input_dim == args.decoder_output_dim
285
286        decoder = TransformerDecoder(
287            args, task.target_dictionary, embed_tokens, no_encoder_attn=True
288        )
289        return cls(decoder)
290
291    @classmethod
292    def build_embedding(cls, args, dictionary, embed_dim, path=None):
293        embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
294        return embed_tokens
295
296
297def base_lm_architecture(args):
298    # backward compatibility for older model checkpoints
299    if safe_hasattr(args, "no_tie_adaptive_proj"):
300        # previous models defined --no-tie-adaptive-proj, so use the existence of
301        # that option to determine if this is an "old" model checkpoint
302        args.no_decoder_final_norm = True  # old models always set this to True
303        if args.no_tie_adaptive_proj is False:
304            args.tie_adaptive_proj = True
305    if safe_hasattr(args, "decoder_final_norm"):
306        args.no_decoder_final_norm = not args.decoder_final_norm
307
308    args.dropout = safe_getattr(args, "dropout", 0.1)
309    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0)
310
311    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512)
312    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 2048)
313    args.decoder_layers = safe_getattr(args, "decoder_layers", 6)
314    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8)
315    args.adaptive_softmax_cutoff = safe_getattr(args, "adaptive_softmax_cutoff", None)
316    args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0)
317    args.adaptive_softmax_factor = safe_getattr(args, "adaptive_softmax_factor", 4)
318    args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", False)
319    args.activation_fn = safe_getattr(args, "activation_fn", "relu")
320
321    args.decoder_layerdrop = safe_getattr(args, "decoder_layerdrop", 0)
322    args.decoder_layers_to_keep = safe_getattr(args, "decoder_layers_to_keep", None)
323    args.quant_noise_pq = safe_getattr(args, "quant_noise_pq", 0)
324    args.quant_noise_pq_block_size = safe_getattr(args, "quant_noise_pq_block_size", 8)
325    args.quant_noise_scalar = safe_getattr(args, "quant_noise_scalar", 0)
326
327    args.base_layers = safe_getattr(args, "base_layers", 0)
328    args.base_sublayers = safe_getattr(args, "base_sublayers", 1)
329    args.base_shuffle = safe_getattr(args, "base_shuffle", False)
330
331    args.add_bos_token = safe_getattr(args, "add_bos_token", False)
332    args.no_token_positional_embeddings = safe_getattr(
333        args, "no_token_positional_embeddings", False
334    )
335    args.share_decoder_input_output_embed = safe_getattr(
336        args, "share_decoder_input_output_embed", False
337    )
338    args.character_embeddings = safe_getattr(args, "character_embeddings", False)
339
340    args.decoder_output_dim = safe_getattr(
341        args, "decoder_output_dim", args.decoder_embed_dim
342    )
343    args.decoder_input_dim = safe_getattr(args, "decoder_input_dim", args.decoder_embed_dim)
344
345    # Model training is not stable without this
346    args.decoder_normalize_before = True
347    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", False)
348
349    args.adaptive_input = safe_getattr(args, "adaptive_input", False)
350    args.adaptive_input_factor = safe_getattr(args, "adaptive_input_factor", 4)
351    args.adaptive_input_cutoff = safe_getattr(args, "adaptive_input_cutoff", None)
352
353    args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", False)
354    args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", False)
355
356    args.no_scale_embedding = safe_getattr(args, "no_scale_embedding", False)
357    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False)
358    args.checkpoint_activations = safe_getattr(args, "checkpoint_activations", False)
359    args.offload_activations = safe_getattr(args, "offload_activations", False)
360    if args.offload_activations:
361        args.checkpoint_activations = True
362
363
364@register_model_architecture("transformer_lm", "transformer_lm_big")
365def transformer_lm_big(args):
366    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
367    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024)
368    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 4096)
369    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
370    base_lm_architecture(args)
371
372
373@register_model_architecture("transformer_lm", "transformer_lm_wiki103")
374@register_model_architecture("transformer_lm", "transformer_lm_baevski_wiki103")
375def transformer_lm_baevski_wiki103(args):
376    args.decoder_layers = safe_getattr(args, "decoder_layers", 16)
377    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8)
378    args.dropout = safe_getattr(args, "dropout", 0.3)
379    args.adaptive_input = safe_getattr(args, "adaptive_input", True)
380    args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", True)
381    args.adaptive_input_cutoff = safe_getattr(args, "adaptive_input_cutoff", "20000,60000")
382    args.adaptive_softmax_cutoff = safe_getattr(
383        args, "adaptive_softmax_cutoff", "20000,60000"
384    )
385    args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0.2)
386    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
387    args.activation_dropout = safe_getattr(args, "activation_dropout", 0.1)
388    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", True)
389    args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", True)
390    transformer_lm_big(args)
391
392
393@register_model_architecture("transformer_lm", "transformer_lm_gbw")
394@register_model_architecture("transformer_lm", "transformer_lm_baevski_gbw")
395def transformer_lm_baevski_gbw(args):
396    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512)
397    args.dropout = safe_getattr(args, "dropout", 0.1)
398    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
399    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", True)
400    transformer_lm_big(args)
401
402
403@register_model_architecture("transformer_lm", "transformer_lm_gpt")
404def transformer_lm_gpt(args):
405    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 768)
406    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 3072)
407    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
408    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 12)
409    args.dropout = safe_getattr(args, "dropout", 0.1)
410    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
411    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
412    base_lm_architecture(args)
413
414
415@register_model_architecture("transformer_lm", "transformer_lm_gpt2_small")
416def transformer_lm_gpt2_small(args):
417    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024)
418    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 4096)
419    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
420    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
421    args.dropout = safe_getattr(args, "dropout", 0.1)
422    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
423    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
424    base_lm_architecture(args)
425
426
427@register_model_architecture("transformer_lm", "transformer_lm_gpt2_tiny")
428def transformer_lm_gpt2_tiny(args):
429    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 64)
430    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 64)
431    args.decoder_layers = safe_getattr(args, "decoder_layers", 2)
432    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 1)
433    args.dropout = safe_getattr(args, "dropout", 0.1)
434    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
435    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
436    base_lm_architecture(args)
437
438
439@register_model_architecture("transformer_lm", "transformer_lm_gpt2_medium")
440def transformer_lm_gpt2_medium(args):
441    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1280)
442    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 5120)
443    args.decoder_layers = safe_getattr(args, "decoder_layers", 36)
444    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 20)
445    args.dropout = safe_getattr(args, "dropout", 0.1)
446    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
447    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
448    base_lm_architecture(args)
449
450
451@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big")
452def transformer_lm_gpt2_big(args):
453    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1600)
454    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 6400)
455    args.decoder_layers = safe_getattr(args, "decoder_layers", 48)
456    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 25)
457    args.dropout = safe_getattr(args, "dropout", 0.1)
458    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
459    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
460    base_lm_architecture(args)
461
462
463def base_gpt3_architecture(args):
464    args.decoder_input_dim = args.decoder_embed_dim
465    args.decoder_output_dim = args.decoder_embed_dim
466    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", args.decoder_embed_dim * 4)
467    # GPT-3 used learned positional embeddings, rather than sinusoidal
468    args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", True)
469    args.dropout = safe_getattr(args, "dropout", 0.0)
470    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0)
471    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
472    args.share_decoder_input_output_embed = True
473    base_lm_architecture(args)
474
475
476@register_model_architecture("transformer_lm", "transformer_lm_gpt3_small")
477def transformer_lm_gpt3_small(args):
478    # 125M params
479    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
480    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 768)
481    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 12)
482    base_gpt3_architecture(args)
483
484
485@register_model_architecture("transformer_lm", "transformer_lm_gpt3_medium")
486def transformer_lm_gpt3_medium(args):
487    # 350M params
488    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
489    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024)
490    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
491    base_gpt3_architecture(args)
492
493
494@register_model_architecture("transformer_lm", "transformer_lm_gpt3_large")
495def transformer_lm_gpt3_large(args):
496    # 760M params
497    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
498    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1536)
499    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
500    base_gpt3_architecture(args)
501
502
503@register_model_architecture("transformer_lm", "transformer_lm_gpt3_xl")
504def transformer_lm_gpt3_xl(args):
505    # 1.3B params
506    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
507    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2048)
508    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
509    base_gpt3_architecture(args)
510
511
512@register_model_architecture("transformer_lm", "transformer_lm_gpt3_2_7")
513def transformer_lm_gpt3_2_7(args):
514    # 2.7B params
515    args.decoder_layers = safe_getattr(args, "decoder_layers", 32)
516    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2560)
517    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
518    base_gpt3_architecture(args)
519
520
521@register_model_architecture("transformer_lm", "transformer_lm_gpt3_6_7")
522def transformer_lm_gpt3_6_7(args):
523    # 6.7B params
524    args.decoder_layers = safe_getattr(args, "decoder_layers", 32)
525    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 4096)
526    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
527    base_gpt3_architecture(args)
528
529
530@register_model_architecture("transformer_lm", "transformer_lm_gpt3_13")
531def transformer_lm_gpt3_13(args):
532    # 13B params
533    args.decoder_layers = safe_getattr(args, "decoder_layers", 40)
534    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 5120)
535    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 40)
536    base_gpt3_architecture(args)
537
538
539@register_model_architecture("transformer_lm", "transformer_lm_gpt3_175")
540def transformer_lm_gpt3_175(args):
541    # 175B params
542    args.decoder_layers = safe_getattr(args, "decoder_layers", 96)
543    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 12288)
544    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 96)
545    base_gpt3_architecture(args)
546
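The architecture functions above all follow the same pattern: args.<field> = safe_getattr(args, "<field>", default), so values the user set explicitly are preserved while everything else is back-filled with the architecture's defaults. A rough usage sketch, assuming fairseq is installed and that base_lm_architecture is importable from fairseq.models.transformer_lm as in fairseq's source layout:

# Sketch only: back-fills transformer_lm defaults on a plain argparse Namespace
from argparse import Namespace

from fairseq.models.transformer_lm import base_lm_architecture

args = Namespace(decoder_embed_dim=1024)  # pretend the user overrode one flag
base_lm_architecture(args)

print(args.decoder_embed_dim)      # 1024 -- explicit value kept by safe_getattr
print(args.decoder_ffn_embed_dim)  # 2048 -- back-filled default
print(args.decoder_layers)         # 6    -- back-filled default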

model.py

Source: model.py (GitHub)

1# Copyright (c) Facebook, Inc. and its affiliates.
2#
3# This source code is licensed under the MIT license found in the
4# LICENSE file in the root directory of this source tree.
5"""
6RoBERTa: A Robustly Optimized BERT Pretraining Approach.
7"""
8
9import logging
10
11import torch
12import torch.nn as nn
13import torch.nn.functional as F
14from fairseq import utils
15from fairseq.models import (
16    FairseqEncoder,
17    FairseqEncoderModel,
18    register_model,
19    register_model_architecture,
20)
21from fairseq.models.transformer import DEFAULT_MIN_PARAMS_TO_WRAP, TransformerEncoder
22from fairseq.modules import LayerNorm
23from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
24from fairseq.modules.transformer_sentence_encoder import init_bert_params
25
26from .hub_interface import RobertaHubInterface
27
28
29logger = logging.getLogger(__name__)
30
31
32@register_model("roberta")
33class RobertaModel(FairseqEncoderModel):
34    @classmethod
35    def hub_models(cls):
36        return {
37            "roberta.base": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz",
38            "roberta.large": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz",
39            "roberta.large.mnli": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz",
40            "roberta.large.wsc": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz",
41        }
42
43    def __init__(self, args, encoder):
44        super().__init__(encoder)
45        self.args = args
46
47        # We follow BERT's random weight initialization
48        self.apply(init_bert_params)
49
50        self.classification_heads = nn.ModuleDict()
51
52    @staticmethod
53    def add_args(parser):
54        """Add model-specific arguments to the parser."""
55        parser.add_argument(
56            "--encoder-layers", type=int, metavar="L", help="num encoder layers"
57        )
58        parser.add_argument(
59            "--encoder-embed-dim",
60            type=int,
61            metavar="H",
62            help="encoder embedding dimension",
63        )
64        parser.add_argument(
65            "--encoder-ffn-embed-dim",
66            type=int,
67            metavar="F",
68            help="encoder embedding dimension for FFN",
69        )
70        parser.add_argument(
71            "--encoder-attention-heads",
72            type=int,
73            metavar="A",
74            help="num encoder attention heads",
75        )
76        parser.add_argument(
77            "--activation-fn",
78            choices=utils.get_available_activation_fns(),
79            help="activation function to use",
80        )
81        parser.add_argument(
82            "--pooler-activation-fn",
83            choices=utils.get_available_activation_fns(),
84            help="activation function to use for pooler layer",
85        )
86        parser.add_argument(
87            "--encoder-normalize-before",
88            action="store_true",
89            help="apply layernorm before each encoder block",
90        )
91        parser.add_argument(
92            "--layernorm-embedding",
93            action="store_true",
94            help="add layernorm to embedding",
95        )
96        parser.add_argument(
97            "--dropout", type=float, metavar="D", help="dropout probability"
98        )
99        parser.add_argument(
100            "--attention-dropout",
101            type=float,
102            metavar="D",
103            help="dropout probability for attention weights",
104        )
105        parser.add_argument(
106            "--activation-dropout",
107            type=float,
108            metavar="D",
109            help="dropout probability after activation in FFN",
110        )
111        parser.add_argument(
112            "--pooler-dropout",
113            type=float,
114            metavar="D",
115            help="dropout probability in the masked_lm pooler layers",
116        )
117        parser.add_argument(
118            "--max-positions", type=int, help="number of positional embeddings to learn"
119        )
120        parser.add_argument(
121            "--load-checkpoint-heads",
122            action="store_true",
123            help="(re-)register and load heads when loading checkpoints",
124        )
125        parser.add_argument(
126            "--untie-weights-roberta",
127            action="store_true",
128            help="Untie weights between embeddings and classifiers in RoBERTa",
129        )
130        # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
131        parser.add_argument(
132            "--encoder-layerdrop",
133            type=float,
134            metavar="D",
135            default=0,
136            help="LayerDrop probability for encoder",
137        )
138        parser.add_argument(
139            "--encoder-layers-to-keep",
140            default=None,
141            help="which layers to *keep* when pruning as a comma-separated list",
142        )
143        # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
144        parser.add_argument(
145            "--quant-noise-pq",
146            type=float,
147            metavar="D",
148            default=0,
149            help="iterative PQ quantization noise at training time",
150        )
151        parser.add_argument(
152            "--quant-noise-pq-block-size",
153            type=int,
154            metavar="D",
155            default=8,
156            help="block size of quantization noise at training time",
157        )
158        parser.add_argument(
159            "--quant-noise-scalar",
160            type=float,
161            metavar="D",
162            default=0,
163            help="scalar quantization noise and scalar quantization at training time",
164        )
165        # args for "Better Fine-Tuning by Reducing Representational Collapse" (Aghajanyan et al. 2020)
166        parser.add_argument(
167            "--spectral-norm-classification-head",
168            action="store_true",
169            default=False,
170            help="Apply spectral normalization on the classification head",
171        )
172        # args for Fully Sharded Data Parallel (FSDP) training
173        parser.add_argument(
174            "--min-params-to-wrap",
175            type=int,
176            metavar="D",
177            default=DEFAULT_MIN_PARAMS_TO_WRAP,
178            help=(
179                "minimum number of params for a layer to be wrapped with FSDP() when "
180                "training with --ddp-backend=fully_sharded. Smaller values will "
181                "improve memory efficiency, but may make torch.distributed "
182                "communication less efficient due to smaller input sizes. This option "
183                "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
184                "--offload-activations are passed."
185            )
186        )
187
188    @classmethod
189    def build_model(cls, args, task):
190        """Build a new model instance."""
191
192        from omegaconf import OmegaConf
193
194        if OmegaConf.is_config(args):
195            OmegaConf.set_struct(args, False)
196
197        # make sure all arguments are present
198        base_architecture(args)
199
200        if not hasattr(args, "max_positions"):
201            if not hasattr(args, "tokens_per_sample"):
202                args.tokens_per_sample = task.max_positions()
203            args.max_positions = args.tokens_per_sample
204
205        encoder = RobertaEncoder(args, task.source_dictionary)
206
207        if OmegaConf.is_config(args):
208            OmegaConf.set_struct(args, True)
209
210        return cls(args, encoder)
211
212    def forward(
213        self,
214        src_tokens,
215        features_only=False,
216        return_all_hiddens=False,
217        classification_head_name=None,
218        **kwargs,
219    ):
220        if classification_head_name is not None:
221            features_only = True
222
223        x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs)
224
225        if classification_head_name is not None:
226            x = self.classification_heads[classification_head_name](x)
227        return x, extra
228
229    def get_normalized_probs(self, net_output, log_probs, sample=None):
230        """Get normalized probabilities (or log probs) from a net's output."""
231        logits = net_output[0].float()
232        if log_probs:
233            return F.log_softmax(logits, dim=-1)
234        else:
235            return F.softmax(logits, dim=-1)
236
237    def register_classification_head(
238        self, name, num_classes=None, inner_dim=None, **kwargs
239    ):
240        """Register a classification head."""
241        if name in self.classification_heads:
242            prev_num_classes = self.classification_heads[name].out_proj.out_features
243            prev_inner_dim = self.classification_heads[name].dense.out_features
244            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
245                logger.warning(
246                    're-registering head "{}" with num_classes {} (prev: {}) '
247                    "and inner_dim {} (prev: {})".format(
248                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
249                    )
250                )
251        self.classification_heads[name] = RobertaClassificationHead(
252            input_dim=self.args.encoder_embed_dim,
253            inner_dim=inner_dim or self.args.encoder_embed_dim,
254            num_classes=num_classes,
255            activation_fn=self.args.pooler_activation_fn,
256            pooler_dropout=self.args.pooler_dropout,
257            q_noise=self.args.quant_noise_pq,
258            qn_block_size=self.args.quant_noise_pq_block_size,
259            do_spectral_norm=self.args.spectral_norm_classification_head,
260        )
261
262    @property
263    def supported_targets(self):
264        return {"self"}
265
266    @classmethod
267    def from_pretrained(
268        cls,
269        model_name_or_path,
270        checkpoint_file="model.pt",
271        data_name_or_path=".",
272        bpe="gpt2",
273        **kwargs,
274    ):
275        from fairseq import hub_utils
276
277        x = hub_utils.from_pretrained(
278            model_name_or_path,
279            checkpoint_file,
280            data_name_or_path,
281            archive_map=cls.hub_models(),
282            bpe=bpe,
283            load_checkpoint_heads=True,
284            **kwargs,
285        )
286
287        logger.info(x["args"])
288        return RobertaHubInterface(x["args"], x["task"], x["models"][0])
289
290    def upgrade_state_dict_named(self, state_dict, name):
291        prefix = name + "." if name != "" else ""
292
293        # rename decoder -> encoder before upgrading children modules
294        for k in list(state_dict.keys()):
295            if k.startswith(prefix + "decoder"):
296                new_k = prefix + "encoder" + k[len(prefix + "decoder") :]
297                state_dict[new_k] = state_dict[k]
298                del state_dict[k]
299
300        # rename emb_layer_norm -> layernorm_embedding
301        for k in list(state_dict.keys()):
302            if ".emb_layer_norm." in k:
303                new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.")
304                state_dict[new_k] = state_dict[k]
305                del state_dict[k]
306
307        # upgrade children modules
308        super().upgrade_state_dict_named(state_dict, name)
309
310        # Handle new classification heads present in the state dict.
311        current_head_names = (
312            []
313            if not hasattr(self, "classification_heads")
314            else self.classification_heads.keys()
315        )
316        keys_to_delete = []
317        for k in state_dict.keys():
318            if not k.startswith(prefix + "classification_heads."):
319                continue
320
321            head_name = k[len(prefix + "classification_heads.") :].split(".")[0]
322            num_classes = state_dict[
323                prefix + "classification_heads." + head_name + ".out_proj.weight"
324            ].size(0)
325            inner_dim = state_dict[
326                prefix + "classification_heads." + head_name + ".dense.weight"
327            ].size(0)
328
329            if getattr(self.args, "load_checkpoint_heads", False):
330                if head_name not in current_head_names:
331                    self.register_classification_head(head_name, num_classes, inner_dim)
332            else:
333                if head_name not in current_head_names:
334                    logger.warning(
335                        "deleting classification head ({}) from checkpoint "
336                        "not present in current model: {}".format(head_name, k)
337                    )
338                    keys_to_delete.append(k)
339                elif (
340                    num_classes
341                    != self.classification_heads[head_name].out_proj.out_features
342                    or inner_dim
343                    != self.classification_heads[head_name].dense.out_features
344                ):
345                    logger.warning(
346                        "deleting classification head ({}) from checkpoint "
347                        "with different dimensions than current model: {}".format(
348                            head_name, k
349                        )
350                    )
351                    keys_to_delete.append(k)
352        for k in keys_to_delete:
353            del state_dict[k]
354
355        # Copy any newly-added classification heads into the state dict
356        # with their current weights.
357        if hasattr(self, "classification_heads"):
358            cur_state = self.classification_heads.state_dict()
359            for k, v in cur_state.items():
360                if prefix + "classification_heads." + k not in state_dict:
361                    logger.info("Overwriting " + prefix + "classification_heads." + k)
362                    state_dict[prefix + "classification_heads." + k] = v
363
364
365class RobertaLMHead(nn.Module):
366    """Head for masked language modeling."""
367
368    def __init__(self, embed_dim, output_dim, activation_fn, weight=None):
369        super().__init__()
370        self.dense = nn.Linear(embed_dim, embed_dim)
371        self.activation_fn = utils.get_activation_fn(activation_fn)
372        self.layer_norm = LayerNorm(embed_dim)
373
374        if weight is None:
375            weight = nn.Linear(embed_dim, output_dim, bias=False).weight
376        self.weight = weight
377        self.bias = nn.Parameter(torch.zeros(output_dim))
378
379    def forward(self, features, masked_tokens=None, **kwargs):
380        # Only project the masked tokens while training,
381        # saves both memory and computation
382        if masked_tokens is not None:
383            features = features[masked_tokens, :]
384
385        x = self.dense(features)
386        x = self.activation_fn(x)
387        x = self.layer_norm(x)
388        # project back to size of vocabulary with bias
389        x = F.linear(x, self.weight) + self.bias
390        return x
391
392
393class RobertaClassificationHead(nn.Module):
394    """Head for sentence-level classification tasks."""
395
396    def __init__(
397        self,
398        input_dim,
399        inner_dim,
400        num_classes,
401        activation_fn,
402        pooler_dropout,
403        q_noise=0,
404        qn_block_size=8,
405        do_spectral_norm=False,
406    ):
407        super().__init__()
408        self.dense = nn.Linear(input_dim, inner_dim)
409        self.activation_fn = utils.get_activation_fn(activation_fn)
410        self.dropout = nn.Dropout(p=pooler_dropout)
411        self.out_proj = apply_quant_noise_(
412            nn.Linear(inner_dim, num_classes), q_noise, qn_block_size
413        )
414        if do_spectral_norm:
415            if q_noise != 0:
416                raise NotImplementedError(
417                    "Attempting to use Spectral Normalization with Quant Noise. This is not officially supported"
418                )
419            self.out_proj = torch.nn.utils.spectral_norm(self.out_proj)
420
421    def forward(self, features, **kwargs):
422        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
423        x = self.dropout(x)
424        x = self.dense(x)
425        x = self.activation_fn(x)
426        x = self.dropout(x)
427        x = self.out_proj(x)
428        return x
429
430
431class RobertaEncoder(FairseqEncoder):
432    """RoBERTa encoder."""
433
434    def __init__(self, args, dictionary):
435        super().__init__(dictionary)
436
437        # set any missing default values
438        base_architecture(args)
439        self.args = args
440
441        if args.encoder_layers_to_keep:
442            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
443
444        embed_tokens = self.build_embedding(
445            len(dictionary), args.encoder_embed_dim, dictionary.pad()
446        )
447
448        self.sentence_encoder = self.build_encoder(args, dictionary, embed_tokens)
449
450        self.lm_head = self.build_lm_head(
451            embed_dim=args.encoder_embed_dim,
452            output_dim=len(dictionary),
453            activation_fn=args.activation_fn,
454            weight=(
455                self.sentence_encoder.embed_tokens.weight
456                if not args.untie_weights_roberta
457                else None
458            ),
459        )
460
461    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
462        return nn.Embedding(vocab_size, embedding_dim, padding_idx)
463
464    def build_encoder(self, args, dictionary, embed_tokens):
465        encoder = TransformerEncoder(args, dictionary, embed_tokens)
466        encoder.apply(init_bert_params)
467        return encoder
468
469    def build_lm_head(self, embed_dim, output_dim, activation_fn, weight):
470        return RobertaLMHead(embed_dim, output_dim, activation_fn, weight)
471
472    def forward(
473        self,
474        src_tokens,
475        features_only=False,
476        return_all_hiddens=False,
477        masked_tokens=None,
478        **unused,
479    ):
480        """
481        Args:
482            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
483            features_only (bool, optional): skip LM head and just return
484                features. If True, the output will be of shape
485                `(batch, src_len, embed_dim)`.
486            return_all_hiddens (bool, optional): also return all of the
487                intermediate hidden states (default: False).
488
489        Returns:
490            tuple:
491                - the LM output of shape `(batch, src_len, vocab)`
492                - a dictionary of additional data, where 'inner_states'
493                  is a list of hidden states. Note that the hidden
494                  states have shape `(src_len, batch, vocab)`.
495        """
496        x, extra = self.extract_features(
497            src_tokens, return_all_hiddens=return_all_hiddens
498        )
499        if not features_only:
500            x = self.output_layer(x, masked_tokens=masked_tokens)
501        return x, extra
502
503    def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs):
504        encoder_out = self.sentence_encoder(
505            src_tokens,
506            return_all_hiddens=return_all_hiddens,
507            token_embeddings=kwargs.get("token_embeddings", None),
508        )
509        # T x B x C -> B x T x C
510        features = encoder_out["encoder_out"][0].transpose(0, 1)
511        inner_states = encoder_out["encoder_states"] if return_all_hiddens else None
512        return features, {"inner_states": inner_states}
513
514    def output_layer(self, features, masked_tokens=None, **unused):
515        return self.lm_head(features, masked_tokens)
516
517    def max_positions(self):
518        """Maximum output length supported by the encoder."""
519        return self.args.max_positions
520
521
522def safe_getattr(obj, k, default=None):
523    from omegaconf import OmegaConf
524
525    if OmegaConf.is_config(obj):
526        return obj[k] if k in obj and obj[k] is not None else default
527
528    return getattr(obj, k, default)
529
530@register_model_architecture("roberta", "roberta")
531def base_architecture(args):
532    args.encoder_layers = safe_getattr(args, "encoder_layers", 12)
533    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 768)
534    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 3072)
535    args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 12)
536
537    args.dropout = safe_getattr(args, "dropout", 0.1)
538    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
539    args.activation_dropout = safe_getattr(args, "activation_dropout", 0.0)
540    args.pooler_dropout = safe_getattr(args, "pooler_dropout", 0.0)
541
542    args.max_source_positions = safe_getattr(args, "max_positions", 512)
543    args.no_token_positional_embeddings = safe_getattr(
544        args, "no_token_positional_embeddings", False
545    )
546
547    # BERT has a few structural differences compared to the original Transformer
548    args.encoder_learned_pos = safe_getattr(args, "encoder_learned_pos", True)
549    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", True)
550    args.no_scale_embedding = safe_getattr(args, "no_scale_embedding", True)
551    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
552    args.encoder_normalize_before = safe_getattr(args, "encoder_normalize_before", False)
553    args.pooler_activation_fn = safe_getattr(args, "pooler_activation_fn", "tanh")
554    args.untie_weights_roberta = safe_getattr(args, "untie_weights_roberta", False)
555
556    # Adaptive input config
557    args.adaptive_input = safe_getattr(args, "adaptive_input", False)
558
559    # LayerDrop config
560    args.encoder_layerdrop = safe_getattr(args, "encoder_layerdrop", 0.0)
561    args.encoder_layers_to_keep = safe_getattr(args, "encoder_layers_to_keep", None)
562
563    # Quantization noise config
564    args.quant_noise_pq = safe_getattr(args, "quant_noise_pq", 0)
565    args.quant_noise_pq_block_size = safe_getattr(args, "quant_noise_pq_block_size", 8)
566    args.quant_noise_scalar = safe_getattr(args, "quant_noise_scalar", 0)
567
568    # R4F config
569    args.spectral_norm_classification_head = safe_getattr(
570        args, "spectral_norm_classification_head", False
571    )
572
573
574@register_model_architecture("roberta", "roberta_prenorm")
575def roberta_prenorm_architecture(args):
576    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False)
577    args.encoder_normalize_before = safe_getattr(args, "encoder_normalize_before", True)
578    base_architecture(args)
579
580
581@register_model_architecture("roberta", "roberta_base")
582def roberta_base_architecture(args):
583    base_architecture(args)
584
585
586@register_model_architecture("roberta", "roberta_large")
587def roberta_large_architecture(args):
588    args.encoder_layers = safe_getattr(args, "encoder_layers", 24)
589    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1024)
590    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 4096)
591    args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 16)
592    base_architecture(args)
593
594
595@register_model_architecture("roberta", "xlm")
596def xlm_architecture(args):
597    args.encoder_layers = safe_getattr(args, "encoder_layers", 16)
598    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1280)
599    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 1280 * 4)
600    args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 16)
601    base_architecture(args)
602
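The safe_getattr definition near the end of model.py shows why the helper exists: on an OmegaConf config, a key that is present but set to None should still fall back to the default, which plain getattr does not do. A short illustration of the difference, assuming omegaconf and fairseq are installed:

# Sketch: contrast plain getattr with safe_getattr on an OmegaConf config
from omegaconf import OmegaConf
from fairseq.utils import safe_getattr

cfg = OmegaConf.create({"attention_dropout": None})

print(getattr(cfg, "attention_dropout", 0.1))       # None -- the key exists, so the default is ignored
print(safe_getattr(cfg, "attention_dropout", 0.1))  # 0.1  -- None is treated as "not set"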
