docs(bench): update charts · huggingface/optimum-quanto@285862b (original) (raw)

`@@ -12,25 +12,25 @@ Note: the language modeling head (lm_head) of the tested models is not quantized

`

12

12

``

13

13

`The paragraphs below display results for some popular models on an NVIDIA A10 GPU.

`

14

14

``

15

``

`-

meta-llama/Meta-Llama-3-8B

`

``

15

`+

meta-llama/Meta-Llama-3.1-8B

`

16

16

``

17

17

`

`

18

18

`

`

19

``

`-

meta-llama/Meta-llama-3-8B Lambada prediction accuracy

`

``

19

`+

meta-llama/Meta-Llama-3.1-8B Lambada prediction accuracy

`

20

20

`

`

21

21

`

`

22

22

`

`

23

23

``

24

24

`

`

25

25

`

`

26

``

`-

meta-llama/Meta-Llama-3-8B WikiText perplexity

`

``

26

`+

meta-llama/Meta-Llama-3.1-8B WikiText perplexity

`

27

27

`

`

28

28

`

`

29

29

`

`

30

30

``

31

31

`

`

32

32

`

`

33

``

`-

meta-llama/Meta-Llama-3-8B Latency

`

``

33

`+

meta-llama/Meta-Llama-3.1-8B Latency

`

34

34

`

`

35

35

`

`

36

36

`

`

`@@ -39,21 +39,21 @@ The paragraphs below display results for some popular models on a NVIDIA A10 GPU

`

39

39

``

40

40

`

`

41

41

`

`

42

``

`-

mistralai/Mistral-7B-Instruct-v0.3 Lambada prediction accuracy

`

``

42

`+

mistralai/Mistral-7B-Instruct-v0.3 Lambada prediction accuracy

`

43

43

`

`

44

44

`

`

45

45

`

`

46

46

``

47

47

`

`

48

48

`

`

49

``

`-

mistralai/Mistral-7B-Instruct-v0.3 WikiText perplexity

`

``

49

`+

mistralai/Mistral-7B-Instruct-v0.3 WikiText perplexity

`

50

50

`

`

51

51

`

`

52

52

`

`

53

53

``

54

54

`

`

55

55

`

`

56

``

`-

mistralai/Mistral-7B-Instruct-v0.3 Latency

`

``

56

`+

mistralai/Mistral-7B-Instruct-v0.3 Latency

`

57

57

`

`

58

58

`

`

59

59

`

`

`@@ -62,67 +62,21 @@ The paragraphs below display results for some popular models on a NVIDIA A10 GPU

`

62

62

``

63

63

`

`

64

64

`

`

65

``

`-

google-gemma-2b Lambada prediction accuracy

`

``

65

`+

google-gemma-2b Lambada prediction accuracy

`

66

66

`

`

67

67

`

`

68

68

`

`

69

69

``

70

70

`

`

71

71

`

`

72

``

`-

google-gemma-2b WikiText perplexity

`

``

72

`+

google-gemma-2b WikiText perplexity

`

73

73

`

`

74

74

`

`

75

75

`

`

76

76

``

77

77

`

`

78

78

`

`

79

``

`-

google-gemma-2b Latency

`

80

``

`-

`

81

``

`-

`

82

``

`-

`

83

``

-

84

``

`-

EleutherAI-pythia-1b

`

85

``

-

86

``

`-

`

87

``

`-

`

88

``

`-

EleutherAI-pythia-1b Lambada prediction accuracy

`

89

``

`-

`

90

``

`-

`

91

``

`-

`

92

``

-

93

``

`-

`

94

``

`-

`

95

``

`-

EleutherAI-pythia-1b WikiText perplexity

`

96

``

`-

`

97

``

`-

`

98

``

`-

`

99

``

-

100

``

`-

`

101

``

`-

`

102

``

`-

EleutherAI-pythia-1b Latency

`

103

``

`-

`

104

``

`-

`

105

``

`-

`

106

``

-

107

``

`-

princeton-nlp/Sheared-LLaMA-1.3B

`

108

``

-

109

``

`-

`

110

``

`-

`

111

``

`-

princeton-nlp/Sheared-LLaMA-1.3B Lambada prediction accuracy

`

112

``

`-

`

113

``

`-

`

114

``

`-

`

115

``

-

116

``

`-

`

117

``

`-

`

118

``

`-

princeton-nlp/Sheared-LLaMA-1.3B WikiText perplexity

`

119

``

`-

`

120

``

`-

`

121

``

`-

`

122

``

-

123

``

`-

`

124

``

`-

`

125

``

`-

princeton-nlp/Sheared-LLaMA-1.3B Latency

`

``

79

`+

google-gemma-2b Latency

`

126

80

`

`

127

81

`

`

128

82

`

`