diff --git a/previews/PR44/2024-05-28-cuda_5.4/index.html b/2024-05-28-cuda_5.4/index.html similarity index 100% rename from previews/PR44/2024-05-28-cuda_5.4/index.html rename to 2024-05-28-cuda_5.4/index.html diff --git a/previews/PR44/post/2024-05-28-cuda_5.4/index.html b/post/2024-05-28-cuda_5.4/index.html similarity index 93% rename from previews/PR44/post/2024-05-28-cuda_5.4/index.html rename to post/2024-05-28-cuda_5.4/index.html index c38af1c..b6b1f44 100644 --- a/previews/PR44/post/2024-05-28-cuda_5.4/index.html +++ b/post/2024-05-28-cuda_5.4/index.html @@ -4,12 +4,12 @@ - - + + - + - + - - - - - 404 ⋅ JuliaGPU - - - - - - -
-
- - - - -

404: File not found

-

The requested file was not found.

-

Please click here to go to the home page.

- -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/Manifest.toml b/previews/PR44/Manifest.toml deleted file mode 100644 index b3b02f4..0000000 --- a/previews/PR44/Manifest.toml +++ /dev/null @@ -1,237 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.9.2" -manifest_format = "2.0" -project_hash = "471c9743ba70a989d67778bee16a3c4e8c8ad36d" - -[[deps.ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" -version = "1.1.1" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.Crayons]] -git-tree-sha1 = "3f71217b538d7aaee0b69ab47d9b7724ca8afa0d" -uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.0.4" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.DelimitedFiles]] -deps = ["Mmap"] -git-tree-sha1 = "9e2f36d3c96a820c678f2f1f1782582fcf685bae" -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" -version = "1.9.1" - -[[deps.DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.5" - -[[deps.Downloads]] -deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -version = "1.6.0" - -[[deps.ExprTools]] -git-tree-sha1 = "b7e3d17636b348f005f11040025ae8c6f645fe92" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.6" - -[[deps.FileWatching]] -uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" - -[[deps.Franklin]] -deps = ["Dates", "DelimitedFiles", "DocStringExtensions", "ExprTools", "FranklinTemplates", "HTTP", "Literate", "LiveServer", "Logging", "Markdown", "NodeJS", "OrderedCollections", "Pkg", "REPL", "Random"] -git-tree-sha1 = "ca57f7e99cb91d90fa8f969bbc7ffda3b7e09a12" -uuid = "713c75ef-9fc9-4b05-94a9-213340da978e" -version = "0.10.59" - -[[deps.FranklinTemplates]] -deps = ["LiveServer"] -git-tree-sha1 
= "20ec221753e0c6bcac845423089b656538ac4eec" -uuid = "3a985190-f512-4703-8d38-2a7944ed5916" -version = "0.8.22" - -[[deps.HTTP]] -deps = ["Base64", "Dates", "IniFile", "Logging", "MbedTLS", "NetworkOptions", "Sockets", "URIs"] -git-tree-sha1 = "14eece7a3308b4d8be910e265c724a6ba51a9798" -uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "0.9.16" - -[[deps.IOCapture]] -deps = ["Logging", "Random"] -git-tree-sha1 = "f7be53659ab06ddc986428d3a9dcc95f6fa6705a" -uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.2.2" - -[[deps.IniFile]] -deps = ["Test"] -git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8" -uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" -version = "0.5.0" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "8076680b162ada2a031f707ac7b4953e30667a37" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.2" - -[[deps.LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" -version = "0.6.3" - -[[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "7.84.0+0" - -[[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.10.2+0" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.Literate]] -deps = ["Base64", "IOCapture", "JSON", "REPL"] -git-tree-sha1 = "bbebc3c14dbfbe76bfcbabf0937481ac84dc86ef" -uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306" -version = "2.9.3" - -[[deps.LiveServer]] -deps = ["Crayons", "FileWatching", "HTTP", "Pkg", "Sockets", "Test"] -git-tree-sha1 = "99990da121ad310875b3c4dba5954eba54df8cfd" -uuid = 
"16fef848-5104-11e9-1b77-fb7a48bbb589" -version = "0.7.0" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.MbedTLS]] -deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] -git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe" -uuid = "739be429-bea8-5141-9913-cc70e7f3736d" -version = "1.0.3" - -[[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+0" - -[[deps.Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[deps.MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2022.10.11" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" - -[[deps.NodeJS]] -deps = ["Pkg"] -git-tree-sha1 = "905224bbdd4b555c69bb964514cfa387616f0d3a" -uuid = "2bd173c7-0d6d-553b-b6af-13a54713934c" -version = "1.3.0" - -[[deps.OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" - -[[deps.Parsers]] -deps = ["Dates"] -git-tree-sha1 = "a8709b968a1ea6abc2dc1967cb1db6ac9a00dfb6" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.0.5" - -[[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.2" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.Serialization]] -uuid = 
"9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" -version = "1.0.3" - -[[deps.Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" -version = "1.10.0" - -[[deps.Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.URIs]] -git-tree-sha1 = "97bbe755a53fe859669cd907f2d96aee8d2c1355" -uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" -version = "1.3.0" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+0" - -[[deps.nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.48.0+0" - -[[deps.p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+0" \ No newline at end of file diff --git a/previews/PR44/Project.toml b/previews/PR44/Project.toml deleted file mode 100644 index a633fb8..0000000 --- a/previews/PR44/Project.toml +++ /dev/null @@ -1,5 +0,0 @@ -[deps] -Franklin = "713c75ef-9fc9-4b05-94a9-213340da978e" - -[compat] -Franklin = "0.10.35" diff --git a/previews/PR44/assets/favicon.ico b/previews/PR44/assets/favicon.ico deleted file mode 100644 index 9021a68..0000000 Binary files a/previews/PR44/assets/favicon.ico and /dev/null differ diff --git a/previews/PR44/assets/img/amdgpu-performance.png b/previews/PR44/assets/img/amdgpu-performance.png deleted file mode 100644 index bf59cc4..0000000 Binary files a/previews/PR44/assets/img/amdgpu-performance.png and /dev/null differ diff --git a/previews/PR44/assets/img/cuda-performance.png b/previews/PR44/assets/img/cuda-performance.png deleted file mode 100644 index 
d22efc8..0000000 Binary files a/previews/PR44/assets/img/cuda-performance.png and /dev/null differ diff --git a/previews/PR44/assets/logo_crop.png b/previews/PR44/assets/logo_crop.png deleted file mode 100644 index 6313539..0000000 Binary files a/previews/PR44/assets/logo_crop.png and /dev/null differ diff --git a/previews/PR44/css/bootstrap.min.css b/previews/PR44/css/bootstrap.min.css deleted file mode 100644 index 92e3fe8..0000000 --- a/previews/PR44/css/bootstrap.min.css +++ /dev/null @@ -1,7 +0,0 @@ -/*! - * Bootstrap v4.3.1 (https://getbootstrap.com/) - * Copyright 2011-2019 The Bootstrap Authors - * Copyright 2011-2019 Twitter, Inc. - * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) - */:root{--blue:#007bff;--indigo:#6610f2;--purple:#6f42c1;--pink:#e83e8c;--red:#dc3545;--orange:#fd7e14;--yellow:#ffc107;--green:#28a745;--teal:#20c997;--cyan:#17a2b8;--white:#fff;--gray:#6c757d;--gray-dark:#343a40;--primary:#007bff;--secondary:#6c757d;--success:#28a745;--info:#17a2b8;--warning:#ffc107;--danger:#dc3545;--light:#f8f9fa;--dark:#343a40;--breakpoint-xs:0;--breakpoint-sm:576px;--breakpoint-md:768px;--breakpoint-lg:992px;--breakpoint-xl:1200px;--font-family-sans-serif:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";--font-family-monospace:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace}*,::after,::before{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}article,aside,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color 
Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;text-align:left;background-color:#fff}[tabindex="-1"]:focus{outline:0!important}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[data-original-title],abbr[title]{text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;border-bottom:0;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#007bff;text-decoration:none;background-color:transparent}a:hover{color:#0056b3;text-decoration:underline}a:not([href]):not([tabindex]){color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus,a:not([href]):not([tabindex]):hover{color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus{outline:0}code,kbd,pre,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle;border-style:none}svg{overflow:hidden;vertical-align:middle}table{border-collapse:collapse}caption{padding-top:.75rem;padding-bottom:.75rem;color:#6c757d;text-align:left;caption-side:bottom}th{text-align:inherit}label{display:inline-block;margin-bottom:.5rem}button{border-radius:0}button:focus{outline:1px dotted;outline:5px auto 
-webkit-focus-ring-color}button,input,optgroup,select,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}select{word-wrap:normal}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]:not(:disabled),[type=reset]:not(:disabled),[type=submit]:not(:disabled),button:not(:disabled){cursor:pointer}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{padding:0;border-style:none}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=date],input[type=datetime-local],input[type=month],input[type=time]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;max-width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit;color:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item;cursor:pointer}template{display:none}[hidden]{display:none!important}.h1,.h2,.h3,.h4,.h5,.h6,h1,h2,h3,h4,h5,h6{margin-bottom:.5rem;font-weight:500;line-height:1.2}.h1,h1{font-size:2.5rem}.h2,h2{font-size:2rem}.h3,h3{font-size:1.75rem}.h4,h4{font-size:1.5rem}.h5,h5{font-size:1.25rem}.h6,h6{font-size:1rem}.lead{font-size:1.25rem;font-weight:300}.display-1{font-size:6rem;font-weight:300;line-height:1.2}.display-2{font-size:5.5rem;font-weight:300;line-height:1.2}.display-3{font-size:4.5rem;font-weight:300;line-height:1.2}.display-4{font-size:3.5rem;font-weight:300;line-height:1.2}hr{margin-top:1rem;margin-bottom:1rem;border:0;border-top:1px solid 
rgba(0,0,0,.1)}.small,small{font-size:80%;font-weight:400}.mark,mark{padding:.2em;background-color:#fcf8e3}.list-unstyled{padding-left:0;list-style:none}.list-inline{padding-left:0;list-style:none}.list-inline-item{display:inline-block}.list-inline-item:not(:last-child){margin-right:.5rem}.initialism{font-size:90%;text-transform:uppercase}.blockquote{margin-bottom:1rem;font-size:1.25rem}.blockquote-footer{display:block;font-size:80%;color:#6c757d}.blockquote-footer::before{content:"\2014\00A0"}.img-fluid{max-width:100%;height:auto}.img-thumbnail{padding:.25rem;background-color:#fff;border:1px solid #dee2e6;border-radius:.25rem;max-width:100%;height:auto}.figure{display:inline-block}.figure-img{margin-bottom:.5rem;line-height:1}.figure-caption{font-size:90%;color:#6c757d}code{font-size:87.5%;color:#e83e8c;word-break:break-word}a>code{color:inherit}kbd{padding:.2rem .4rem;font-size:87.5%;color:#fff;background-color:#212529;border-radius:.2rem}kbd kbd{padding:0;font-size:100%;font-weight:700}pre{display:block;font-size:87.5%;color:#212529}pre code{font-size:inherit;color:inherit;word-break:normal}.pre-scrollable{max-height:340px;overflow-y:scroll}.container{width:100%;padding-right:15px;padding-left:15px;margin-right:auto;margin-left:auto}@media (min-width:576px){.container{max-width:540px}}@media (min-width:768px){.container{max-width:720px}}@media (min-width:992px){.container{max-width:960px}}@media 
(min-width:1200px){.container{max-width:1140px}}.container-fluid{width:100%;padding-right:15px;padding-left:15px;margin-right:auto;margin-left:auto}.row{display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;margin-right:-15px;margin-left:-15px}.no-gutters{margin-right:0;margin-left:0}.no-gutters>.col,.no-gutters>[class*=col-]{padding-right:0;padding-left:0}.col,.col-1,.col-10,.col-11,.col-12,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-auto,.col-lg,.col-lg-1,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-lg-auto,.col-md,.col-md-1,.col-md-10,.col-md-11,.col-md-12,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-md-auto,.col-sm,.col-sm-1,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-sm-auto,.col-xl,.col-xl-1,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9,.col-xl-auto{position:relative;width:100%;padding-right:15px;padding-left:15px}.col{-ms-flex-preferred-size:0;flex-basis:0;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto;max-width:100%}.col-1{-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-2{-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-3{-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-4{-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-5{-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-6{-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-7{-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-8{-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-9{-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-10{-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-11{-ms-flex:0 0 91.666667%;flex:0 0 
91.666667%;max-width:91.666667%}.col-12{-ms-flex:0 0 100%;flex:0 0 100%;max-width:100%}.order-first{-ms-flex-order:-1;order:-1}.order-last{-ms-flex-order:13;order:13}.order-0{-ms-flex-order:0;order:0}.order-1{-ms-flex-order:1;order:1}.order-2{-ms-flex-order:2;order:2}.order-3{-ms-flex-order:3;order:3}.order-4{-ms-flex-order:4;order:4}.order-5{-ms-flex-order:5;order:5}.order-6{-ms-flex-order:6;order:6}.order-7{-ms-flex-order:7;order:7}.order-8{-ms-flex-order:8;order:8}.order-9{-ms-flex-order:9;order:9}.order-10{-ms-flex-order:10;order:10}.order-11{-ms-flex-order:11;order:11}.order-12{-ms-flex-order:12;order:12}.offset-1{margin-left:8.333333%}.offset-2{margin-left:16.666667%}.offset-3{margin-left:25%}.offset-4{margin-left:33.333333%}.offset-5{margin-left:41.666667%}.offset-6{margin-left:50%}.offset-7{margin-left:58.333333%}.offset-8{margin-left:66.666667%}.offset-9{margin-left:75%}.offset-10{margin-left:83.333333%}.offset-11{margin-left:91.666667%}@media (min-width:576px){.col-sm{-ms-flex-preferred-size:0;flex-basis:0;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-sm-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto;max-width:100%}.col-sm-1{-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-sm-2{-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-sm-3{-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-sm-4{-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-sm-5{-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-sm-6{-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-sm-7{-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-sm-8{-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-sm-9{-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-sm-10{-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-sm-11{-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-sm-12{-ms-flex:0 0 100%;flex:0 0 
100%;max-width:100%}.order-sm-first{-ms-flex-order:-1;order:-1}.order-sm-last{-ms-flex-order:13;order:13}.order-sm-0{-ms-flex-order:0;order:0}.order-sm-1{-ms-flex-order:1;order:1}.order-sm-2{-ms-flex-order:2;order:2}.order-sm-3{-ms-flex-order:3;order:3}.order-sm-4{-ms-flex-order:4;order:4}.order-sm-5{-ms-flex-order:5;order:5}.order-sm-6{-ms-flex-order:6;order:6}.order-sm-7{-ms-flex-order:7;order:7}.order-sm-8{-ms-flex-order:8;order:8}.order-sm-9{-ms-flex-order:9;order:9}.order-sm-10{-ms-flex-order:10;order:10}.order-sm-11{-ms-flex-order:11;order:11}.order-sm-12{-ms-flex-order:12;order:12}.offset-sm-0{margin-left:0}.offset-sm-1{margin-left:8.333333%}.offset-sm-2{margin-left:16.666667%}.offset-sm-3{margin-left:25%}.offset-sm-4{margin-left:33.333333%}.offset-sm-5{margin-left:41.666667%}.offset-sm-6{margin-left:50%}.offset-sm-7{margin-left:58.333333%}.offset-sm-8{margin-left:66.666667%}.offset-sm-9{margin-left:75%}.offset-sm-10{margin-left:83.333333%}.offset-sm-11{margin-left:91.666667%}}@media (min-width:768px){.col-md{-ms-flex-preferred-size:0;flex-basis:0;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-md-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto;max-width:100%}.col-md-1{-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-md-2{-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-md-3{-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-md-4{-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-md-5{-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-md-6{-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-md-7{-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-md-8{-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-md-9{-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-md-10{-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-md-11{-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-md-12{-ms-flex:0 0 100%;flex:0 0 
100%;max-width:100%}.order-md-first{-ms-flex-order:-1;order:-1}.order-md-last{-ms-flex-order:13;order:13}.order-md-0{-ms-flex-order:0;order:0}.order-md-1{-ms-flex-order:1;order:1}.order-md-2{-ms-flex-order:2;order:2}.order-md-3{-ms-flex-order:3;order:3}.order-md-4{-ms-flex-order:4;order:4}.order-md-5{-ms-flex-order:5;order:5}.order-md-6{-ms-flex-order:6;order:6}.order-md-7{-ms-flex-order:7;order:7}.order-md-8{-ms-flex-order:8;order:8}.order-md-9{-ms-flex-order:9;order:9}.order-md-10{-ms-flex-order:10;order:10}.order-md-11{-ms-flex-order:11;order:11}.order-md-12{-ms-flex-order:12;order:12}.offset-md-0{margin-left:0}.offset-md-1{margin-left:8.333333%}.offset-md-2{margin-left:16.666667%}.offset-md-3{margin-left:25%}.offset-md-4{margin-left:33.333333%}.offset-md-5{margin-left:41.666667%}.offset-md-6{margin-left:50%}.offset-md-7{margin-left:58.333333%}.offset-md-8{margin-left:66.666667%}.offset-md-9{margin-left:75%}.offset-md-10{margin-left:83.333333%}.offset-md-11{margin-left:91.666667%}}@media (min-width:992px){.col-lg{-ms-flex-preferred-size:0;flex-basis:0;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-lg-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto;max-width:100%}.col-lg-1{-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-lg-2{-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-lg-3{-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-lg-4{-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-lg-5{-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-lg-6{-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-lg-7{-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-lg-8{-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-lg-9{-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-lg-10{-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-lg-11{-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-lg-12{-ms-flex:0 0 100%;flex:0 0 
100%;max-width:100%}.order-lg-first{-ms-flex-order:-1;order:-1}.order-lg-last{-ms-flex-order:13;order:13}.order-lg-0{-ms-flex-order:0;order:0}.order-lg-1{-ms-flex-order:1;order:1}.order-lg-2{-ms-flex-order:2;order:2}.order-lg-3{-ms-flex-order:3;order:3}.order-lg-4{-ms-flex-order:4;order:4}.order-lg-5{-ms-flex-order:5;order:5}.order-lg-6{-ms-flex-order:6;order:6}.order-lg-7{-ms-flex-order:7;order:7}.order-lg-8{-ms-flex-order:8;order:8}.order-lg-9{-ms-flex-order:9;order:9}.order-lg-10{-ms-flex-order:10;order:10}.order-lg-11{-ms-flex-order:11;order:11}.order-lg-12{-ms-flex-order:12;order:12}.offset-lg-0{margin-left:0}.offset-lg-1{margin-left:8.333333%}.offset-lg-2{margin-left:16.666667%}.offset-lg-3{margin-left:25%}.offset-lg-4{margin-left:33.333333%}.offset-lg-5{margin-left:41.666667%}.offset-lg-6{margin-left:50%}.offset-lg-7{margin-left:58.333333%}.offset-lg-8{margin-left:66.666667%}.offset-lg-9{margin-left:75%}.offset-lg-10{margin-left:83.333333%}.offset-lg-11{margin-left:91.666667%}}@media (min-width:1200px){.col-xl{-ms-flex-preferred-size:0;flex-basis:0;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-xl-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto;max-width:100%}.col-xl-1{-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-xl-2{-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-xl-3{-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-xl-4{-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-xl-5{-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-xl-6{-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-xl-7{-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-xl-8{-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-xl-9{-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-xl-10{-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-xl-11{-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-xl-12{-ms-flex:0 0 100%;flex:0 0 
100%;max-width:100%}.order-xl-first{-ms-flex-order:-1;order:-1}.order-xl-last{-ms-flex-order:13;order:13}.order-xl-0{-ms-flex-order:0;order:0}.order-xl-1{-ms-flex-order:1;order:1}.order-xl-2{-ms-flex-order:2;order:2}.order-xl-3{-ms-flex-order:3;order:3}.order-xl-4{-ms-flex-order:4;order:4}.order-xl-5{-ms-flex-order:5;order:5}.order-xl-6{-ms-flex-order:6;order:6}.order-xl-7{-ms-flex-order:7;order:7}.order-xl-8{-ms-flex-order:8;order:8}.order-xl-9{-ms-flex-order:9;order:9}.order-xl-10{-ms-flex-order:10;order:10}.order-xl-11{-ms-flex-order:11;order:11}.order-xl-12{-ms-flex-order:12;order:12}.offset-xl-0{margin-left:0}.offset-xl-1{margin-left:8.333333%}.offset-xl-2{margin-left:16.666667%}.offset-xl-3{margin-left:25%}.offset-xl-4{margin-left:33.333333%}.offset-xl-5{margin-left:41.666667%}.offset-xl-6{margin-left:50%}.offset-xl-7{margin-left:58.333333%}.offset-xl-8{margin-left:66.666667%}.offset-xl-9{margin-left:75%}.offset-xl-10{margin-left:83.333333%}.offset-xl-11{margin-left:91.666667%}}.table{width:100%;margin-bottom:1rem;color:#212529}.table td,.table th{padding:.75rem;vertical-align:top;border-top:1px solid #dee2e6}.table thead th{vertical-align:bottom;border-bottom:2px solid #dee2e6}.table tbody+tbody{border-top:2px solid #dee2e6}.table-sm td,.table-sm th{padding:.3rem}.table-bordered{border:1px solid #dee2e6}.table-bordered td,.table-bordered th{border:1px solid #dee2e6}.table-bordered thead td,.table-bordered thead th{border-bottom-width:2px}.table-borderless tbody+tbody,.table-borderless td,.table-borderless th,.table-borderless thead th{border:0}.table-striped tbody tr:nth-of-type(odd){background-color:rgba(0,0,0,.05)}.table-hover tbody tr:hover{color:#212529;background-color:rgba(0,0,0,.075)}.table-primary,.table-primary>td,.table-primary>th{background-color:#b8daff}.table-primary tbody+tbody,.table-primary td,.table-primary th,.table-primary thead th{border-color:#7abaff}.table-hover .table-primary:hover{background-color:#9fcdff}.table-hover 
.table-primary:hover>td,.table-hover .table-primary:hover>th{background-color:#9fcdff}.table-secondary,.table-secondary>td,.table-secondary>th{background-color:#d6d8db}.table-secondary tbody+tbody,.table-secondary td,.table-secondary th,.table-secondary thead th{border-color:#b3b7bb}.table-hover .table-secondary:hover{background-color:#c8cbcf}.table-hover .table-secondary:hover>td,.table-hover .table-secondary:hover>th{background-color:#c8cbcf}.table-success,.table-success>td,.table-success>th{background-color:#c3e6cb}.table-success tbody+tbody,.table-success td,.table-success th,.table-success thead th{border-color:#8fd19e}.table-hover .table-success:hover{background-color:#b1dfbb}.table-hover .table-success:hover>td,.table-hover .table-success:hover>th{background-color:#b1dfbb}.table-info,.table-info>td,.table-info>th{background-color:#bee5eb}.table-info tbody+tbody,.table-info td,.table-info th,.table-info thead th{border-color:#86cfda}.table-hover .table-info:hover{background-color:#abdde5}.table-hover .table-info:hover>td,.table-hover .table-info:hover>th{background-color:#abdde5}.table-warning,.table-warning>td,.table-warning>th{background-color:#ffeeba}.table-warning tbody+tbody,.table-warning td,.table-warning th,.table-warning thead th{border-color:#ffdf7e}.table-hover .table-warning:hover{background-color:#ffe8a1}.table-hover .table-warning:hover>td,.table-hover .table-warning:hover>th{background-color:#ffe8a1}.table-danger,.table-danger>td,.table-danger>th{background-color:#f5c6cb}.table-danger tbody+tbody,.table-danger td,.table-danger th,.table-danger thead th{border-color:#ed969e}.table-hover .table-danger:hover{background-color:#f1b0b7}.table-hover .table-danger:hover>td,.table-hover .table-danger:hover>th{background-color:#f1b0b7}.table-light,.table-light>td,.table-light>th{background-color:#fdfdfe}.table-light tbody+tbody,.table-light td,.table-light th,.table-light thead th{border-color:#fbfcfc}.table-hover 
.table-light:hover{background-color:#ececf6}.table-hover .table-light:hover>td,.table-hover .table-light:hover>th{background-color:#ececf6}.table-dark,.table-dark>td,.table-dark>th{background-color:#c6c8ca}.table-dark tbody+tbody,.table-dark td,.table-dark th,.table-dark thead th{border-color:#95999c}.table-hover .table-dark:hover{background-color:#b9bbbe}.table-hover .table-dark:hover>td,.table-hover .table-dark:hover>th{background-color:#b9bbbe}.table-active,.table-active>td,.table-active>th{background-color:rgba(0,0,0,.075)}.table-hover .table-active:hover{background-color:rgba(0,0,0,.075)}.table-hover .table-active:hover>td,.table-hover .table-active:hover>th{background-color:rgba(0,0,0,.075)}.table .thead-dark th{color:#fff;background-color:#343a40;border-color:#454d55}.table .thead-light th{color:#495057;background-color:#e9ecef;border-color:#dee2e6}.table-dark{color:#fff;background-color:#343a40}.table-dark td,.table-dark th,.table-dark thead th{border-color:#454d55}.table-dark.table-bordered{border:0}.table-dark.table-striped tbody tr:nth-of-type(odd){background-color:rgba(255,255,255,.05)}.table-dark.table-hover tbody tr:hover{color:#fff;background-color:rgba(255,255,255,.075)}@media (max-width:575.98px){.table-responsive-sm{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-sm>.table-bordered{border:0}}@media (max-width:767.98px){.table-responsive-md{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-md>.table-bordered{border:0}}@media (max-width:991.98px){.table-responsive-lg{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-lg>.table-bordered{border:0}}@media 
(max-width:1199.98px){.table-responsive-xl{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive-xl>.table-bordered{border:0}}.table-responsive{display:block;width:100%;overflow-x:auto;-webkit-overflow-scrolling:touch}.table-responsive>.table-bordered{border:0}.form-control{display:block;width:100%;height:calc(1.5em + .75rem + 2px);padding:.375rem .75rem;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;background-color:#fff;background-clip:padding-box;border:1px solid #ced4da;border-radius:.25rem;transition:border-color .15s ease-in-out,box-shadow .15s ease-in-out}@media (prefers-reduced-motion:reduce){.form-control{transition:none}}.form-control::-ms-expand{background-color:transparent;border:0}.form-control:focus{color:#495057;background-color:#fff;border-color:#80bdff;outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,.25)}.form-control::-webkit-input-placeholder{color:#6c757d;opacity:1}.form-control::-moz-placeholder{color:#6c757d;opacity:1}.form-control:-ms-input-placeholder{color:#6c757d;opacity:1}.form-control::-ms-input-placeholder{color:#6c757d;opacity:1}.form-control::placeholder{color:#6c757d;opacity:1}.form-control:disabled,.form-control[readonly]{background-color:#e9ecef;opacity:1}select.form-control:focus::-ms-value{color:#495057;background-color:#fff}.form-control-file,.form-control-range{display:block;width:100%}.col-form-label{padding-top:calc(.375rem + 1px);padding-bottom:calc(.375rem + 1px);margin-bottom:0;font-size:inherit;line-height:1.5}.col-form-label-lg{padding-top:calc(.5rem + 1px);padding-bottom:calc(.5rem + 1px);font-size:1.25rem;line-height:1.5}.col-form-label-sm{padding-top:calc(.25rem + 1px);padding-bottom:calc(.25rem + 1px);font-size:.875rem;line-height:1.5}.form-control-plaintext{display:block;width:100%;padding-top:.375rem;padding-bottom:.375rem;margin-bottom:0;line-height:1.5;color:#212529;background-color:transparent;border:solid transparent;border-width:1px 
0}.form-control-plaintext.form-control-lg,.form-control-plaintext.form-control-sm{padding-right:0;padding-left:0}.form-control-sm{height:calc(1.5em + .5rem + 2px);padding:.25rem .5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.form-control-lg{height:calc(1.5em + 1rem + 2px);padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}select.form-control[multiple],select.form-control[size]{height:auto}textarea.form-control{height:auto}.form-group{margin-bottom:1rem}.form-text{display:block;margin-top:.25rem}.form-row{display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;margin-right:-5px;margin-left:-5px}.form-row>.col,.form-row>[class*=col-]{padding-right:5px;padding-left:5px}.form-check{position:relative;display:block;padding-left:1.25rem}.form-check-input{position:absolute;margin-top:.3rem;margin-left:-1.25rem}.form-check-input:disabled~.form-check-label{color:#6c757d}.form-check-label{margin-bottom:0}.form-check-inline{display:-ms-inline-flexbox;display:inline-flex;-ms-flex-align:center;align-items:center;padding-left:0;margin-right:.75rem}.form-check-inline .form-check-input{position:static;margin-top:0;margin-right:.3125rem;margin-left:0}.valid-feedback{display:none;width:100%;margin-top:.25rem;font-size:80%;color:#28a745}.valid-tooltip{position:absolute;top:100%;z-index:5;display:none;max-width:100%;padding:.25rem .5rem;margin-top:.1rem;font-size:.875rem;line-height:1.5;color:#fff;background-color:rgba(40,167,69,.9);border-radius:.25rem}.form-control.is-valid,.was-validated .form-control:valid{border-color:#28a745;padding-right:calc(1.5em + .75rem);background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%2328a745' d='M2.3 6.73L.6 4.53c-.4-1.04.46-1.4 1.1-.8l1.1 1.4 3.4-3.8c.6-.63 1.6-.27 1.2.7l-4 4.6c-.43.5-.8.4-1.1.1z'/%3e%3c/svg%3e");background-repeat:no-repeat;background-position:center right calc(.375em + .1875rem);background-size:calc(.75em + .375rem) 
calc(.75em + .375rem)}.form-control.is-valid:focus,.was-validated .form-control:valid:focus{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,.25)}.form-control.is-valid~.valid-feedback,.form-control.is-valid~.valid-tooltip,.was-validated .form-control:valid~.valid-feedback,.was-validated .form-control:valid~.valid-tooltip{display:block}.was-validated textarea.form-control:valid,textarea.form-control.is-valid{padding-right:calc(1.5em + .75rem);background-position:top calc(.375em + .1875rem) right calc(.375em + .1875rem)}.custom-select.is-valid,.was-validated .custom-select:valid{border-color:#28a745;padding-right:calc((1em + .75rem) * 3 / 4 + 1.75rem);background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px,url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%2328a745' d='M2.3 6.73L.6 4.53c-.4-1.04.46-1.4 1.1-.8l1.1 1.4 3.4-3.8c.6-.63 1.6-.27 1.2.7l-4 4.6c-.43.5-.8.4-1.1.1z'/%3e%3c/svg%3e") #fff no-repeat center right 1.75rem/calc(.75em + .375rem) calc(.75em + .375rem)}.custom-select.is-valid:focus,.was-validated .custom-select:valid:focus{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,.25)}.custom-select.is-valid~.valid-feedback,.custom-select.is-valid~.valid-tooltip,.was-validated .custom-select:valid~.valid-feedback,.was-validated .custom-select:valid~.valid-tooltip{display:block}.form-control-file.is-valid~.valid-feedback,.form-control-file.is-valid~.valid-tooltip,.was-validated .form-control-file:valid~.valid-feedback,.was-validated .form-control-file:valid~.valid-tooltip{display:block}.form-check-input.is-valid~.form-check-label,.was-validated .form-check-input:valid~.form-check-label{color:#28a745}.form-check-input.is-valid~.valid-feedback,.form-check-input.is-valid~.valid-tooltip,.was-validated 
.form-check-input:valid~.valid-feedback,.was-validated .form-check-input:valid~.valid-tooltip{display:block}.custom-control-input.is-valid~.custom-control-label,.was-validated .custom-control-input:valid~.custom-control-label{color:#28a745}.custom-control-input.is-valid~.custom-control-label::before,.was-validated .custom-control-input:valid~.custom-control-label::before{border-color:#28a745}.custom-control-input.is-valid~.valid-feedback,.custom-control-input.is-valid~.valid-tooltip,.was-validated .custom-control-input:valid~.valid-feedback,.was-validated .custom-control-input:valid~.valid-tooltip{display:block}.custom-control-input.is-valid:checked~.custom-control-label::before,.was-validated .custom-control-input:valid:checked~.custom-control-label::before{border-color:#34ce57;background-color:#34ce57}.custom-control-input.is-valid:focus~.custom-control-label::before,.was-validated .custom-control-input:valid:focus~.custom-control-label::before{box-shadow:0 0 0 .2rem rgba(40,167,69,.25)}.custom-control-input.is-valid:focus:not(:checked)~.custom-control-label::before,.was-validated .custom-control-input:valid:focus:not(:checked)~.custom-control-label::before{border-color:#28a745}.custom-file-input.is-valid~.custom-file-label,.was-validated .custom-file-input:valid~.custom-file-label{border-color:#28a745}.custom-file-input.is-valid~.valid-feedback,.custom-file-input.is-valid~.valid-tooltip,.was-validated .custom-file-input:valid~.valid-feedback,.was-validated .custom-file-input:valid~.valid-tooltip{display:block}.custom-file-input.is-valid:focus~.custom-file-label,.was-validated .custom-file-input:valid:focus~.custom-file-label{border-color:#28a745;box-shadow:0 0 0 .2rem rgba(40,167,69,.25)}.invalid-feedback{display:none;width:100%;margin-top:.25rem;font-size:80%;color:#dc3545}.invalid-tooltip{position:absolute;top:100%;z-index:5;display:none;max-width:100%;padding:.25rem 
.5rem;margin-top:.1rem;font-size:.875rem;line-height:1.5;color:#fff;background-color:rgba(220,53,69,.9);border-radius:.25rem}.form-control.is-invalid,.was-validated .form-control:invalid{border-color:#dc3545;padding-right:calc(1.5em + .75rem);background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23dc3545' viewBox='-2 -2 7 7'%3e%3cpath stroke='%23dc3545' d='M0 0l3 3m0-3L0 3'/%3e%3ccircle r='.5'/%3e%3ccircle cx='3' r='.5'/%3e%3ccircle cy='3' r='.5'/%3e%3ccircle cx='3' cy='3' r='.5'/%3e%3c/svg%3E");background-repeat:no-repeat;background-position:center right calc(.375em + .1875rem);background-size:calc(.75em + .375rem) calc(.75em + .375rem)}.form-control.is-invalid:focus,.was-validated .form-control:invalid:focus{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,.25)}.form-control.is-invalid~.invalid-feedback,.form-control.is-invalid~.invalid-tooltip,.was-validated .form-control:invalid~.invalid-feedback,.was-validated .form-control:invalid~.invalid-tooltip{display:block}.was-validated textarea.form-control:invalid,textarea.form-control.is-invalid{padding-right:calc(1.5em + .75rem);background-position:top calc(.375em + .1875rem) right calc(.375em + .1875rem)}.custom-select.is-invalid,.was-validated .custom-select:invalid{border-color:#dc3545;padding-right:calc((1em + .75rem) * 3 / 4 + 1.75rem);background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px,url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23dc3545' viewBox='-2 -2 7 7'%3e%3cpath stroke='%23dc3545' d='M0 0l3 3m0-3L0 3'/%3e%3ccircle r='.5'/%3e%3ccircle cx='3' r='.5'/%3e%3ccircle cy='3' r='.5'/%3e%3ccircle cx='3' cy='3' r='.5'/%3e%3c/svg%3E") #fff no-repeat center right 1.75rem/calc(.75em + .375rem) calc(.75em + .375rem)}.custom-select.is-invalid:focus,.was-validated 
.custom-select:invalid:focus{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,.25)}.custom-select.is-invalid~.invalid-feedback,.custom-select.is-invalid~.invalid-tooltip,.was-validated .custom-select:invalid~.invalid-feedback,.was-validated .custom-select:invalid~.invalid-tooltip{display:block}.form-control-file.is-invalid~.invalid-feedback,.form-control-file.is-invalid~.invalid-tooltip,.was-validated .form-control-file:invalid~.invalid-feedback,.was-validated .form-control-file:invalid~.invalid-tooltip{display:block}.form-check-input.is-invalid~.form-check-label,.was-validated .form-check-input:invalid~.form-check-label{color:#dc3545}.form-check-input.is-invalid~.invalid-feedback,.form-check-input.is-invalid~.invalid-tooltip,.was-validated .form-check-input:invalid~.invalid-feedback,.was-validated .form-check-input:invalid~.invalid-tooltip{display:block}.custom-control-input.is-invalid~.custom-control-label,.was-validated .custom-control-input:invalid~.custom-control-label{color:#dc3545}.custom-control-input.is-invalid~.custom-control-label::before,.was-validated .custom-control-input:invalid~.custom-control-label::before{border-color:#dc3545}.custom-control-input.is-invalid~.invalid-feedback,.custom-control-input.is-invalid~.invalid-tooltip,.was-validated .custom-control-input:invalid~.invalid-feedback,.was-validated .custom-control-input:invalid~.invalid-tooltip{display:block}.custom-control-input.is-invalid:checked~.custom-control-label::before,.was-validated .custom-control-input:invalid:checked~.custom-control-label::before{border-color:#e4606d;background-color:#e4606d}.custom-control-input.is-invalid:focus~.custom-control-label::before,.was-validated .custom-control-input:invalid:focus~.custom-control-label::before{box-shadow:0 0 0 .2rem rgba(220,53,69,.25)}.custom-control-input.is-invalid:focus:not(:checked)~.custom-control-label::before,.was-validated 
.custom-control-input:invalid:focus:not(:checked)~.custom-control-label::before{border-color:#dc3545}.custom-file-input.is-invalid~.custom-file-label,.was-validated .custom-file-input:invalid~.custom-file-label{border-color:#dc3545}.custom-file-input.is-invalid~.invalid-feedback,.custom-file-input.is-invalid~.invalid-tooltip,.was-validated .custom-file-input:invalid~.invalid-feedback,.was-validated .custom-file-input:invalid~.invalid-tooltip{display:block}.custom-file-input.is-invalid:focus~.custom-file-label,.was-validated .custom-file-input:invalid:focus~.custom-file-label{border-color:#dc3545;box-shadow:0 0 0 .2rem rgba(220,53,69,.25)}.form-inline{display:-ms-flexbox;display:flex;-ms-flex-flow:row wrap;flex-flow:row wrap;-ms-flex-align:center;align-items:center}.form-inline .form-check{width:100%}@media (min-width:576px){.form-inline label{display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center;-ms-flex-pack:center;justify-content:center;margin-bottom:0}.form-inline .form-group{display:-ms-flexbox;display:flex;-ms-flex:0 0 auto;flex:0 0 auto;-ms-flex-flow:row wrap;flex-flow:row wrap;-ms-flex-align:center;align-items:center;margin-bottom:0}.form-inline .form-control{display:inline-block;width:auto;vertical-align:middle}.form-inline .form-control-plaintext{display:inline-block}.form-inline .custom-select,.form-inline .input-group{width:auto}.form-inline .form-check{display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center;-ms-flex-pack:center;justify-content:center;width:auto;padding-left:0}.form-inline .form-check-input{position:relative;-ms-flex-negative:0;flex-shrink:0;margin-top:0;margin-right:.25rem;margin-left:0}.form-inline .custom-control{-ms-flex-align:center;align-items:center;-ms-flex-pack:center;justify-content:center}.form-inline 
.custom-control-label{margin-bottom:0}}.btn{display:inline-block;font-weight:400;color:#212529;text-align:center;vertical-align:middle;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;background-color:transparent;border:1px solid transparent;padding:.375rem .75rem;font-size:1rem;line-height:1.5;border-radius:.25rem;transition:color .15s ease-in-out,background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out}@media (prefers-reduced-motion:reduce){.btn{transition:none}}.btn:hover{color:#212529;text-decoration:none}.btn.focus,.btn:focus{outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,.25)}.btn.disabled,.btn:disabled{opacity:.65}a.btn.disabled,fieldset:disabled a.btn{pointer-events:none}.btn-primary{color:#fff;background-color:#007bff;border-color:#007bff}.btn-primary:hover{color:#fff;background-color:#0069d9;border-color:#0062cc}.btn-primary.focus,.btn-primary:focus{box-shadow:0 0 0 .2rem rgba(38,143,255,.5)}.btn-primary.disabled,.btn-primary:disabled{color:#fff;background-color:#007bff;border-color:#007bff}.btn-primary:not(:disabled):not(.disabled).active,.btn-primary:not(:disabled):not(.disabled):active,.show>.btn-primary.dropdown-toggle{color:#fff;background-color:#0062cc;border-color:#005cbf}.btn-primary:not(:disabled):not(.disabled).active:focus,.btn-primary:not(:disabled):not(.disabled):active:focus,.show>.btn-primary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(38,143,255,.5)}.btn-secondary{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-secondary:hover{color:#fff;background-color:#5a6268;border-color:#545b62}.btn-secondary.focus,.btn-secondary:focus{box-shadow:0 0 0 .2rem 
rgba(130,138,145,.5)}.btn-secondary.disabled,.btn-secondary:disabled{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-secondary:not(:disabled):not(.disabled).active,.btn-secondary:not(:disabled):not(.disabled):active,.show>.btn-secondary.dropdown-toggle{color:#fff;background-color:#545b62;border-color:#4e555b}.btn-secondary:not(:disabled):not(.disabled).active:focus,.btn-secondary:not(:disabled):not(.disabled):active:focus,.show>.btn-secondary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(130,138,145,.5)}.btn-success{color:#fff;background-color:#28a745;border-color:#28a745}.btn-success:hover{color:#fff;background-color:#218838;border-color:#1e7e34}.btn-success.focus,.btn-success:focus{box-shadow:0 0 0 .2rem rgba(72,180,97,.5)}.btn-success.disabled,.btn-success:disabled{color:#fff;background-color:#28a745;border-color:#28a745}.btn-success:not(:disabled):not(.disabled).active,.btn-success:not(:disabled):not(.disabled):active,.show>.btn-success.dropdown-toggle{color:#fff;background-color:#1e7e34;border-color:#1c7430}.btn-success:not(:disabled):not(.disabled).active:focus,.btn-success:not(:disabled):not(.disabled):active:focus,.show>.btn-success.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(72,180,97,.5)}.btn-info{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-info:hover{color:#fff;background-color:#138496;border-color:#117a8b}.btn-info.focus,.btn-info:focus{box-shadow:0 0 0 .2rem rgba(58,176,195,.5)}.btn-info.disabled,.btn-info:disabled{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-info:not(:disabled):not(.disabled).active,.btn-info:not(:disabled):not(.disabled):active,.show>.btn-info.dropdown-toggle{color:#fff;background-color:#117a8b;border-color:#10707f}.btn-info:not(:disabled):not(.disabled).active:focus,.btn-info:not(:disabled):not(.disabled):active:focus,.show>.btn-info.dropdown-toggle:focus{box-shadow:0 0 0 .2rem 
rgba(58,176,195,.5)}.btn-warning{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-warning:hover{color:#212529;background-color:#e0a800;border-color:#d39e00}.btn-warning.focus,.btn-warning:focus{box-shadow:0 0 0 .2rem rgba(222,170,12,.5)}.btn-warning.disabled,.btn-warning:disabled{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-warning:not(:disabled):not(.disabled).active,.btn-warning:not(:disabled):not(.disabled):active,.show>.btn-warning.dropdown-toggle{color:#212529;background-color:#d39e00;border-color:#c69500}.btn-warning:not(:disabled):not(.disabled).active:focus,.btn-warning:not(:disabled):not(.disabled):active:focus,.show>.btn-warning.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(222,170,12,.5)}.btn-danger{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-danger:hover{color:#fff;background-color:#c82333;border-color:#bd2130}.btn-danger.focus,.btn-danger:focus{box-shadow:0 0 0 .2rem rgba(225,83,97,.5)}.btn-danger.disabled,.btn-danger:disabled{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-danger:not(:disabled):not(.disabled).active,.btn-danger:not(:disabled):not(.disabled):active,.show>.btn-danger.dropdown-toggle{color:#fff;background-color:#bd2130;border-color:#b21f2d}.btn-danger:not(:disabled):not(.disabled).active:focus,.btn-danger:not(:disabled):not(.disabled):active:focus,.show>.btn-danger.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(225,83,97,.5)}.btn-light{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-light:hover{color:#212529;background-color:#e2e6ea;border-color:#dae0e5}.btn-light.focus,.btn-light:focus{box-shadow:0 0 0 .2rem 
rgba(216,217,219,.5)}.btn-light.disabled,.btn-light:disabled{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-light:not(:disabled):not(.disabled).active,.btn-light:not(:disabled):not(.disabled):active,.show>.btn-light.dropdown-toggle{color:#212529;background-color:#dae0e5;border-color:#d3d9df}.btn-light:not(:disabled):not(.disabled).active:focus,.btn-light:not(:disabled):not(.disabled):active:focus,.show>.btn-light.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(216,217,219,.5)}.btn-dark{color:#fff;background-color:#343a40;border-color:#343a40}.btn-dark:hover{color:#fff;background-color:#23272b;border-color:#1d2124}.btn-dark.focus,.btn-dark:focus{box-shadow:0 0 0 .2rem rgba(82,88,93,.5)}.btn-dark.disabled,.btn-dark:disabled{color:#fff;background-color:#343a40;border-color:#343a40}.btn-dark:not(:disabled):not(.disabled).active,.btn-dark:not(:disabled):not(.disabled):active,.show>.btn-dark.dropdown-toggle{color:#fff;background-color:#1d2124;border-color:#171a1d}.btn-dark:not(:disabled):not(.disabled).active:focus,.btn-dark:not(:disabled):not(.disabled):active:focus,.show>.btn-dark.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(82,88,93,.5)}.btn-outline-primary{color:#007bff;border-color:#007bff}.btn-outline-primary:hover{color:#fff;background-color:#007bff;border-color:#007bff}.btn-outline-primary.focus,.btn-outline-primary:focus{box-shadow:0 0 0 .2rem rgba(0,123,255,.5)}.btn-outline-primary.disabled,.btn-outline-primary:disabled{color:#007bff;background-color:transparent}.btn-outline-primary:not(:disabled):not(.disabled).active,.btn-outline-primary:not(:disabled):not(.disabled):active,.show>.btn-outline-primary.dropdown-toggle{color:#fff;background-color:#007bff;border-color:#007bff}.btn-outline-primary:not(:disabled):not(.disabled).active:focus,.btn-outline-primary:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-primary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem 
rgba(0,123,255,.5)}.btn-outline-secondary{color:#6c757d;border-color:#6c757d}.btn-outline-secondary:hover{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-outline-secondary.focus,.btn-outline-secondary:focus{box-shadow:0 0 0 .2rem rgba(108,117,125,.5)}.btn-outline-secondary.disabled,.btn-outline-secondary:disabled{color:#6c757d;background-color:transparent}.btn-outline-secondary:not(:disabled):not(.disabled).active,.btn-outline-secondary:not(:disabled):not(.disabled):active,.show>.btn-outline-secondary.dropdown-toggle{color:#fff;background-color:#6c757d;border-color:#6c757d}.btn-outline-secondary:not(:disabled):not(.disabled).active:focus,.btn-outline-secondary:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-secondary.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(108,117,125,.5)}.btn-outline-success{color:#28a745;border-color:#28a745}.btn-outline-success:hover{color:#fff;background-color:#28a745;border-color:#28a745}.btn-outline-success.focus,.btn-outline-success:focus{box-shadow:0 0 0 .2rem rgba(40,167,69,.5)}.btn-outline-success.disabled,.btn-outline-success:disabled{color:#28a745;background-color:transparent}.btn-outline-success:not(:disabled):not(.disabled).active,.btn-outline-success:not(:disabled):not(.disabled):active,.show>.btn-outline-success.dropdown-toggle{color:#fff;background-color:#28a745;border-color:#28a745}.btn-outline-success:not(:disabled):not(.disabled).active:focus,.btn-outline-success:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-success.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(40,167,69,.5)}.btn-outline-info{color:#17a2b8;border-color:#17a2b8}.btn-outline-info:hover{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-outline-info.focus,.btn-outline-info:focus{box-shadow:0 0 0 .2rem 
rgba(23,162,184,.5)}.btn-outline-info.disabled,.btn-outline-info:disabled{color:#17a2b8;background-color:transparent}.btn-outline-info:not(:disabled):not(.disabled).active,.btn-outline-info:not(:disabled):not(.disabled):active,.show>.btn-outline-info.dropdown-toggle{color:#fff;background-color:#17a2b8;border-color:#17a2b8}.btn-outline-info:not(:disabled):not(.disabled).active:focus,.btn-outline-info:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-info.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(23,162,184,.5)}.btn-outline-warning{color:#ffc107;border-color:#ffc107}.btn-outline-warning:hover{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-outline-warning.focus,.btn-outline-warning:focus{box-shadow:0 0 0 .2rem rgba(255,193,7,.5)}.btn-outline-warning.disabled,.btn-outline-warning:disabled{color:#ffc107;background-color:transparent}.btn-outline-warning:not(:disabled):not(.disabled).active,.btn-outline-warning:not(:disabled):not(.disabled):active,.show>.btn-outline-warning.dropdown-toggle{color:#212529;background-color:#ffc107;border-color:#ffc107}.btn-outline-warning:not(:disabled):not(.disabled).active:focus,.btn-outline-warning:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-warning.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(255,193,7,.5)}.btn-outline-danger{color:#dc3545;border-color:#dc3545}.btn-outline-danger:hover{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-outline-danger.focus,.btn-outline-danger:focus{box-shadow:0 0 0 .2rem 
rgba(220,53,69,.5)}.btn-outline-danger.disabled,.btn-outline-danger:disabled{color:#dc3545;background-color:transparent}.btn-outline-danger:not(:disabled):not(.disabled).active,.btn-outline-danger:not(:disabled):not(.disabled):active,.show>.btn-outline-danger.dropdown-toggle{color:#fff;background-color:#dc3545;border-color:#dc3545}.btn-outline-danger:not(:disabled):not(.disabled).active:focus,.btn-outline-danger:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-danger.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(220,53,69,.5)}.btn-outline-light{color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:hover{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light.focus,.btn-outline-light:focus{box-shadow:0 0 0 .2rem rgba(248,249,250,.5)}.btn-outline-light.disabled,.btn-outline-light:disabled{color:#f8f9fa;background-color:transparent}.btn-outline-light:not(:disabled):not(.disabled).active,.btn-outline-light:not(:disabled):not(.disabled):active,.show>.btn-outline-light.dropdown-toggle{color:#212529;background-color:#f8f9fa;border-color:#f8f9fa}.btn-outline-light:not(:disabled):not(.disabled).active:focus,.btn-outline-light:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-light.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(248,249,250,.5)}.btn-outline-dark{color:#343a40;border-color:#343a40}.btn-outline-dark:hover{color:#fff;background-color:#343a40;border-color:#343a40}.btn-outline-dark.focus,.btn-outline-dark:focus{box-shadow:0 0 0 .2rem 
rgba(52,58,64,.5)}.btn-outline-dark.disabled,.btn-outline-dark:disabled{color:#343a40;background-color:transparent}.btn-outline-dark:not(:disabled):not(.disabled).active,.btn-outline-dark:not(:disabled):not(.disabled):active,.show>.btn-outline-dark.dropdown-toggle{color:#fff;background-color:#343a40;border-color:#343a40}.btn-outline-dark:not(:disabled):not(.disabled).active:focus,.btn-outline-dark:not(:disabled):not(.disabled):active:focus,.show>.btn-outline-dark.dropdown-toggle:focus{box-shadow:0 0 0 .2rem rgba(52,58,64,.5)}.btn-link{font-weight:400;color:#007bff;text-decoration:none}.btn-link:hover{color:#0056b3;text-decoration:underline}.btn-link.focus,.btn-link:focus{text-decoration:underline;box-shadow:none}.btn-link.disabled,.btn-link:disabled{color:#6c757d;pointer-events:none}.btn-group-lg>.btn,.btn-lg{padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}.btn-group-sm>.btn,.btn-sm{padding:.25rem .5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.btn-block{display:block;width:100%}.btn-block+.btn-block{margin-top:.5rem}input[type=button].btn-block,input[type=reset].btn-block,input[type=submit].btn-block{width:100%}.fade{transition:opacity .15s linear}@media (prefers-reduced-motion:reduce){.fade{transition:none}}.fade:not(.show){opacity:0}.collapse:not(.show){display:none}.collapsing{position:relative;height:0;overflow:hidden;transition:height .35s ease}@media (prefers-reduced-motion:reduce){.collapsing{transition:none}}.dropdown,.dropleft,.dropright,.dropup{position:relative}.dropdown-toggle{white-space:nowrap}.dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:.3em solid;border-right:.3em solid transparent;border-bottom:0;border-left:.3em solid transparent}.dropdown-toggle:empty::after{margin-left:0}.dropdown-menu{position:absolute;top:100%;left:0;z-index:1000;display:none;float:left;min-width:10rem;padding:.5rem 0;margin:.125rem 0 
0;font-size:1rem;color:#212529;text-align:left;list-style:none;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,.15);border-radius:.25rem}.dropdown-menu-left{right:auto;left:0}.dropdown-menu-right{right:0;left:auto}@media (min-width:576px){.dropdown-menu-sm-left{right:auto;left:0}.dropdown-menu-sm-right{right:0;left:auto}}@media (min-width:768px){.dropdown-menu-md-left{right:auto;left:0}.dropdown-menu-md-right{right:0;left:auto}}@media (min-width:992px){.dropdown-menu-lg-left{right:auto;left:0}.dropdown-menu-lg-right{right:0;left:auto}}@media (min-width:1200px){.dropdown-menu-xl-left{right:auto;left:0}.dropdown-menu-xl-right{right:0;left:auto}}.dropup .dropdown-menu{top:auto;bottom:100%;margin-top:0;margin-bottom:.125rem}.dropup .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:0;border-right:.3em solid transparent;border-bottom:.3em solid;border-left:.3em solid transparent}.dropup .dropdown-toggle:empty::after{margin-left:0}.dropright .dropdown-menu{top:0;right:auto;left:100%;margin-top:0;margin-left:.125rem}.dropright .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:"";border-top:.3em solid transparent;border-right:0;border-bottom:.3em solid transparent;border-left:.3em solid}.dropright .dropdown-toggle:empty::after{margin-left:0}.dropright .dropdown-toggle::after{vertical-align:0}.dropleft .dropdown-menu{top:0;right:100%;left:auto;margin-top:0;margin-right:.125rem}.dropleft .dropdown-toggle::after{display:inline-block;margin-left:.255em;vertical-align:.255em;content:""}.dropleft .dropdown-toggle::after{display:none}.dropleft .dropdown-toggle::before{display:inline-block;margin-right:.255em;vertical-align:.255em;content:"";border-top:.3em solid transparent;border-right:.3em solid;border-bottom:.3em solid transparent}.dropleft .dropdown-toggle:empty::after{margin-left:0}.dropleft 
.dropdown-toggle::before{vertical-align:0}.dropdown-menu[x-placement^=bottom],.dropdown-menu[x-placement^=left],.dropdown-menu[x-placement^=right],.dropdown-menu[x-placement^=top]{right:auto;bottom:auto}.dropdown-divider{height:0;margin:.5rem 0;overflow:hidden;border-top:1px solid #e9ecef}.dropdown-item{display:block;width:100%;padding:.25rem 1.5rem;clear:both;font-weight:400;color:#212529;text-align:inherit;white-space:nowrap;background-color:transparent;border:0}.dropdown-item:focus,.dropdown-item:hover{color:#16181b;text-decoration:none;background-color:#f8f9fa}.dropdown-item.active,.dropdown-item:active{color:#fff;text-decoration:none;background-color:#007bff}.dropdown-item.disabled,.dropdown-item:disabled{color:#6c757d;pointer-events:none;background-color:transparent}.dropdown-menu.show{display:block}.dropdown-header{display:block;padding:.5rem 1.5rem;margin-bottom:0;font-size:.875rem;color:#6c757d;white-space:nowrap}.dropdown-item-text{display:block;padding:.25rem 1.5rem;color:#212529}.btn-group,.btn-group-vertical{position:relative;display:-ms-inline-flexbox;display:inline-flex;vertical-align:middle}.btn-group-vertical>.btn,.btn-group>.btn{position:relative;-ms-flex:1 1 auto;flex:1 1 auto}.btn-group-vertical>.btn:hover,.btn-group>.btn:hover{z-index:1}.btn-group-vertical>.btn.active,.btn-group-vertical>.btn:active,.btn-group-vertical>.btn:focus,.btn-group>.btn.active,.btn-group>.btn:active,.btn-group>.btn:focus{z-index:1}.btn-toolbar{display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;-ms-flex-pack:start;justify-content:flex-start}.btn-toolbar 
.input-group{width:auto}.btn-group>.btn-group:not(:first-child),.btn-group>.btn:not(:first-child){margin-left:-1px}.btn-group>.btn-group:not(:last-child)>.btn,.btn-group>.btn:not(:last-child):not(.dropdown-toggle){border-top-right-radius:0;border-bottom-right-radius:0}.btn-group>.btn-group:not(:first-child)>.btn,.btn-group>.btn:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.dropdown-toggle-split{padding-right:.5625rem;padding-left:.5625rem}.dropdown-toggle-split::after,.dropright .dropdown-toggle-split::after,.dropup .dropdown-toggle-split::after{margin-left:0}.dropleft .dropdown-toggle-split::before{margin-right:0}.btn-group-sm>.btn+.dropdown-toggle-split,.btn-sm+.dropdown-toggle-split{padding-right:.375rem;padding-left:.375rem}.btn-group-lg>.btn+.dropdown-toggle-split,.btn-lg+.dropdown-toggle-split{padding-right:.75rem;padding-left:.75rem}.btn-group-vertical{-ms-flex-direction:column;flex-direction:column;-ms-flex-align:start;align-items:flex-start;-ms-flex-pack:center;justify-content:center}.btn-group-vertical>.btn,.btn-group-vertical>.btn-group{width:100%}.btn-group-vertical>.btn-group:not(:first-child),.btn-group-vertical>.btn:not(:first-child){margin-top:-1px}.btn-group-vertical>.btn-group:not(:last-child)>.btn,.btn-group-vertical>.btn:not(:last-child):not(.dropdown-toggle){border-bottom-right-radius:0;border-bottom-left-radius:0}.btn-group-vertical>.btn-group:not(:first-child)>.btn,.btn-group-vertical>.btn:not(:first-child){border-top-left-radius:0;border-top-right-radius:0}.btn-group-toggle>.btn,.btn-group-toggle>.btn-group>.btn{margin-bottom:0}.btn-group-toggle>.btn input[type=checkbox],.btn-group-toggle>.btn input[type=radio],.btn-group-toggle>.btn-group>.btn input[type=checkbox],.btn-group-toggle>.btn-group>.btn 
input[type=radio]{position:absolute;clip:rect(0,0,0,0);pointer-events:none}.input-group{position:relative;display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;-ms-flex-align:stretch;align-items:stretch;width:100%}.input-group>.custom-file,.input-group>.custom-select,.input-group>.form-control,.input-group>.form-control-plaintext{position:relative;-ms-flex:1 1 auto;flex:1 1 auto;width:1%;margin-bottom:0}.input-group>.custom-file+.custom-file,.input-group>.custom-file+.custom-select,.input-group>.custom-file+.form-control,.input-group>.custom-select+.custom-file,.input-group>.custom-select+.custom-select,.input-group>.custom-select+.form-control,.input-group>.form-control+.custom-file,.input-group>.form-control+.custom-select,.input-group>.form-control+.form-control,.input-group>.form-control-plaintext+.custom-file,.input-group>.form-control-plaintext+.custom-select,.input-group>.form-control-plaintext+.form-control{margin-left:-1px}.input-group>.custom-file .custom-file-input:focus~.custom-file-label,.input-group>.custom-select:focus,.input-group>.form-control:focus{z-index:3}.input-group>.custom-file .custom-file-input:focus{z-index:4}.input-group>.custom-select:not(:last-child),.input-group>.form-control:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.custom-select:not(:first-child),.input-group>.form-control:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.input-group>.custom-file{display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center}.input-group>.custom-file:not(:last-child) .custom-file-label,.input-group>.custom-file:not(:last-child) .custom-file-label::after{border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.custom-file:not(:first-child) .custom-file-label{border-top-left-radius:0;border-bottom-left-radius:0}.input-group-append,.input-group-prepend{display:-ms-flexbox;display:flex}.input-group-append .btn,.input-group-prepend 
.btn{position:relative;z-index:2}.input-group-append .btn:focus,.input-group-prepend .btn:focus{z-index:3}.input-group-append .btn+.btn,.input-group-append .btn+.input-group-text,.input-group-append .input-group-text+.btn,.input-group-append .input-group-text+.input-group-text,.input-group-prepend .btn+.btn,.input-group-prepend .btn+.input-group-text,.input-group-prepend .input-group-text+.btn,.input-group-prepend .input-group-text+.input-group-text{margin-left:-1px}.input-group-prepend{margin-right:-1px}.input-group-append{margin-left:-1px}.input-group-text{display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center;padding:.375rem .75rem;margin-bottom:0;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;text-align:center;white-space:nowrap;background-color:#e9ecef;border:1px solid #ced4da;border-radius:.25rem}.input-group-text input[type=checkbox],.input-group-text input[type=radio]{margin-top:0}.input-group-lg>.custom-select,.input-group-lg>.form-control:not(textarea){height:calc(1.5em + 1rem + 2px)}.input-group-lg>.custom-select,.input-group-lg>.form-control,.input-group-lg>.input-group-append>.btn,.input-group-lg>.input-group-append>.input-group-text,.input-group-lg>.input-group-prepend>.btn,.input-group-lg>.input-group-prepend>.input-group-text{padding:.5rem 1rem;font-size:1.25rem;line-height:1.5;border-radius:.3rem}.input-group-sm>.custom-select,.input-group-sm>.form-control:not(textarea){height:calc(1.5em + .5rem + 2px)}.input-group-sm>.custom-select,.input-group-sm>.form-control,.input-group-sm>.input-group-append>.btn,.input-group-sm>.input-group-append>.input-group-text,.input-group-sm>.input-group-prepend>.btn,.input-group-sm>.input-group-prepend>.input-group-text{padding:.25rem 
.5rem;font-size:.875rem;line-height:1.5;border-radius:.2rem}.input-group-lg>.custom-select,.input-group-sm>.custom-select{padding-right:1.75rem}.input-group>.input-group-append:last-child>.btn:not(:last-child):not(.dropdown-toggle),.input-group>.input-group-append:last-child>.input-group-text:not(:last-child),.input-group>.input-group-append:not(:last-child)>.btn,.input-group>.input-group-append:not(:last-child)>.input-group-text,.input-group>.input-group-prepend>.btn,.input-group>.input-group-prepend>.input-group-text{border-top-right-radius:0;border-bottom-right-radius:0}.input-group>.input-group-append>.btn,.input-group>.input-group-append>.input-group-text,.input-group>.input-group-prepend:first-child>.btn:not(:first-child),.input-group>.input-group-prepend:first-child>.input-group-text:not(:first-child),.input-group>.input-group-prepend:not(:first-child)>.btn,.input-group>.input-group-prepend:not(:first-child)>.input-group-text{border-top-left-radius:0;border-bottom-left-radius:0}.custom-control{position:relative;display:block;min-height:1.5rem;padding-left:1.5rem}.custom-control-inline{display:-ms-inline-flexbox;display:inline-flex;margin-right:1rem}.custom-control-input{position:absolute;z-index:-1;opacity:0}.custom-control-input:checked~.custom-control-label::before{color:#fff;border-color:#007bff;background-color:#007bff}.custom-control-input:focus~.custom-control-label::before{box-shadow:0 0 0 .2rem 
rgba(0,123,255,.25)}.custom-control-input:focus:not(:checked)~.custom-control-label::before{border-color:#80bdff}.custom-control-input:not(:disabled):active~.custom-control-label::before{color:#fff;background-color:#b3d7ff;border-color:#b3d7ff}.custom-control-input:disabled~.custom-control-label{color:#6c757d}.custom-control-input:disabled~.custom-control-label::before{background-color:#e9ecef}.custom-control-label{position:relative;margin-bottom:0;vertical-align:top}.custom-control-label::before{position:absolute;top:.25rem;left:-1.5rem;display:block;width:1rem;height:1rem;pointer-events:none;content:"";background-color:#fff;border:#adb5bd solid 1px}.custom-control-label::after{position:absolute;top:.25rem;left:-1.5rem;display:block;width:1rem;height:1rem;content:"";background:no-repeat 50%/50% 50%}.custom-checkbox .custom-control-label::before{border-radius:.25rem}.custom-checkbox .custom-control-input:checked~.custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3e%3cpath fill='%23fff' d='M6.564.75l-3.59 3.612-1.538-1.55L0 4.26 2.974 7.25 8 2.193z'/%3e%3c/svg%3e")}.custom-checkbox .custom-control-input:indeterminate~.custom-control-label::before{border-color:#007bff;background-color:#007bff}.custom-checkbox .custom-control-input:indeterminate~.custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 4'%3e%3cpath stroke='%23fff' d='M0 2h4'/%3e%3c/svg%3e")}.custom-checkbox .custom-control-input:disabled:checked~.custom-control-label::before{background-color:rgba(0,123,255,.5)}.custom-checkbox .custom-control-input:disabled:indeterminate~.custom-control-label::before{background-color:rgba(0,123,255,.5)}.custom-radio .custom-control-label::before{border-radius:50%}.custom-radio .custom-control-input:checked~.custom-control-label::after{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' 
viewBox='-4 -4 8 8'%3e%3ccircle r='3' fill='%23fff'/%3e%3c/svg%3e")}.custom-radio .custom-control-input:disabled:checked~.custom-control-label::before{background-color:rgba(0,123,255,.5)}.custom-switch{padding-left:2.25rem}.custom-switch .custom-control-label::before{left:-2.25rem;width:1.75rem;pointer-events:all;border-radius:.5rem}.custom-switch .custom-control-label::after{top:calc(.25rem + 2px);left:calc(-2.25rem + 2px);width:calc(1rem - 4px);height:calc(1rem - 4px);background-color:#adb5bd;border-radius:.5rem;transition:background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out,-webkit-transform .15s ease-in-out;transition:transform .15s ease-in-out,background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out;transition:transform .15s ease-in-out,background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out,-webkit-transform .15s ease-in-out}@media (prefers-reduced-motion:reduce){.custom-switch .custom-control-label::after{transition:none}}.custom-switch .custom-control-input:checked~.custom-control-label::after{background-color:#fff;-webkit-transform:translateX(.75rem);transform:translateX(.75rem)}.custom-switch .custom-control-input:disabled:checked~.custom-control-label::before{background-color:rgba(0,123,255,.5)}.custom-select{display:inline-block;width:100%;height:calc(1.5em + .75rem + 2px);padding:.375rem 1.75rem .375rem .75rem;font-size:1rem;font-weight:400;line-height:1.5;color:#495057;vertical-align:middle;background:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3e%3cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3e%3c/svg%3e") no-repeat right .75rem center/8px 10px;background-color:#fff;border:1px solid #ced4da;border-radius:.25rem;-webkit-appearance:none;-moz-appearance:none;appearance:none}.custom-select:focus{border-color:#80bdff;outline:0;box-shadow:0 0 0 .2rem 
rgba(0,123,255,.25)}.custom-select:focus::-ms-value{color:#495057;background-color:#fff}.custom-select[multiple],.custom-select[size]:not([size="1"]){height:auto;padding-right:.75rem;background-image:none}.custom-select:disabled{color:#6c757d;background-color:#e9ecef}.custom-select::-ms-expand{display:none}.custom-select-sm{height:calc(1.5em + .5rem + 2px);padding-top:.25rem;padding-bottom:.25rem;padding-left:.5rem;font-size:.875rem}.custom-select-lg{height:calc(1.5em + 1rem + 2px);padding-top:.5rem;padding-bottom:.5rem;padding-left:1rem;font-size:1.25rem}.custom-file{position:relative;display:inline-block;width:100%;height:calc(1.5em + .75rem + 2px);margin-bottom:0}.custom-file-input{position:relative;z-index:2;width:100%;height:calc(1.5em + .75rem + 2px);margin:0;opacity:0}.custom-file-input:focus~.custom-file-label{border-color:#80bdff;box-shadow:0 0 0 .2rem rgba(0,123,255,.25)}.custom-file-input:disabled~.custom-file-label{background-color:#e9ecef}.custom-file-input:lang(en)~.custom-file-label::after{content:"Browse"}.custom-file-input~.custom-file-label[data-browse]::after{content:attr(data-browse)}.custom-file-label{position:absolute;top:0;right:0;left:0;z-index:1;height:calc(1.5em + .75rem + 2px);padding:.375rem .75rem;font-weight:400;line-height:1.5;color:#495057;background-color:#fff;border:1px solid #ced4da;border-radius:.25rem}.custom-file-label::after{position:absolute;top:0;right:0;bottom:0;z-index:3;display:block;height:calc(1.5em + .75rem);padding:.375rem .75rem;line-height:1.5;color:#495057;content:"Browse";background-color:#e9ecef;border-left:inherit;border-radius:0 .25rem .25rem 0}.custom-range{width:100%;height:calc(1rem + .4rem);padding:0;background-color:transparent;-webkit-appearance:none;-moz-appearance:none;appearance:none}.custom-range:focus{outline:0}.custom-range:focus::-webkit-slider-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem rgba(0,123,255,.25)}.custom-range:focus::-moz-range-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem 
rgba(0,123,255,.25)}.custom-range:focus::-ms-thumb{box-shadow:0 0 0 1px #fff,0 0 0 .2rem rgba(0,123,255,.25)}.custom-range::-moz-focus-outer{border:0}.custom-range::-webkit-slider-thumb{width:1rem;height:1rem;margin-top:-.25rem;background-color:#007bff;border:0;border-radius:1rem;transition:background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out;-webkit-appearance:none;appearance:none}@media (prefers-reduced-motion:reduce){.custom-range::-webkit-slider-thumb{transition:none}}.custom-range::-webkit-slider-thumb:active{background-color:#b3d7ff}.custom-range::-webkit-slider-runnable-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:#dee2e6;border-color:transparent;border-radius:1rem}.custom-range::-moz-range-thumb{width:1rem;height:1rem;background-color:#007bff;border:0;border-radius:1rem;transition:background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out;-moz-appearance:none;appearance:none}@media (prefers-reduced-motion:reduce){.custom-range::-moz-range-thumb{transition:none}}.custom-range::-moz-range-thumb:active{background-color:#b3d7ff}.custom-range::-moz-range-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:#dee2e6;border-color:transparent;border-radius:1rem}.custom-range::-ms-thumb{width:1rem;height:1rem;margin-top:0;margin-right:.2rem;margin-left:.2rem;background-color:#007bff;border:0;border-radius:1rem;transition:background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out;appearance:none}@media 
(prefers-reduced-motion:reduce){.custom-range::-ms-thumb{transition:none}}.custom-range::-ms-thumb:active{background-color:#b3d7ff}.custom-range::-ms-track{width:100%;height:.5rem;color:transparent;cursor:pointer;background-color:transparent;border-color:transparent;border-width:.5rem}.custom-range::-ms-fill-lower{background-color:#dee2e6;border-radius:1rem}.custom-range::-ms-fill-upper{margin-right:15px;background-color:#dee2e6;border-radius:1rem}.custom-range:disabled::-webkit-slider-thumb{background-color:#adb5bd}.custom-range:disabled::-webkit-slider-runnable-track{cursor:default}.custom-range:disabled::-moz-range-thumb{background-color:#adb5bd}.custom-range:disabled::-moz-range-track{cursor:default}.custom-range:disabled::-ms-thumb{background-color:#adb5bd}.custom-control-label::before,.custom-file-label,.custom-select{transition:background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out}@media (prefers-reduced-motion:reduce){.custom-control-label::before,.custom-file-label,.custom-select{transition:none}}.nav{display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;padding-left:0;margin-bottom:0;list-style:none}.nav-link{display:block;padding:.5rem 1rem}.nav-link:focus,.nav-link:hover{text-decoration:none}.nav-link.disabled{color:#6c757d;pointer-events:none;cursor:default}.nav-tabs{border-bottom:1px solid #dee2e6}.nav-tabs .nav-item{margin-bottom:-1px}.nav-tabs .nav-link{border:1px solid transparent;border-top-left-radius:.25rem;border-top-right-radius:.25rem}.nav-tabs .nav-link:focus,.nav-tabs .nav-link:hover{border-color:#e9ecef #e9ecef #dee2e6}.nav-tabs .nav-link.disabled{color:#6c757d;background-color:transparent;border-color:transparent}.nav-tabs .nav-item.show .nav-link,.nav-tabs .nav-link.active{color:#495057;background-color:#fff;border-color:#dee2e6 #dee2e6 #fff}.nav-tabs .dropdown-menu{margin-top:-1px;border-top-left-radius:0;border-top-right-radius:0}.nav-pills .nav-link{border-radius:.25rem}.nav-pills 
.nav-link.active,.nav-pills .show>.nav-link{color:#fff;background-color:#007bff}.nav-fill .nav-item{-ms-flex:1 1 auto;flex:1 1 auto;text-align:center}.nav-justified .nav-item{-ms-flex-preferred-size:0;flex-basis:0;-ms-flex-positive:1;flex-grow:1;text-align:center}.tab-content>.tab-pane{display:none}.tab-content>.active{display:block}.navbar{position:relative;display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;-ms-flex-align:center;align-items:center;-ms-flex-pack:justify;justify-content:space-between;padding:.5rem 1rem}.navbar>.container,.navbar>.container-fluid{display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;-ms-flex-align:center;align-items:center;-ms-flex-pack:justify;justify-content:space-between}.navbar-brand{display:inline-block;padding-top:.3125rem;padding-bottom:.3125rem;margin-right:1rem;font-size:1.25rem;line-height:inherit;white-space:nowrap}.navbar-brand:focus,.navbar-brand:hover{text-decoration:none}.navbar-nav{display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column;padding-left:0;margin-bottom:0;list-style:none}.navbar-nav .nav-link{padding-right:0;padding-left:0}.navbar-nav .dropdown-menu{position:static;float:none}.navbar-text{display:inline-block;padding-top:.5rem;padding-bottom:.5rem}.navbar-collapse{-ms-flex-preferred-size:100%;flex-basis:100%;-ms-flex-positive:1;flex-grow:1;-ms-flex-align:center;align-items:center}.navbar-toggler{padding:.25rem .75rem;font-size:1.25rem;line-height:1;background-color:transparent;border:1px solid transparent;border-radius:.25rem}.navbar-toggler:focus,.navbar-toggler:hover{text-decoration:none}.navbar-toggler-icon{display:inline-block;width:1.5em;height:1.5em;vertical-align:middle;content:"";background:no-repeat center center;background-size:100% 100%}@media (max-width:575.98px){.navbar-expand-sm>.container,.navbar-expand-sm>.container-fluid{padding-right:0;padding-left:0}}@media (min-width:576px){.navbar-expand-sm{-ms-flex-flow:row nowrap;flex-flow:row 
nowrap;-ms-flex-pack:start;justify-content:flex-start}.navbar-expand-sm .navbar-nav{-ms-flex-direction:row;flex-direction:row}.navbar-expand-sm .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-sm .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-sm>.container,.navbar-expand-sm>.container-fluid{-ms-flex-wrap:nowrap;flex-wrap:nowrap}.navbar-expand-sm .navbar-collapse{display:-ms-flexbox!important;display:flex!important;-ms-flex-preferred-size:auto;flex-basis:auto}.navbar-expand-sm .navbar-toggler{display:none}}@media (max-width:767.98px){.navbar-expand-md>.container,.navbar-expand-md>.container-fluid{padding-right:0;padding-left:0}}@media (min-width:768px){.navbar-expand-md{-ms-flex-flow:row nowrap;flex-flow:row nowrap;-ms-flex-pack:start;justify-content:flex-start}.navbar-expand-md .navbar-nav{-ms-flex-direction:row;flex-direction:row}.navbar-expand-md .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-md .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-md>.container,.navbar-expand-md>.container-fluid{-ms-flex-wrap:nowrap;flex-wrap:nowrap}.navbar-expand-md .navbar-collapse{display:-ms-flexbox!important;display:flex!important;-ms-flex-preferred-size:auto;flex-basis:auto}.navbar-expand-md .navbar-toggler{display:none}}@media (max-width:991.98px){.navbar-expand-lg>.container,.navbar-expand-lg>.container-fluid{padding-right:0;padding-left:0}}@media (min-width:992px){.navbar-expand-lg{-ms-flex-flow:row nowrap;flex-flow:row nowrap;-ms-flex-pack:start;justify-content:flex-start}.navbar-expand-lg .navbar-nav{-ms-flex-direction:row;flex-direction:row}.navbar-expand-lg .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-lg .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-lg>.container,.navbar-expand-lg>.container-fluid{-ms-flex-wrap:nowrap;flex-wrap:nowrap}.navbar-expand-lg 
.navbar-collapse{display:-ms-flexbox!important;display:flex!important;-ms-flex-preferred-size:auto;flex-basis:auto}.navbar-expand-lg .navbar-toggler{display:none}}@media (max-width:1199.98px){.navbar-expand-xl>.container,.navbar-expand-xl>.container-fluid{padding-right:0;padding-left:0}}@media (min-width:1200px){.navbar-expand-xl{-ms-flex-flow:row nowrap;flex-flow:row nowrap;-ms-flex-pack:start;justify-content:flex-start}.navbar-expand-xl .navbar-nav{-ms-flex-direction:row;flex-direction:row}.navbar-expand-xl .navbar-nav .dropdown-menu{position:absolute}.navbar-expand-xl .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand-xl>.container,.navbar-expand-xl>.container-fluid{-ms-flex-wrap:nowrap;flex-wrap:nowrap}.navbar-expand-xl .navbar-collapse{display:-ms-flexbox!important;display:flex!important;-ms-flex-preferred-size:auto;flex-basis:auto}.navbar-expand-xl .navbar-toggler{display:none}}.navbar-expand{-ms-flex-flow:row nowrap;flex-flow:row nowrap;-ms-flex-pack:start;justify-content:flex-start}.navbar-expand>.container,.navbar-expand>.container-fluid{padding-right:0;padding-left:0}.navbar-expand .navbar-nav{-ms-flex-direction:row;flex-direction:row}.navbar-expand .navbar-nav .dropdown-menu{position:absolute}.navbar-expand .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-expand>.container,.navbar-expand>.container-fluid{-ms-flex-wrap:nowrap;flex-wrap:nowrap}.navbar-expand .navbar-collapse{display:-ms-flexbox!important;display:flex!important;-ms-flex-preferred-size:auto;flex-basis:auto}.navbar-expand .navbar-toggler{display:none}.navbar-light .navbar-brand{color:rgba(0,0,0,.9)}.navbar-light .navbar-brand:focus,.navbar-light .navbar-brand:hover{color:rgba(0,0,0,.9)}.navbar-light .navbar-nav .nav-link{color:rgba(0,0,0,.5)}.navbar-light .navbar-nav .nav-link:focus,.navbar-light .navbar-nav .nav-link:hover{color:rgba(0,0,0,.7)}.navbar-light .navbar-nav .nav-link.disabled{color:rgba(0,0,0,.3)}.navbar-light .navbar-nav 
.active>.nav-link,.navbar-light .navbar-nav .nav-link.active,.navbar-light .navbar-nav .nav-link.show,.navbar-light .navbar-nav .show>.nav-link{color:rgba(0,0,0,.9)}.navbar-light .navbar-toggler{color:rgba(0,0,0,.5);border-color:rgba(0,0,0,.1)}.navbar-light .navbar-toggler-icon{background-image:url("data:image/svg+xml,%3csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3e%3cpath stroke='rgba(0, 0, 0, 0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3e%3c/svg%3e")}.navbar-light .navbar-text{color:rgba(0,0,0,.5)}.navbar-light .navbar-text a{color:rgba(0,0,0,.9)}.navbar-light .navbar-text a:focus,.navbar-light .navbar-text a:hover{color:rgba(0,0,0,.9)}.navbar-dark .navbar-brand{color:#fff}.navbar-dark .navbar-brand:focus,.navbar-dark .navbar-brand:hover{color:#fff}.navbar-dark .navbar-nav .nav-link{color:rgba(255,255,255,.5)}.navbar-dark .navbar-nav .nav-link:focus,.navbar-dark .navbar-nav .nav-link:hover{color:rgba(255,255,255,.75)}.navbar-dark .navbar-nav .nav-link.disabled{color:rgba(255,255,255,.25)}.navbar-dark .navbar-nav .active>.nav-link,.navbar-dark .navbar-nav .nav-link.active,.navbar-dark .navbar-nav .nav-link.show,.navbar-dark .navbar-nav .show>.nav-link{color:#fff}.navbar-dark .navbar-toggler{color:rgba(255,255,255,.5);border-color:rgba(255,255,255,.1)}.navbar-dark .navbar-toggler-icon{background-image:url("data:image/svg+xml,%3csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3e%3cpath stroke='rgba(255, 255, 255, 0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3e%3c/svg%3e")}.navbar-dark .navbar-text{color:rgba(255,255,255,.5)}.navbar-dark .navbar-text a{color:#fff}.navbar-dark .navbar-text a:focus,.navbar-dark .navbar-text a:hover{color:#fff}.card{position:relative;display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column;min-width:0;word-wrap:break-word;background-color:#fff;background-clip:border-box;border:1px 
solid rgba(0,0,0,.125);border-radius:.25rem}.card>hr{margin-right:0;margin-left:0}.card>.list-group:first-child .list-group-item:first-child{border-top-left-radius:.25rem;border-top-right-radius:.25rem}.card>.list-group:last-child .list-group-item:last-child{border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.card-body{-ms-flex:1 1 auto;flex:1 1 auto;padding:1.25rem}.card-title{margin-bottom:.75rem}.card-subtitle{margin-top:-.375rem;margin-bottom:0}.card-text:last-child{margin-bottom:0}.card-link:hover{text-decoration:none}.card-link+.card-link{margin-left:1.25rem}.card-header{padding:.75rem 1.25rem;margin-bottom:0;background-color:rgba(0,0,0,.03);border-bottom:1px solid rgba(0,0,0,.125)}.card-header:first-child{border-radius:calc(.25rem - 1px) calc(.25rem - 1px) 0 0}.card-header+.list-group .list-group-item:first-child{border-top:0}.card-footer{padding:.75rem 1.25rem;background-color:rgba(0,0,0,.03);border-top:1px solid rgba(0,0,0,.125)}.card-footer:last-child{border-radius:0 0 calc(.25rem - 1px) calc(.25rem - 1px)}.card-header-tabs{margin-right:-.625rem;margin-bottom:-.75rem;margin-left:-.625rem;border-bottom:0}.card-header-pills{margin-right:-.625rem;margin-left:-.625rem}.card-img-overlay{position:absolute;top:0;right:0;bottom:0;left:0;padding:1.25rem}.card-img{width:100%;border-radius:calc(.25rem - 1px)}.card-img-top{width:100%;border-top-left-radius:calc(.25rem - 1px);border-top-right-radius:calc(.25rem - 1px)}.card-img-bottom{width:100%;border-bottom-right-radius:calc(.25rem - 1px);border-bottom-left-radius:calc(.25rem - 1px)}.card-deck{display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column}.card-deck .card{margin-bottom:15px}@media (min-width:576px){.card-deck{-ms-flex-flow:row wrap;flex-flow:row wrap;margin-right:-15px;margin-left:-15px}.card-deck .card{display:-ms-flexbox;display:flex;-ms-flex:1 0 0%;flex:1 0 
0%;-ms-flex-direction:column;flex-direction:column;margin-right:15px;margin-bottom:0;margin-left:15px}}.card-group{display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column}.card-group>.card{margin-bottom:15px}@media (min-width:576px){.card-group{-ms-flex-flow:row wrap;flex-flow:row wrap}.card-group>.card{-ms-flex:1 0 0%;flex:1 0 0%;margin-bottom:0}.card-group>.card+.card{margin-left:0;border-left:0}.card-group>.card:not(:last-child){border-top-right-radius:0;border-bottom-right-radius:0}.card-group>.card:not(:last-child) .card-header,.card-group>.card:not(:last-child) .card-img-top{border-top-right-radius:0}.card-group>.card:not(:last-child) .card-footer,.card-group>.card:not(:last-child) .card-img-bottom{border-bottom-right-radius:0}.card-group>.card:not(:first-child){border-top-left-radius:0;border-bottom-left-radius:0}.card-group>.card:not(:first-child) .card-header,.card-group>.card:not(:first-child) .card-img-top{border-top-left-radius:0}.card-group>.card:not(:first-child) .card-footer,.card-group>.card:not(:first-child) .card-img-bottom{border-bottom-left-radius:0}}.card-columns .card{margin-bottom:.75rem}@media (min-width:576px){.card-columns{-webkit-column-count:3;-moz-column-count:3;column-count:3;-webkit-column-gap:1.25rem;-moz-column-gap:1.25rem;column-gap:1.25rem;orphans:1;widows:1}.card-columns .card{display:inline-block;width:100%}}.accordion>.card{overflow:hidden}.accordion>.card:not(:first-of-type) .card-header:first-child{border-radius:0}.accordion>.card:not(:first-of-type):not(:last-of-type){border-bottom:0;border-radius:0}.accordion>.card:first-of-type{border-bottom:0;border-bottom-right-radius:0;border-bottom-left-radius:0}.accordion>.card:last-of-type{border-top-left-radius:0;border-top-right-radius:0}.accordion>.card .card-header{margin-bottom:-1px}.breadcrumb{display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;padding:.75rem 
1rem;margin-bottom:1rem;list-style:none;background-color:#e9ecef;border-radius:.25rem}.breadcrumb-item+.breadcrumb-item{padding-left:.5rem}.breadcrumb-item+.breadcrumb-item::before{display:inline-block;padding-right:.5rem;color:#6c757d;content:"/"}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:underline}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:none}.breadcrumb-item.active{color:#6c757d}.pagination{display:-ms-flexbox;display:flex;padding-left:0;list-style:none;border-radius:.25rem}.page-link{position:relative;display:block;padding:.5rem .75rem;margin-left:-1px;line-height:1.25;color:#007bff;background-color:#fff;border:1px solid #dee2e6}.page-link:hover{z-index:2;color:#0056b3;text-decoration:none;background-color:#e9ecef;border-color:#dee2e6}.page-link:focus{z-index:2;outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,.25)}.page-item:first-child .page-link{margin-left:0;border-top-left-radius:.25rem;border-bottom-left-radius:.25rem}.page-item:last-child .page-link{border-top-right-radius:.25rem;border-bottom-right-radius:.25rem}.page-item.active .page-link{z-index:1;color:#fff;background-color:#007bff;border-color:#007bff}.page-item.disabled .page-link{color:#6c757d;pointer-events:none;cursor:auto;background-color:#fff;border-color:#dee2e6}.pagination-lg .page-link{padding:.75rem 1.5rem;font-size:1.25rem;line-height:1.5}.pagination-lg .page-item:first-child .page-link{border-top-left-radius:.3rem;border-bottom-left-radius:.3rem}.pagination-lg .page-item:last-child .page-link{border-top-right-radius:.3rem;border-bottom-right-radius:.3rem}.pagination-sm .page-link{padding:.25rem .5rem;font-size:.875rem;line-height:1.5}.pagination-sm .page-item:first-child .page-link{border-top-left-radius:.2rem;border-bottom-left-radius:.2rem}.pagination-sm .page-item:last-child .page-link{border-top-right-radius:.2rem;border-bottom-right-radius:.2rem}.badge{display:inline-block;padding:.25em 
.4em;font-size:75%;font-weight:700;line-height:1;text-align:center;white-space:nowrap;vertical-align:baseline;border-radius:.25rem;transition:color .15s ease-in-out,background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out}@media (prefers-reduced-motion:reduce){.badge{transition:none}}a.badge:focus,a.badge:hover{text-decoration:none}.badge:empty{display:none}.btn .badge{position:relative;top:-1px}.badge-pill{padding-right:.6em;padding-left:.6em;border-radius:10rem}.badge-primary{color:#fff;background-color:#007bff}a.badge-primary:focus,a.badge-primary:hover{color:#fff;background-color:#0062cc}a.badge-primary.focus,a.badge-primary:focus{outline:0;box-shadow:0 0 0 .2rem rgba(0,123,255,.5)}.badge-secondary{color:#fff;background-color:#6c757d}a.badge-secondary:focus,a.badge-secondary:hover{color:#fff;background-color:#545b62}a.badge-secondary.focus,a.badge-secondary:focus{outline:0;box-shadow:0 0 0 .2rem rgba(108,117,125,.5)}.badge-success{color:#fff;background-color:#28a745}a.badge-success:focus,a.badge-success:hover{color:#fff;background-color:#1e7e34}a.badge-success.focus,a.badge-success:focus{outline:0;box-shadow:0 0 0 .2rem rgba(40,167,69,.5)}.badge-info{color:#fff;background-color:#17a2b8}a.badge-info:focus,a.badge-info:hover{color:#fff;background-color:#117a8b}a.badge-info.focus,a.badge-info:focus{outline:0;box-shadow:0 0 0 .2rem rgba(23,162,184,.5)}.badge-warning{color:#212529;background-color:#ffc107}a.badge-warning:focus,a.badge-warning:hover{color:#212529;background-color:#d39e00}a.badge-warning.focus,a.badge-warning:focus{outline:0;box-shadow:0 0 0 .2rem rgba(255,193,7,.5)}.badge-danger{color:#fff;background-color:#dc3545}a.badge-danger:focus,a.badge-danger:hover{color:#fff;background-color:#bd2130}a.badge-danger.focus,a.badge-danger:focus{outline:0;box-shadow:0 0 0 .2rem 
rgba(220,53,69,.5)}.badge-light{color:#212529;background-color:#f8f9fa}a.badge-light:focus,a.badge-light:hover{color:#212529;background-color:#dae0e5}a.badge-light.focus,a.badge-light:focus{outline:0;box-shadow:0 0 0 .2rem rgba(248,249,250,.5)}.badge-dark{color:#fff;background-color:#343a40}a.badge-dark:focus,a.badge-dark:hover{color:#fff;background-color:#1d2124}a.badge-dark.focus,a.badge-dark:focus{outline:0;box-shadow:0 0 0 .2rem rgba(52,58,64,.5)}.jumbotron{padding:2rem 1rem;margin-bottom:2rem;background-color:#e9ecef;border-radius:.3rem}@media (min-width:576px){.jumbotron{padding:4rem 2rem}}.jumbotron-fluid{padding-right:0;padding-left:0;border-radius:0}.alert{position:relative;padding:.75rem 1.25rem;margin-bottom:1rem;border:1px solid transparent;border-radius:.25rem}.alert-heading{color:inherit}.alert-link{font-weight:700}.alert-dismissible{padding-right:4rem}.alert-dismissible .close{position:absolute;top:0;right:0;padding:.75rem 1.25rem;color:inherit}.alert-primary{color:#004085;background-color:#cce5ff;border-color:#b8daff}.alert-primary hr{border-top-color:#9fcdff}.alert-primary .alert-link{color:#002752}.alert-secondary{color:#383d41;background-color:#e2e3e5;border-color:#d6d8db}.alert-secondary hr{border-top-color:#c8cbcf}.alert-secondary .alert-link{color:#202326}.alert-success{color:#155724;background-color:#d4edda;border-color:#c3e6cb}.alert-success hr{border-top-color:#b1dfbb}.alert-success .alert-link{color:#0b2e13}.alert-info{color:#0c5460;background-color:#d1ecf1;border-color:#bee5eb}.alert-info hr{border-top-color:#abdde5}.alert-info .alert-link{color:#062c33}.alert-warning{color:#856404;background-color:#fff3cd;border-color:#ffeeba}.alert-warning hr{border-top-color:#ffe8a1}.alert-warning .alert-link{color:#533f03}.alert-danger{color:#721c24;background-color:#f8d7da;border-color:#f5c6cb}.alert-danger hr{border-top-color:#f1b0b7}.alert-danger 
.alert-link{color:#491217}.alert-light{color:#818182;background-color:#fefefe;border-color:#fdfdfe}.alert-light hr{border-top-color:#ececf6}.alert-light .alert-link{color:#686868}.alert-dark{color:#1b1e21;background-color:#d6d8d9;border-color:#c6c8ca}.alert-dark hr{border-top-color:#b9bbbe}.alert-dark .alert-link{color:#040505}@-webkit-keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}@keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}.progress{display:-ms-flexbox;display:flex;height:1rem;overflow:hidden;font-size:.75rem;background-color:#e9ecef;border-radius:.25rem}.progress-bar{display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column;-ms-flex-pack:center;justify-content:center;color:#fff;text-align:center;white-space:nowrap;background-color:#007bff;transition:width .6s ease}@media (prefers-reduced-motion:reduce){.progress-bar{transition:none}}.progress-bar-striped{background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-size:1rem 1rem}.progress-bar-animated{-webkit-animation:progress-bar-stripes 1s linear infinite;animation:progress-bar-stripes 1s linear infinite}@media (prefers-reduced-motion:reduce){.progress-bar-animated{-webkit-animation:none;animation:none}}.media{display:-ms-flexbox;display:flex;-ms-flex-align:start;align-items:flex-start}.media-body{-ms-flex:1;flex:1}.list-group{display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column;padding-left:0;margin-bottom:0}.list-group-item-action{width:100%;color:#495057;text-align:inherit}.list-group-item-action:focus,.list-group-item-action:hover{z-index:1;color:#495057;text-decoration:none;background-color:#f8f9fa}.list-group-item-action:active{color:#212529;background-color:#e9ecef}.list-group-item{position:relative;display:block;padding:.75rem 
1.25rem;margin-bottom:-1px;background-color:#fff;border:1px solid rgba(0,0,0,.125)}.list-group-item:first-child{border-top-left-radius:.25rem;border-top-right-radius:.25rem}.list-group-item:last-child{margin-bottom:0;border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.list-group-item.disabled,.list-group-item:disabled{color:#6c757d;pointer-events:none;background-color:#fff}.list-group-item.active{z-index:2;color:#fff;background-color:#007bff;border-color:#007bff}.list-group-horizontal{-ms-flex-direction:row;flex-direction:row}.list-group-horizontal .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}@media (min-width:576px){.list-group-horizontal-sm{-ms-flex-direction:row;flex-direction:row}.list-group-horizontal-sm .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-sm .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-sm .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width:768px){.list-group-horizontal-md{-ms-flex-direction:row;flex-direction:row}.list-group-horizontal-md .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-md .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-md .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width:992px){.list-group-horizontal-lg{-ms-flex-direction:row;flex-direction:row}.list-group-horizontal-lg 
.list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-lg .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-lg .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}@media (min-width:1200px){.list-group-horizontal-xl{-ms-flex-direction:row;flex-direction:row}.list-group-horizontal-xl .list-group-item{margin-right:-1px;margin-bottom:0}.list-group-horizontal-xl .list-group-item:first-child{border-top-left-radius:.25rem;border-bottom-left-radius:.25rem;border-top-right-radius:0}.list-group-horizontal-xl .list-group-item:last-child{margin-right:0;border-top-right-radius:.25rem;border-bottom-right-radius:.25rem;border-bottom-left-radius:0}}.list-group-flush .list-group-item{border-right:0;border-left:0;border-radius:0}.list-group-flush .list-group-item:last-child{margin-bottom:-1px}.list-group-flush:first-child .list-group-item:first-child{border-top:0}.list-group-flush:last-child 
.list-group-item:last-child{margin-bottom:0;border-bottom:0}.list-group-item-primary{color:#004085;background-color:#b8daff}.list-group-item-primary.list-group-item-action:focus,.list-group-item-primary.list-group-item-action:hover{color:#004085;background-color:#9fcdff}.list-group-item-primary.list-group-item-action.active{color:#fff;background-color:#004085;border-color:#004085}.list-group-item-secondary{color:#383d41;background-color:#d6d8db}.list-group-item-secondary.list-group-item-action:focus,.list-group-item-secondary.list-group-item-action:hover{color:#383d41;background-color:#c8cbcf}.list-group-item-secondary.list-group-item-action.active{color:#fff;background-color:#383d41;border-color:#383d41}.list-group-item-success{color:#155724;background-color:#c3e6cb}.list-group-item-success.list-group-item-action:focus,.list-group-item-success.list-group-item-action:hover{color:#155724;background-color:#b1dfbb}.list-group-item-success.list-group-item-action.active{color:#fff;background-color:#155724;border-color:#155724}.list-group-item-info{color:#0c5460;background-color:#bee5eb}.list-group-item-info.list-group-item-action:focus,.list-group-item-info.list-group-item-action:hover{color:#0c5460;background-color:#abdde5}.list-group-item-info.list-group-item-action.active{color:#fff;background-color:#0c5460;border-color:#0c5460}.list-group-item-warning{color:#856404;background-color:#ffeeba}.list-group-item-warning.list-group-item-action:focus,.list-group-item-warning.list-group-item-action:hover{color:#856404;background-color:#ffe8a1}.list-group-item-warning.list-group-item-action.active{color:#fff;background-color:#856404;border-color:#856404}.list-group-item-danger{color:#721c24;background-color:#f5c6cb}.list-group-item-danger.list-group-item-action:focus,.list-group-item-danger.list-group-item-action:hover{color:#721c24;background-color:#f1b0b7}.list-group-item-danger.list-group-item-action.active{color:#fff;background-color:#721c24;border-color:#721c24}.list-grou
p-item-light{color:#818182;background-color:#fdfdfe}.list-group-item-light.list-group-item-action:focus,.list-group-item-light.list-group-item-action:hover{color:#818182;background-color:#ececf6}.list-group-item-light.list-group-item-action.active{color:#fff;background-color:#818182;border-color:#818182}.list-group-item-dark{color:#1b1e21;background-color:#c6c8ca}.list-group-item-dark.list-group-item-action:focus,.list-group-item-dark.list-group-item-action:hover{color:#1b1e21;background-color:#b9bbbe}.list-group-item-dark.list-group-item-action.active{color:#fff;background-color:#1b1e21;border-color:#1b1e21}.close{float:right;font-size:1.5rem;font-weight:700;line-height:1;color:#000;text-shadow:0 1px 0 #fff;opacity:.5}.close:hover{color:#000;text-decoration:none}.close:not(:disabled):not(.disabled):focus,.close:not(:disabled):not(.disabled):hover{opacity:.75}button.close{padding:0;background-color:transparent;border:0;-webkit-appearance:none;-moz-appearance:none;appearance:none}a.close.disabled{pointer-events:none}.toast{max-width:350px;overflow:hidden;font-size:.875rem;background-color:rgba(255,255,255,.85);background-clip:padding-box;border:1px solid rgba(0,0,0,.1);box-shadow:0 .25rem .75rem rgba(0,0,0,.1);-webkit-backdrop-filter:blur(10px);backdrop-filter:blur(10px);opacity:0;border-radius:.25rem}.toast:not(:last-child){margin-bottom:.75rem}.toast.showing{opacity:1}.toast.show{display:block;opacity:1}.toast.hide{display:none}.toast-header{display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center;padding:.25rem .75rem;color:#6c757d;background-color:rgba(255,255,255,.85);background-clip:padding-box;border-bottom:1px solid rgba(0,0,0,.05)}.toast-body{padding:.75rem}.modal-open{overflow:hidden}.modal-open .modal{overflow-x:hidden;overflow-y:auto}.modal{position:fixed;top:0;left:0;z-index:1050;display:none;width:100%;height:100%;overflow:hidden;outline:0}.modal-dialog{position:relative;width:auto;margin:.5rem;pointer-events:none}.modal.fade 
.modal-dialog{transition:-webkit-transform .3s ease-out;transition:transform .3s ease-out;transition:transform .3s ease-out,-webkit-transform .3s ease-out;-webkit-transform:translate(0,-50px);transform:translate(0,-50px)}@media (prefers-reduced-motion:reduce){.modal.fade .modal-dialog{transition:none}}.modal.show .modal-dialog{-webkit-transform:none;transform:none}.modal-dialog-scrollable{display:-ms-flexbox;display:flex;max-height:calc(100% - 1rem)}.modal-dialog-scrollable .modal-content{max-height:calc(100vh - 1rem);overflow:hidden}.modal-dialog-scrollable .modal-footer,.modal-dialog-scrollable .modal-header{-ms-flex-negative:0;flex-shrink:0}.modal-dialog-scrollable .modal-body{overflow-y:auto}.modal-dialog-centered{display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center;min-height:calc(100% - 1rem)}.modal-dialog-centered::before{display:block;height:calc(100vh - 1rem);content:""}.modal-dialog-centered.modal-dialog-scrollable{-ms-flex-direction:column;flex-direction:column;-ms-flex-pack:center;justify-content:center;height:100%}.modal-dialog-centered.modal-dialog-scrollable .modal-content{max-height:none}.modal-dialog-centered.modal-dialog-scrollable::before{content:none}.modal-content{position:relative;display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column;width:100%;pointer-events:auto;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,.2);border-radius:.3rem;outline:0}.modal-backdrop{position:fixed;top:0;left:0;z-index:1040;width:100vw;height:100vh;background-color:#000}.modal-backdrop.fade{opacity:0}.modal-backdrop.show{opacity:.5}.modal-header{display:-ms-flexbox;display:flex;-ms-flex-align:start;align-items:flex-start;-ms-flex-pack:justify;justify-content:space-between;padding:1rem 1rem;border-bottom:1px solid #dee2e6;border-top-left-radius:.3rem;border-top-right-radius:.3rem}.modal-header .close{padding:1rem 1rem;margin:-1rem -1rem -1rem 
auto}.modal-title{margin-bottom:0;line-height:1.5}.modal-body{position:relative;-ms-flex:1 1 auto;flex:1 1 auto;padding:1rem}.modal-footer{display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center;-ms-flex-pack:end;justify-content:flex-end;padding:1rem;border-top:1px solid #dee2e6;border-bottom-right-radius:.3rem;border-bottom-left-radius:.3rem}.modal-footer>:not(:first-child){margin-left:.25rem}.modal-footer>:not(:last-child){margin-right:.25rem}.modal-scrollbar-measure{position:absolute;top:-9999px;width:50px;height:50px;overflow:scroll}@media (min-width:576px){.modal-dialog{max-width:500px;margin:1.75rem auto}.modal-dialog-scrollable{max-height:calc(100% - 3.5rem)}.modal-dialog-scrollable .modal-content{max-height:calc(100vh - 3.5rem)}.modal-dialog-centered{min-height:calc(100% - 3.5rem)}.modal-dialog-centered::before{height:calc(100vh - 3.5rem)}.modal-sm{max-width:300px}}@media (min-width:992px){.modal-lg,.modal-xl{max-width:800px}}@media (min-width:1200px){.modal-xl{max-width:1140px}}.tooltip{position:absolute;z-index:1070;display:block;margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-style:normal;font-weight:400;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;letter-spacing:normal;word-break:normal;word-spacing:normal;white-space:normal;line-break:auto;font-size:.875rem;word-wrap:break-word;opacity:0}.tooltip.show{opacity:.9}.tooltip .arrow{position:absolute;display:block;width:.8rem;height:.4rem}.tooltip .arrow::before{position:absolute;content:"";border-color:transparent;border-style:solid}.bs-tooltip-auto[x-placement^=top],.bs-tooltip-top{padding:.4rem 0}.bs-tooltip-auto[x-placement^=top] .arrow,.bs-tooltip-top .arrow{bottom:0}.bs-tooltip-auto[x-placement^=top] .arrow::before,.bs-tooltip-top .arrow::before{top:0;border-width:.4rem .4rem 
0;border-top-color:#000}.bs-tooltip-auto[x-placement^=right],.bs-tooltip-right{padding:0 .4rem}.bs-tooltip-auto[x-placement^=right] .arrow,.bs-tooltip-right .arrow{left:0;width:.4rem;height:.8rem}.bs-tooltip-auto[x-placement^=right] .arrow::before,.bs-tooltip-right .arrow::before{right:0;border-width:.4rem .4rem .4rem 0;border-right-color:#000}.bs-tooltip-auto[x-placement^=bottom],.bs-tooltip-bottom{padding:.4rem 0}.bs-tooltip-auto[x-placement^=bottom] .arrow,.bs-tooltip-bottom .arrow{top:0}.bs-tooltip-auto[x-placement^=bottom] .arrow::before,.bs-tooltip-bottom .arrow::before{bottom:0;border-width:0 .4rem .4rem;border-bottom-color:#000}.bs-tooltip-auto[x-placement^=left],.bs-tooltip-left{padding:0 .4rem}.bs-tooltip-auto[x-placement^=left] .arrow,.bs-tooltip-left .arrow{right:0;width:.4rem;height:.8rem}.bs-tooltip-auto[x-placement^=left] .arrow::before,.bs-tooltip-left .arrow::before{left:0;border-width:.4rem 0 .4rem .4rem;border-left-color:#000}.tooltip-inner{max-width:200px;padding:.25rem .5rem;color:#fff;text-align:center;background-color:#000;border-radius:.25rem}.popover{position:absolute;top:0;left:0;z-index:1060;display:block;max-width:276px;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-style:normal;font-weight:400;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;letter-spacing:normal;word-break:normal;word-spacing:normal;white-space:normal;line-break:auto;font-size:.875rem;word-wrap:break-word;background-color:#fff;background-clip:padding-box;border:1px solid rgba(0,0,0,.2);border-radius:.3rem}.popover .arrow{position:absolute;display:block;width:1rem;height:.5rem;margin:0 .3rem}.popover .arrow::after,.popover 
.arrow::before{position:absolute;display:block;content:"";border-color:transparent;border-style:solid}.bs-popover-auto[x-placement^=top],.bs-popover-top{margin-bottom:.5rem}.bs-popover-auto[x-placement^=top]>.arrow,.bs-popover-top>.arrow{bottom:calc((.5rem + 1px) * -1)}.bs-popover-auto[x-placement^=top]>.arrow::before,.bs-popover-top>.arrow::before{bottom:0;border-width:.5rem .5rem 0;border-top-color:rgba(0,0,0,.25)}.bs-popover-auto[x-placement^=top]>.arrow::after,.bs-popover-top>.arrow::after{bottom:1px;border-width:.5rem .5rem 0;border-top-color:#fff}.bs-popover-auto[x-placement^=right],.bs-popover-right{margin-left:.5rem}.bs-popover-auto[x-placement^=right]>.arrow,.bs-popover-right>.arrow{left:calc((.5rem + 1px) * -1);width:.5rem;height:1rem;margin:.3rem 0}.bs-popover-auto[x-placement^=right]>.arrow::before,.bs-popover-right>.arrow::before{left:0;border-width:.5rem .5rem .5rem 0;border-right-color:rgba(0,0,0,.25)}.bs-popover-auto[x-placement^=right]>.arrow::after,.bs-popover-right>.arrow::after{left:1px;border-width:.5rem .5rem .5rem 0;border-right-color:#fff}.bs-popover-auto[x-placement^=bottom],.bs-popover-bottom{margin-top:.5rem}.bs-popover-auto[x-placement^=bottom]>.arrow,.bs-popover-bottom>.arrow{top:calc((.5rem + 1px) * -1)}.bs-popover-auto[x-placement^=bottom]>.arrow::before,.bs-popover-bottom>.arrow::before{top:0;border-width:0 .5rem .5rem .5rem;border-bottom-color:rgba(0,0,0,.25)}.bs-popover-auto[x-placement^=bottom]>.arrow::after,.bs-popover-bottom>.arrow::after{top:1px;border-width:0 .5rem .5rem .5rem;border-bottom-color:#fff}.bs-popover-auto[x-placement^=bottom] .popover-header::before,.bs-popover-bottom .popover-header::before{position:absolute;top:0;left:50%;display:block;width:1rem;margin-left:-.5rem;content:"";border-bottom:1px solid #f7f7f7}.bs-popover-auto[x-placement^=left],.bs-popover-left{margin-right:.5rem}.bs-popover-auto[x-placement^=left]>.arrow,.bs-popover-left>.arrow{right:calc((.5rem + 1px) * -1);width:.5rem;height:1rem;margin:.3rem 
0}.bs-popover-auto[x-placement^=left]>.arrow::before,.bs-popover-left>.arrow::before{right:0;border-width:.5rem 0 .5rem .5rem;border-left-color:rgba(0,0,0,.25)}.bs-popover-auto[x-placement^=left]>.arrow::after,.bs-popover-left>.arrow::after{right:1px;border-width:.5rem 0 .5rem .5rem;border-left-color:#fff}.popover-header{padding:.5rem .75rem;margin-bottom:0;font-size:1rem;background-color:#f7f7f7;border-bottom:1px solid #ebebeb;border-top-left-radius:calc(.3rem - 1px);border-top-right-radius:calc(.3rem - 1px)}.popover-header:empty{display:none}.popover-body{padding:.5rem .75rem;color:#212529}.carousel{position:relative}.carousel.pointer-event{-ms-touch-action:pan-y;touch-action:pan-y}.carousel-inner{position:relative;width:100%;overflow:hidden}.carousel-inner::after{display:block;clear:both;content:""}.carousel-item{position:relative;display:none;float:left;width:100%;margin-right:-100%;-webkit-backface-visibility:hidden;backface-visibility:hidden;transition:-webkit-transform .6s ease-in-out;transition:transform .6s ease-in-out;transition:transform .6s ease-in-out,-webkit-transform .6s ease-in-out}@media (prefers-reduced-motion:reduce){.carousel-item{transition:none}}.carousel-item-next,.carousel-item-prev,.carousel-item.active{display:block}.active.carousel-item-right,.carousel-item-next:not(.carousel-item-left){-webkit-transform:translateX(100%);transform:translateX(100%)}.active.carousel-item-left,.carousel-item-prev:not(.carousel-item-right){-webkit-transform:translateX(-100%);transform:translateX(-100%)}.carousel-fade .carousel-item{opacity:0;transition-property:opacity;-webkit-transform:none;transform:none}.carousel-fade .carousel-item-next.carousel-item-left,.carousel-fade .carousel-item-prev.carousel-item-right,.carousel-fade .carousel-item.active{z-index:1;opacity:1}.carousel-fade .active.carousel-item-left,.carousel-fade .active.carousel-item-right{z-index:0;opacity:0;transition:0s .6s opacity}@media (prefers-reduced-motion:reduce){.carousel-fade 
.active.carousel-item-left,.carousel-fade .active.carousel-item-right{transition:none}}.carousel-control-next,.carousel-control-prev{position:absolute;top:0;bottom:0;z-index:1;display:-ms-flexbox;display:flex;-ms-flex-align:center;align-items:center;-ms-flex-pack:center;justify-content:center;width:15%;color:#fff;text-align:center;opacity:.5;transition:opacity .15s ease}@media (prefers-reduced-motion:reduce){.carousel-control-next,.carousel-control-prev{transition:none}}.carousel-control-next:focus,.carousel-control-next:hover,.carousel-control-prev:focus,.carousel-control-prev:hover{color:#fff;text-decoration:none;outline:0;opacity:.9}.carousel-control-prev{left:0}.carousel-control-next{right:0}.carousel-control-next-icon,.carousel-control-prev-icon{display:inline-block;width:20px;height:20px;background:no-repeat 50%/100% 100%}.carousel-control-prev-icon{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3e%3cpath d='M5.25 0l-4 4 4 4 1.5-1.5-2.5-2.5 2.5-2.5-1.5-1.5z'/%3e%3c/svg%3e")}.carousel-control-next-icon{background-image:url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3e%3cpath d='M2.75 0l-1.5 1.5 2.5 2.5-2.5 2.5 1.5 1.5 4-4-4-4z'/%3e%3c/svg%3e")}.carousel-indicators{position:absolute;right:0;bottom:0;left:0;z-index:15;display:-ms-flexbox;display:flex;-ms-flex-pack:center;justify-content:center;padding-left:0;margin-right:15%;margin-left:15%;list-style:none}.carousel-indicators li{box-sizing:content-box;-ms-flex:0 1 auto;flex:0 1 auto;width:30px;height:3px;margin-right:3px;margin-left:3px;text-indent:-999px;cursor:pointer;background-color:#fff;background-clip:padding-box;border-top:10px solid transparent;border-bottom:10px solid transparent;opacity:.5;transition:opacity .6s ease}@media (prefers-reduced-motion:reduce){.carousel-indicators li{transition:none}}.carousel-indicators 
.active{opacity:1}.carousel-caption{position:absolute;right:15%;bottom:20px;left:15%;z-index:10;padding-top:20px;padding-bottom:20px;color:#fff;text-align:center}@-webkit-keyframes spinner-border{to{-webkit-transform:rotate(360deg);transform:rotate(360deg)}}@keyframes spinner-border{to{-webkit-transform:rotate(360deg);transform:rotate(360deg)}}.spinner-border{display:inline-block;width:2rem;height:2rem;vertical-align:text-bottom;border:.25em solid currentColor;border-right-color:transparent;border-radius:50%;-webkit-animation:spinner-border .75s linear infinite;animation:spinner-border .75s linear infinite}.spinner-border-sm{width:1rem;height:1rem;border-width:.2em}@-webkit-keyframes spinner-grow{0%{-webkit-transform:scale(0);transform:scale(0)}50%{opacity:1}}@keyframes spinner-grow{0%{-webkit-transform:scale(0);transform:scale(0)}50%{opacity:1}}.spinner-grow{display:inline-block;width:2rem;height:2rem;vertical-align:text-bottom;background-color:currentColor;border-radius:50%;opacity:0;-webkit-animation:spinner-grow .75s linear infinite;animation:spinner-grow .75s linear 
infinite}.spinner-grow-sm{width:1rem;height:1rem}.align-baseline{vertical-align:baseline!important}.align-top{vertical-align:top!important}.align-middle{vertical-align:middle!important}.align-bottom{vertical-align:bottom!important}.align-text-bottom{vertical-align:text-bottom!important}.align-text-top{vertical-align:text-top!important}.bg-primary{background-color:#007bff!important}a.bg-primary:focus,a.bg-primary:hover,button.bg-primary:focus,button.bg-primary:hover{background-color:#0062cc!important}.bg-secondary{background-color:#6c757d!important}a.bg-secondary:focus,a.bg-secondary:hover,button.bg-secondary:focus,button.bg-secondary:hover{background-color:#545b62!important}.bg-success{background-color:#28a745!important}a.bg-success:focus,a.bg-success:hover,button.bg-success:focus,button.bg-success:hover{background-color:#1e7e34!important}.bg-info{background-color:#17a2b8!important}a.bg-info:focus,a.bg-info:hover,button.bg-info:focus,button.bg-info:hover{background-color:#117a8b!important}.bg-warning{background-color:#ffc107!important}a.bg-warning:focus,a.bg-warning:hover,button.bg-warning:focus,button.bg-warning:hover{background-color:#d39e00!important}.bg-danger{background-color:#dc3545!important}a.bg-danger:focus,a.bg-danger:hover,button.bg-danger:focus,button.bg-danger:hover{background-color:#bd2130!important}.bg-light{background-color:#f8f9fa!important}a.bg-light:focus,a.bg-light:hover,button.bg-light:focus,button.bg-light:hover{background-color:#dae0e5!important}.bg-dark{background-color:#343a40!important}a.bg-dark:focus,a.bg-dark:hover,button.bg-dark:focus,button.bg-dark:hover{background-color:#1d2124!important}.bg-white{background-color:#fff!important}.bg-transparent{background-color:transparent!important}.border{border:1px solid #dee2e6!important}.border-top{border-top:1px solid #dee2e6!important}.border-right{border-right:1px solid #dee2e6!important}.border-bottom{border-bottom:1px solid #dee2e6!important}.border-left{border-left:1px solid 
#dee2e6!important}.border-0{border:0!important}.border-top-0{border-top:0!important}.border-right-0{border-right:0!important}.border-bottom-0{border-bottom:0!important}.border-left-0{border-left:0!important}.border-primary{border-color:#007bff!important}.border-secondary{border-color:#6c757d!important}.border-success{border-color:#28a745!important}.border-info{border-color:#17a2b8!important}.border-warning{border-color:#ffc107!important}.border-danger{border-color:#dc3545!important}.border-light{border-color:#f8f9fa!important}.border-dark{border-color:#343a40!important}.border-white{border-color:#fff!important}.rounded-sm{border-radius:.2rem!important}.rounded{border-radius:.25rem!important}.rounded-top{border-top-left-radius:.25rem!important;border-top-right-radius:.25rem!important}.rounded-right{border-top-right-radius:.25rem!important;border-bottom-right-radius:.25rem!important}.rounded-bottom{border-bottom-right-radius:.25rem!important;border-bottom-left-radius:.25rem!important}.rounded-left{border-top-left-radius:.25rem!important;border-bottom-left-radius:.25rem!important}.rounded-lg{border-radius:.3rem!important}.rounded-circle{border-radius:50%!important}.rounded-pill{border-radius:50rem!important}.rounded-0{border-radius:0!important}.clearfix::after{display:block;clear:both;content:""}.d-none{display:none!important}.d-inline{display:inline!important}.d-inline-block{display:inline-block!important}.d-block{display:block!important}.d-table{display:table!important}.d-table-row{display:table-row!important}.d-table-cell{display:table-cell!important}.d-flex{display:-ms-flexbox!important;display:flex!important}.d-inline-flex{display:-ms-inline-flexbox!important;display:inline-flex!important}@media 
(min-width:576px){.d-sm-none{display:none!important}.d-sm-inline{display:inline!important}.d-sm-inline-block{display:inline-block!important}.d-sm-block{display:block!important}.d-sm-table{display:table!important}.d-sm-table-row{display:table-row!important}.d-sm-table-cell{display:table-cell!important}.d-sm-flex{display:-ms-flexbox!important;display:flex!important}.d-sm-inline-flex{display:-ms-inline-flexbox!important;display:inline-flex!important}}@media (min-width:768px){.d-md-none{display:none!important}.d-md-inline{display:inline!important}.d-md-inline-block{display:inline-block!important}.d-md-block{display:block!important}.d-md-table{display:table!important}.d-md-table-row{display:table-row!important}.d-md-table-cell{display:table-cell!important}.d-md-flex{display:-ms-flexbox!important;display:flex!important}.d-md-inline-flex{display:-ms-inline-flexbox!important;display:inline-flex!important}}@media (min-width:992px){.d-lg-none{display:none!important}.d-lg-inline{display:inline!important}.d-lg-inline-block{display:inline-block!important}.d-lg-block{display:block!important}.d-lg-table{display:table!important}.d-lg-table-row{display:table-row!important}.d-lg-table-cell{display:table-cell!important}.d-lg-flex{display:-ms-flexbox!important;display:flex!important}.d-lg-inline-flex{display:-ms-inline-flexbox!important;display:inline-flex!important}}@media (min-width:1200px){.d-xl-none{display:none!important}.d-xl-inline{display:inline!important}.d-xl-inline-block{display:inline-block!important}.d-xl-block{display:block!important}.d-xl-table{display:table!important}.d-xl-table-row{display:table-row!important}.d-xl-table-cell{display:table-cell!important}.d-xl-flex{display:-ms-flexbox!important;display:flex!important}.d-xl-inline-flex{display:-ms-inline-flexbox!important;display:inline-flex!important}}@media 
print{.d-print-none{display:none!important}.d-print-inline{display:inline!important}.d-print-inline-block{display:inline-block!important}.d-print-block{display:block!important}.d-print-table{display:table!important}.d-print-table-row{display:table-row!important}.d-print-table-cell{display:table-cell!important}.d-print-flex{display:-ms-flexbox!important;display:flex!important}.d-print-inline-flex{display:-ms-inline-flexbox!important;display:inline-flex!important}}.embed-responsive{position:relative;display:block;width:100%;padding:0;overflow:hidden}.embed-responsive::before{display:block;content:""}.embed-responsive .embed-responsive-item,.embed-responsive embed,.embed-responsive iframe,.embed-responsive object,.embed-responsive video{position:absolute;top:0;bottom:0;left:0;width:100%;height:100%;border:0}.embed-responsive-21by9::before{padding-top:42.857143%}.embed-responsive-16by9::before{padding-top:56.25%}.embed-responsive-4by3::before{padding-top:75%}.embed-responsive-1by1::before{padding-top:100%}.flex-row{-ms-flex-direction:row!important;flex-direction:row!important}.flex-column{-ms-flex-direction:column!important;flex-direction:column!important}.flex-row-reverse{-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-column-reverse{-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-wrap{-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-nowrap{-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-wrap-reverse{-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.flex-fill{-ms-flex:1 1 auto!important;flex:1 1 
auto!important}.flex-grow-0{-ms-flex-positive:0!important;flex-grow:0!important}.flex-grow-1{-ms-flex-positive:1!important;flex-grow:1!important}.flex-shrink-0{-ms-flex-negative:0!important;flex-shrink:0!important}.flex-shrink-1{-ms-flex-negative:1!important;flex-shrink:1!important}.justify-content-start{-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-end{-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-center{-ms-flex-pack:center!important;justify-content:center!important}.justify-content-between{-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-around{-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-start{-ms-flex-align:start!important;align-items:flex-start!important}.align-items-end{-ms-flex-align:end!important;align-items:flex-end!important}.align-items-center{-ms-flex-align:center!important;align-items:center!important}.align-items-baseline{-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-stretch{-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-start{-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-end{-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-center{-ms-flex-line-pack:center!important;align-content:center!important}.align-content-between{-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-around{-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-stretch{-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-auto{-ms-flex-item-align:auto!important;align-self:auto!important}.align-self-start{-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-end{-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-center{-ms-flex-item-align:center!impo
rtant;align-self:center!important}.align-self-baseline{-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-stretch{-ms-flex-item-align:stretch!important;align-self:stretch!important}@media (min-width:576px){.flex-sm-row{-ms-flex-direction:row!important;flex-direction:row!important}.flex-sm-column{-ms-flex-direction:column!important;flex-direction:column!important}.flex-sm-row-reverse{-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-sm-column-reverse{-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-sm-wrap{-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-sm-nowrap{-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-sm-wrap-reverse{-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.flex-sm-fill{-ms-flex:1 1 auto!important;flex:1 1 auto!important}.flex-sm-grow-0{-ms-flex-positive:0!important;flex-grow:0!important}.flex-sm-grow-1{-ms-flex-positive:1!important;flex-grow:1!important}.flex-sm-shrink-0{-ms-flex-negative:0!important;flex-shrink:0!important}.flex-sm-shrink-1{-ms-flex-negative:1!important;flex-shrink:1!important}.justify-content-sm-start{-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-sm-end{-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-sm-center{-ms-flex-pack:center!important;justify-content:center!important}.justify-content-sm-between{-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-sm-around{-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-sm-start{-ms-flex-align:start!important;align-items:flex-start!important}.align-items-sm-end{-ms-flex-align:end!important;align-items:flex-end!important}.align-items-sm-center{-ms-flex-align:center!important;align-items:center!important}.align-items-sm-baseline{-ms-flex-align:baseline!important;align-items:baseline!important}.align
-items-sm-stretch{-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-sm-start{-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-sm-end{-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-sm-center{-ms-flex-line-pack:center!important;align-content:center!important}.align-content-sm-between{-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-sm-around{-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-sm-stretch{-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-sm-auto{-ms-flex-item-align:auto!important;align-self:auto!important}.align-self-sm-start{-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-sm-end{-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-sm-center{-ms-flex-item-align:center!important;align-self:center!important}.align-self-sm-baseline{-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-sm-stretch{-ms-flex-item-align:stretch!important;align-self:stretch!important}}@media (min-width:768px){.flex-md-row{-ms-flex-direction:row!important;flex-direction:row!important}.flex-md-column{-ms-flex-direction:column!important;flex-direction:column!important}.flex-md-row-reverse{-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-md-column-reverse{-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-md-wrap{-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-md-nowrap{-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-md-wrap-reverse{-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.flex-md-fill{-ms-flex:1 1 auto!important;flex:1 1 
auto!important}.flex-md-grow-0{-ms-flex-positive:0!important;flex-grow:0!important}.flex-md-grow-1{-ms-flex-positive:1!important;flex-grow:1!important}.flex-md-shrink-0{-ms-flex-negative:0!important;flex-shrink:0!important}.flex-md-shrink-1{-ms-flex-negative:1!important;flex-shrink:1!important}.justify-content-md-start{-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-md-end{-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-md-center{-ms-flex-pack:center!important;justify-content:center!important}.justify-content-md-between{-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-md-around{-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-md-start{-ms-flex-align:start!important;align-items:flex-start!important}.align-items-md-end{-ms-flex-align:end!important;align-items:flex-end!important}.align-items-md-center{-ms-flex-align:center!important;align-items:center!important}.align-items-md-baseline{-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-md-stretch{-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-md-start{-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-md-end{-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-md-center{-ms-flex-line-pack:center!important;align-content:center!important}.align-content-md-between{-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-md-around{-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-md-stretch{-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-md-auto{-ms-flex-item-align:auto!important;align-self:auto!important}.align-self-md-start{-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-md-end{-ms-flex-item-align:end!important;align-self:
flex-end!important}.align-self-md-center{-ms-flex-item-align:center!important;align-self:center!important}.align-self-md-baseline{-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-md-stretch{-ms-flex-item-align:stretch!important;align-self:stretch!important}}@media (min-width:992px){.flex-lg-row{-ms-flex-direction:row!important;flex-direction:row!important}.flex-lg-column{-ms-flex-direction:column!important;flex-direction:column!important}.flex-lg-row-reverse{-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-lg-column-reverse{-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-lg-wrap{-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-lg-nowrap{-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-lg-wrap-reverse{-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.flex-lg-fill{-ms-flex:1 1 auto!important;flex:1 1 auto!important}.flex-lg-grow-0{-ms-flex-positive:0!important;flex-grow:0!important}.flex-lg-grow-1{-ms-flex-positive:1!important;flex-grow:1!important}.flex-lg-shrink-0{-ms-flex-negative:0!important;flex-shrink:0!important}.flex-lg-shrink-1{-ms-flex-negative:1!important;flex-shrink:1!important}.justify-content-lg-start{-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-lg-end{-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-lg-center{-ms-flex-pack:center!important;justify-content:center!important}.justify-content-lg-between{-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-lg-around{-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-lg-start{-ms-flex-align:start!important;align-items:flex-start!important}.align-items-lg-end{-ms-flex-align:end!important;align-items:flex-end!important}.align-items-lg-center{-ms-flex-align:center!important;align-items:center!important}.align-items-lg-b
aseline{-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-lg-stretch{-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-lg-start{-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-lg-end{-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-lg-center{-ms-flex-line-pack:center!important;align-content:center!important}.align-content-lg-between{-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-lg-around{-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-lg-stretch{-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-lg-auto{-ms-flex-item-align:auto!important;align-self:auto!important}.align-self-lg-start{-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-lg-end{-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-lg-center{-ms-flex-item-align:center!important;align-self:center!important}.align-self-lg-baseline{-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-lg-stretch{-ms-flex-item-align:stretch!important;align-self:stretch!important}}@media (min-width:1200px){.flex-xl-row{-ms-flex-direction:row!important;flex-direction:row!important}.flex-xl-column{-ms-flex-direction:column!important;flex-direction:column!important}.flex-xl-row-reverse{-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-xl-column-reverse{-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-xl-wrap{-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-xl-nowrap{-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-xl-wrap-reverse{-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.flex-xl-fill{-ms-flex:1 1 auto!important;flex:1 1 
auto!important}.flex-xl-grow-0{-ms-flex-positive:0!important;flex-grow:0!important}.flex-xl-grow-1{-ms-flex-positive:1!important;flex-grow:1!important}.flex-xl-shrink-0{-ms-flex-negative:0!important;flex-shrink:0!important}.flex-xl-shrink-1{-ms-flex-negative:1!important;flex-shrink:1!important}.justify-content-xl-start{-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-xl-end{-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-xl-center{-ms-flex-pack:center!important;justify-content:center!important}.justify-content-xl-between{-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-xl-around{-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-xl-start{-ms-flex-align:start!important;align-items:flex-start!important}.align-items-xl-end{-ms-flex-align:end!important;align-items:flex-end!important}.align-items-xl-center{-ms-flex-align:center!important;align-items:center!important}.align-items-xl-baseline{-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-xl-stretch{-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-xl-start{-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-xl-end{-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-xl-center{-ms-flex-line-pack:center!important;align-content:center!important}.align-content-xl-between{-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-xl-around{-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-xl-stretch{-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-xl-auto{-ms-flex-item-align:auto!important;align-self:auto!important}.align-self-xl-start{-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-xl-end{-ms-flex-item-align:end!important;align-self:
flex-end!important}.align-self-xl-center{-ms-flex-item-align:center!important;align-self:center!important}.align-self-xl-baseline{-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-xl-stretch{-ms-flex-item-align:stretch!important;align-self:stretch!important}}.float-left{float:left!important}.float-right{float:right!important}.float-none{float:none!important}@media (min-width:576px){.float-sm-left{float:left!important}.float-sm-right{float:right!important}.float-sm-none{float:none!important}}@media (min-width:768px){.float-md-left{float:left!important}.float-md-right{float:right!important}.float-md-none{float:none!important}}@media (min-width:992px){.float-lg-left{float:left!important}.float-lg-right{float:right!important}.float-lg-none{float:none!important}}@media (min-width:1200px){.float-xl-left{float:left!important}.float-xl-right{float:right!important}.float-xl-none{float:none!important}}.overflow-auto{overflow:auto!important}.overflow-hidden{overflow:hidden!important}.position-static{position:static!important}.position-relative{position:relative!important}.position-absolute{position:absolute!important}.position-fixed{position:fixed!important}.position-sticky{position:-webkit-sticky!important;position:sticky!important}.fixed-top{position:fixed;top:0;right:0;left:0;z-index:1030}.fixed-bottom{position:fixed;right:0;bottom:0;left:0;z-index:1030}@supports ((position:-webkit-sticky) or (position:sticky)){.sticky-top{position:-webkit-sticky;position:sticky;top:0;z-index:1020}}.sr-only{position:absolute;width:1px;height:1px;padding:0;overflow:hidden;clip:rect(0,0,0,0);white-space:nowrap;border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;overflow:visible;clip:auto;white-space:normal}.shadow-sm{box-shadow:0 .125rem .25rem rgba(0,0,0,.075)!important}.shadow{box-shadow:0 .5rem 1rem rgba(0,0,0,.15)!important}.shadow-lg{box-shadow:0 1rem 3rem 
rgba(0,0,0,.175)!important}.shadow-none{box-shadow:none!important}.w-25{width:25%!important}.w-50{width:50%!important}.w-75{width:75%!important}.w-100{width:100%!important}.w-auto{width:auto!important}.h-25{height:25%!important}.h-50{height:50%!important}.h-75{height:75%!important}.h-100{height:100%!important}.h-auto{height:auto!important}.mw-100{max-width:100%!important}.mh-100{max-height:100%!important}.min-vw-100{min-width:100vw!important}.min-vh-100{min-height:100vh!important}.vw-100{width:100vw!important}.vh-100{height:100vh!important}.stretched-link::after{position:absolute;top:0;right:0;bottom:0;left:0;z-index:1;pointer-events:auto;content:"";background-color:rgba(0,0,0,0)}.m-0{margin:0!important}.mt-0,.my-0{margin-top:0!important}.mr-0,.mx-0{margin-right:0!important}.mb-0,.my-0{margin-bottom:0!important}.ml-0,.mx-0{margin-left:0!important}.m-1{margin:.25rem!important}.mt-1,.my-1{margin-top:.25rem!important}.mr-1,.mx-1{margin-right:.25rem!important}.mb-1,.my-1{margin-bottom:.25rem!important}.ml-1,.mx-1{margin-left:.25rem!important}.m-2{margin:.5rem!important}.mt-2,.my-2{margin-top:.5rem!important}.mr-2,.mx-2{margin-right:.5rem!important}.mb-2,.my-2{margin-bottom:.5rem!important}.ml-2,.mx-2{margin-left:.5rem!important}.m-3{margin:1rem!important}.mt-3,.my-3{margin-top:1rem!important}.mr-3,.mx-3{margin-right:1rem!important}.mb-3,.my-3{margin-bottom:1rem!important}.ml-3,.mx-3{margin-left:1rem!important}.m-4{margin:1.5rem!important}.mt-4,.my-4{margin-top:1.5rem!important}.mr-4,.mx-4{margin-right:1.5rem!important}.mb-4,.my-4{margin-bottom:1.5rem!important}.ml-4,.mx-4{margin-left:1.5rem!important}.m-5{margin:3rem!important}.mt-5,.my-5{margin-top:3rem!important}.mr-5,.mx-5{margin-right:3rem!important}.mb-5,.my-5{margin-bottom:3rem!important}.ml-5,.mx-5{margin-left:3rem!important}.p-0{padding:0!important}.pt-0,.py-0{padding-top:0!important}.pr-0,.px-0{padding-right:0!important}.pb-0,.py-0{padding-bottom:0!important}.pl-0,.px-0{padding-left:0!important}.p-1{padding:.25
rem!important}.pt-1,.py-1{padding-top:.25rem!important}.pr-1,.px-1{padding-right:.25rem!important}.pb-1,.py-1{padding-bottom:.25rem!important}.pl-1,.px-1{padding-left:.25rem!important}.p-2{padding:.5rem!important}.pt-2,.py-2{padding-top:.5rem!important}.pr-2,.px-2{padding-right:.5rem!important}.pb-2,.py-2{padding-bottom:.5rem!important}.pl-2,.px-2{padding-left:.5rem!important}.p-3{padding:1rem!important}.pt-3,.py-3{padding-top:1rem!important}.pr-3,.px-3{padding-right:1rem!important}.pb-3,.py-3{padding-bottom:1rem!important}.pl-3,.px-3{padding-left:1rem!important}.p-4{padding:1.5rem!important}.pt-4,.py-4{padding-top:1.5rem!important}.pr-4,.px-4{padding-right:1.5rem!important}.pb-4,.py-4{padding-bottom:1.5rem!important}.pl-4,.px-4{padding-left:1.5rem!important}.p-5{padding:3rem!important}.pt-5,.py-5{padding-top:3rem!important}.pr-5,.px-5{padding-right:3rem!important}.pb-5,.py-5{padding-bottom:3rem!important}.pl-5,.px-5{padding-left:3rem!important}.m-n1{margin:-.25rem!important}.mt-n1,.my-n1{margin-top:-.25rem!important}.mr-n1,.mx-n1{margin-right:-.25rem!important}.mb-n1,.my-n1{margin-bottom:-.25rem!important}.ml-n1,.mx-n1{margin-left:-.25rem!important}.m-n2{margin:-.5rem!important}.mt-n2,.my-n2{margin-top:-.5rem!important}.mr-n2,.mx-n2{margin-right:-.5rem!important}.mb-n2,.my-n2{margin-bottom:-.5rem!important}.ml-n2,.mx-n2{margin-left:-.5rem!important}.m-n3{margin:-1rem!important}.mt-n3,.my-n3{margin-top:-1rem!important}.mr-n3,.mx-n3{margin-right:-1rem!important}.mb-n3,.my-n3{margin-bottom:-1rem!important}.ml-n3,.mx-n3{margin-left:-1rem!important}.m-n4{margin:-1.5rem!important}.mt-n4,.my-n4{margin-top:-1.5rem!important}.mr-n4,.mx-n4{margin-right:-1.5rem!important}.mb-n4,.my-n4{margin-bottom:-1.5rem!important}.ml-n4,.mx-n4{margin-left:-1.5rem!important}.m-n5{margin:-3rem!important}.mt-n5,.my-n5{margin-top:-3rem!important}.mr-n5,.mx-n5{margin-right:-3rem!important}.mb-n5,.my-n5{margin-bottom:-3rem!important}.ml-n5,.mx-n5{margin-left:-3rem!important}.m-auto{margin:auto!i
mportant}.mt-auto,.my-auto{margin-top:auto!important}.mr-auto,.mx-auto{margin-right:auto!important}.mb-auto,.my-auto{margin-bottom:auto!important}.ml-auto,.mx-auto{margin-left:auto!important}@media (min-width:576px){.m-sm-0{margin:0!important}.mt-sm-0,.my-sm-0{margin-top:0!important}.mr-sm-0,.mx-sm-0{margin-right:0!important}.mb-sm-0,.my-sm-0{margin-bottom:0!important}.ml-sm-0,.mx-sm-0{margin-left:0!important}.m-sm-1{margin:.25rem!important}.mt-sm-1,.my-sm-1{margin-top:.25rem!important}.mr-sm-1,.mx-sm-1{margin-right:.25rem!important}.mb-sm-1,.my-sm-1{margin-bottom:.25rem!important}.ml-sm-1,.mx-sm-1{margin-left:.25rem!important}.m-sm-2{margin:.5rem!important}.mt-sm-2,.my-sm-2{margin-top:.5rem!important}.mr-sm-2,.mx-sm-2{margin-right:.5rem!important}.mb-sm-2,.my-sm-2{margin-bottom:.5rem!important}.ml-sm-2,.mx-sm-2{margin-left:.5rem!important}.m-sm-3{margin:1rem!important}.mt-sm-3,.my-sm-3{margin-top:1rem!important}.mr-sm-3,.mx-sm-3{margin-right:1rem!important}.mb-sm-3,.my-sm-3{margin-bottom:1rem!important}.ml-sm-3,.mx-sm-3{margin-left:1rem!important}.m-sm-4{margin:1.5rem!important}.mt-sm-4,.my-sm-4{margin-top:1.5rem!important}.mr-sm-4,.mx-sm-4{margin-right:1.5rem!important}.mb-sm-4,.my-sm-4{margin-bottom:1.5rem!important}.ml-sm-4,.mx-sm-4{margin-left:1.5rem!important}.m-sm-5{margin:3rem!important}.mt-sm-5,.my-sm-5{margin-top:3rem!important}.mr-sm-5,.mx-sm-5{margin-right:3rem!important}.mb-sm-5,.my-sm-5{margin-bottom:3rem!important}.ml-sm-5,.mx-sm-5{margin-left:3rem!important}.p-sm-0{padding:0!important}.pt-sm-0,.py-sm-0{padding-top:0!important}.pr-sm-0,.px-sm-0{padding-right:0!important}.pb-sm-0,.py-sm-0{padding-bottom:0!important}.pl-sm-0,.px-sm-0{padding-left:0!important}.p-sm-1{padding:.25rem!important}.pt-sm-1,.py-sm-1{padding-top:.25rem!important}.pr-sm-1,.px-sm-1{padding-right:.25rem!important}.pb-sm-1,.py-sm-1{padding-bottom:.25rem!important}.pl-sm-1,.px-sm-1{padding-left:.25rem!important}.p-sm-2{padding:.5rem!important}.pt-sm-2,.py-sm-2{padding-top:.5rem!impor
tant}.pr-sm-2,.px-sm-2{padding-right:.5rem!important}.pb-sm-2,.py-sm-2{padding-bottom:.5rem!important}.pl-sm-2,.px-sm-2{padding-left:.5rem!important}.p-sm-3{padding:1rem!important}.pt-sm-3,.py-sm-3{padding-top:1rem!important}.pr-sm-3,.px-sm-3{padding-right:1rem!important}.pb-sm-3,.py-sm-3{padding-bottom:1rem!important}.pl-sm-3,.px-sm-3{padding-left:1rem!important}.p-sm-4{padding:1.5rem!important}.pt-sm-4,.py-sm-4{padding-top:1.5rem!important}.pr-sm-4,.px-sm-4{padding-right:1.5rem!important}.pb-sm-4,.py-sm-4{padding-bottom:1.5rem!important}.pl-sm-4,.px-sm-4{padding-left:1.5rem!important}.p-sm-5{padding:3rem!important}.pt-sm-5,.py-sm-5{padding-top:3rem!important}.pr-sm-5,.px-sm-5{padding-right:3rem!important}.pb-sm-5,.py-sm-5{padding-bottom:3rem!important}.pl-sm-5,.px-sm-5{padding-left:3rem!important}.m-sm-n1{margin:-.25rem!important}.mt-sm-n1,.my-sm-n1{margin-top:-.25rem!important}.mr-sm-n1,.mx-sm-n1{margin-right:-.25rem!important}.mb-sm-n1,.my-sm-n1{margin-bottom:-.25rem!important}.ml-sm-n1,.mx-sm-n1{margin-left:-.25rem!important}.m-sm-n2{margin:-.5rem!important}.mt-sm-n2,.my-sm-n2{margin-top:-.5rem!important}.mr-sm-n2,.mx-sm-n2{margin-right:-.5rem!important}.mb-sm-n2,.my-sm-n2{margin-bottom:-.5rem!important}.ml-sm-n2,.mx-sm-n2{margin-left:-.5rem!important}.m-sm-n3{margin:-1rem!important}.mt-sm-n3,.my-sm-n3{margin-top:-1rem!important}.mr-sm-n3,.mx-sm-n3{margin-right:-1rem!important}.mb-sm-n3,.my-sm-n3{margin-bottom:-1rem!important}.ml-sm-n3,.mx-sm-n3{margin-left:-1rem!important}.m-sm-n4{margin:-1.5rem!important}.mt-sm-n4,.my-sm-n4{margin-top:-1.5rem!important}.mr-sm-n4,.mx-sm-n4{margin-right:-1.5rem!important}.mb-sm-n4,.my-sm-n4{margin-bottom:-1.5rem!important}.ml-sm-n4,.mx-sm-n4{margin-left:-1.5rem!important}.m-sm-n5{margin:-3rem!important}.mt-sm-n5,.my-sm-n5{margin-top:-3rem!important}.mr-sm-n5,.mx-sm-n5{margin-right:-3rem!important}.mb-sm-n5,.my-sm-n5{margin-bottom:-3rem!important}.ml-sm-n5,.mx-sm-n5{margin-left:-3rem!important}.m-sm-auto{margin:auto!important}.m
t-sm-auto,.my-sm-auto{margin-top:auto!important}.mr-sm-auto,.mx-sm-auto{margin-right:auto!important}.mb-sm-auto,.my-sm-auto{margin-bottom:auto!important}.ml-sm-auto,.mx-sm-auto{margin-left:auto!important}}@media (min-width:768px){.m-md-0{margin:0!important}.mt-md-0,.my-md-0{margin-top:0!important}.mr-md-0,.mx-md-0{margin-right:0!important}.mb-md-0,.my-md-0{margin-bottom:0!important}.ml-md-0,.mx-md-0{margin-left:0!important}.m-md-1{margin:.25rem!important}.mt-md-1,.my-md-1{margin-top:.25rem!important}.mr-md-1,.mx-md-1{margin-right:.25rem!important}.mb-md-1,.my-md-1{margin-bottom:.25rem!important}.ml-md-1,.mx-md-1{margin-left:.25rem!important}.m-md-2{margin:.5rem!important}.mt-md-2,.my-md-2{margin-top:.5rem!important}.mr-md-2,.mx-md-2{margin-right:.5rem!important}.mb-md-2,.my-md-2{margin-bottom:.5rem!important}.ml-md-2,.mx-md-2{margin-left:.5rem!important}.m-md-3{margin:1rem!important}.mt-md-3,.my-md-3{margin-top:1rem!important}.mr-md-3,.mx-md-3{margin-right:1rem!important}.mb-md-3,.my-md-3{margin-bottom:1rem!important}.ml-md-3,.mx-md-3{margin-left:1rem!important}.m-md-4{margin:1.5rem!important}.mt-md-4,.my-md-4{margin-top:1.5rem!important}.mr-md-4,.mx-md-4{margin-right:1.5rem!important}.mb-md-4,.my-md-4{margin-bottom:1.5rem!important}.ml-md-4,.mx-md-4{margin-left:1.5rem!important}.m-md-5{margin:3rem!important}.mt-md-5,.my-md-5{margin-top:3rem!important}.mr-md-5,.mx-md-5{margin-right:3rem!important}.mb-md-5,.my-md-5{margin-bottom:3rem!important}.ml-md-5,.mx-md-5{margin-left:3rem!important}.p-md-0{padding:0!important}.pt-md-0,.py-md-0{padding-top:0!important}.pr-md-0,.px-md-0{padding-right:0!important}.pb-md-0,.py-md-0{padding-bottom:0!important}.pl-md-0,.px-md-0{padding-left:0!important}.p-md-1{padding:.25rem!important}.pt-md-1,.py-md-1{padding-top:.25rem!important}.pr-md-1,.px-md-1{padding-right:.25rem!important}.pb-md-1,.py-md-1{padding-bottom:.25rem!important}.pl-md-1,.px-md-1{padding-left:.25rem!important}.p-md-2{padding:.5rem!important}.pt-md-2,.py-md-2{padding-t
op:.5rem!important}.pr-md-2,.px-md-2{padding-right:.5rem!important}.pb-md-2,.py-md-2{padding-bottom:.5rem!important}.pl-md-2,.px-md-2{padding-left:.5rem!important}.p-md-3{padding:1rem!important}.pt-md-3,.py-md-3{padding-top:1rem!important}.pr-md-3,.px-md-3{padding-right:1rem!important}.pb-md-3,.py-md-3{padding-bottom:1rem!important}.pl-md-3,.px-md-3{padding-left:1rem!important}.p-md-4{padding:1.5rem!important}.pt-md-4,.py-md-4{padding-top:1.5rem!important}.pr-md-4,.px-md-4{padding-right:1.5rem!important}.pb-md-4,.py-md-4{padding-bottom:1.5rem!important}.pl-md-4,.px-md-4{padding-left:1.5rem!important}.p-md-5{padding:3rem!important}.pt-md-5,.py-md-5{padding-top:3rem!important}.pr-md-5,.px-md-5{padding-right:3rem!important}.pb-md-5,.py-md-5{padding-bottom:3rem!important}.pl-md-5,.px-md-5{padding-left:3rem!important}.m-md-n1{margin:-.25rem!important}.mt-md-n1,.my-md-n1{margin-top:-.25rem!important}.mr-md-n1,.mx-md-n1{margin-right:-.25rem!important}.mb-md-n1,.my-md-n1{margin-bottom:-.25rem!important}.ml-md-n1,.mx-md-n1{margin-left:-.25rem!important}.m-md-n2{margin:-.5rem!important}.mt-md-n2,.my-md-n2{margin-top:-.5rem!important}.mr-md-n2,.mx-md-n2{margin-right:-.5rem!important}.mb-md-n2,.my-md-n2{margin-bottom:-.5rem!important}.ml-md-n2,.mx-md-n2{margin-left:-.5rem!important}.m-md-n3{margin:-1rem!important}.mt-md-n3,.my-md-n3{margin-top:-1rem!important}.mr-md-n3,.mx-md-n3{margin-right:-1rem!important}.mb-md-n3,.my-md-n3{margin-bottom:-1rem!important}.ml-md-n3,.mx-md-n3{margin-left:-1rem!important}.m-md-n4{margin:-1.5rem!important}.mt-md-n4,.my-md-n4{margin-top:-1.5rem!important}.mr-md-n4,.mx-md-n4{margin-right:-1.5rem!important}.mb-md-n4,.my-md-n4{margin-bottom:-1.5rem!important}.ml-md-n4,.mx-md-n4{margin-left:-1.5rem!important}.m-md-n5{margin:-3rem!important}.mt-md-n5,.my-md-n5{margin-top:-3rem!important}.mr-md-n5,.mx-md-n5{margin-right:-3rem!important}.mb-md-n5,.my-md-n5{margin-bottom:-3rem!important}.ml-md-n5,.mx-md-n5{margin-left:-3rem!important}.m-md-auto{margin:aut
o!important}.mt-md-auto,.my-md-auto{margin-top:auto!important}.mr-md-auto,.mx-md-auto{margin-right:auto!important}.mb-md-auto,.my-md-auto{margin-bottom:auto!important}.ml-md-auto,.mx-md-auto{margin-left:auto!important}}@media (min-width:992px){.m-lg-0{margin:0!important}.mt-lg-0,.my-lg-0{margin-top:0!important}.mr-lg-0,.mx-lg-0{margin-right:0!important}.mb-lg-0,.my-lg-0{margin-bottom:0!important}.ml-lg-0,.mx-lg-0{margin-left:0!important}.m-lg-1{margin:.25rem!important}.mt-lg-1,.my-lg-1{margin-top:.25rem!important}.mr-lg-1,.mx-lg-1{margin-right:.25rem!important}.mb-lg-1,.my-lg-1{margin-bottom:.25rem!important}.ml-lg-1,.mx-lg-1{margin-left:.25rem!important}.m-lg-2{margin:.5rem!important}.mt-lg-2,.my-lg-2{margin-top:.5rem!important}.mr-lg-2,.mx-lg-2{margin-right:.5rem!important}.mb-lg-2,.my-lg-2{margin-bottom:.5rem!important}.ml-lg-2,.mx-lg-2{margin-left:.5rem!important}.m-lg-3{margin:1rem!important}.mt-lg-3,.my-lg-3{margin-top:1rem!important}.mr-lg-3,.mx-lg-3{margin-right:1rem!important}.mb-lg-3,.my-lg-3{margin-bottom:1rem!important}.ml-lg-3,.mx-lg-3{margin-left:1rem!important}.m-lg-4{margin:1.5rem!important}.mt-lg-4,.my-lg-4{margin-top:1.5rem!important}.mr-lg-4,.mx-lg-4{margin-right:1.5rem!important}.mb-lg-4,.my-lg-4{margin-bottom:1.5rem!important}.ml-lg-4,.mx-lg-4{margin-left:1.5rem!important}.m-lg-5{margin:3rem!important}.mt-lg-5,.my-lg-5{margin-top:3rem!important}.mr-lg-5,.mx-lg-5{margin-right:3rem!important}.mb-lg-5,.my-lg-5{margin-bottom:3rem!important}.ml-lg-5,.mx-lg-5{margin-left:3rem!important}.p-lg-0{padding:0!important}.pt-lg-0,.py-lg-0{padding-top:0!important}.pr-lg-0,.px-lg-0{padding-right:0!important}.pb-lg-0,.py-lg-0{padding-bottom:0!important}.pl-lg-0,.px-lg-0{padding-left:0!important}.p-lg-1{padding:.25rem!important}.pt-lg-1,.py-lg-1{padding-top:.25rem!important}.pr-lg-1,.px-lg-1{padding-right:.25rem!important}.pb-lg-1,.py-lg-1{padding-bottom:.25rem!important}.pl-lg-1,.px-lg-1{padding-left:.25rem!important}.p-lg-2{padding:.5rem!important}.pt-lg-2,.py-
lg-2{padding-top:.5rem!important}.pr-lg-2,.px-lg-2{padding-right:.5rem!important}.pb-lg-2,.py-lg-2{padding-bottom:.5rem!important}.pl-lg-2,.px-lg-2{padding-left:.5rem!important}.p-lg-3{padding:1rem!important}.pt-lg-3,.py-lg-3{padding-top:1rem!important}.pr-lg-3,.px-lg-3{padding-right:1rem!important}.pb-lg-3,.py-lg-3{padding-bottom:1rem!important}.pl-lg-3,.px-lg-3{padding-left:1rem!important}.p-lg-4{padding:1.5rem!important}.pt-lg-4,.py-lg-4{padding-top:1.5rem!important}.pr-lg-4,.px-lg-4{padding-right:1.5rem!important}.pb-lg-4,.py-lg-4{padding-bottom:1.5rem!important}.pl-lg-4,.px-lg-4{padding-left:1.5rem!important}.p-lg-5{padding:3rem!important}.pt-lg-5,.py-lg-5{padding-top:3rem!important}.pr-lg-5,.px-lg-5{padding-right:3rem!important}.pb-lg-5,.py-lg-5{padding-bottom:3rem!important}.pl-lg-5,.px-lg-5{padding-left:3rem!important}.m-lg-n1{margin:-.25rem!important}.mt-lg-n1,.my-lg-n1{margin-top:-.25rem!important}.mr-lg-n1,.mx-lg-n1{margin-right:-.25rem!important}.mb-lg-n1,.my-lg-n1{margin-bottom:-.25rem!important}.ml-lg-n1,.mx-lg-n1{margin-left:-.25rem!important}.m-lg-n2{margin:-.5rem!important}.mt-lg-n2,.my-lg-n2{margin-top:-.5rem!important}.mr-lg-n2,.mx-lg-n2{margin-right:-.5rem!important}.mb-lg-n2,.my-lg-n2{margin-bottom:-.5rem!important}.ml-lg-n2,.mx-lg-n2{margin-left:-.5rem!important}.m-lg-n3{margin:-1rem!important}.mt-lg-n3,.my-lg-n3{margin-top:-1rem!important}.mr-lg-n3,.mx-lg-n3{margin-right:-1rem!important}.mb-lg-n3,.my-lg-n3{margin-bottom:-1rem!important}.ml-lg-n3,.mx-lg-n3{margin-left:-1rem!important}.m-lg-n4{margin:-1.5rem!important}.mt-lg-n4,.my-lg-n4{margin-top:-1.5rem!important}.mr-lg-n4,.mx-lg-n4{margin-right:-1.5rem!important}.mb-lg-n4,.my-lg-n4{margin-bottom:-1.5rem!important}.ml-lg-n4,.mx-lg-n4{margin-left:-1.5rem!important}.m-lg-n5{margin:-3rem!important}.mt-lg-n5,.my-lg-n5{margin-top:-3rem!important}.mr-lg-n5,.mx-lg-n5{margin-right:-3rem!important}.mb-lg-n5,.my-lg-n5{margin-bottom:-3rem!important}.ml-lg-n5,.mx-lg-n5{margin-left:-3rem!important}.m-lg-a
uto{margin:auto!important}.mt-lg-auto,.my-lg-auto{margin-top:auto!important}.mr-lg-auto,.mx-lg-auto{margin-right:auto!important}.mb-lg-auto,.my-lg-auto{margin-bottom:auto!important}.ml-lg-auto,.mx-lg-auto{margin-left:auto!important}}@media (min-width:1200px){.m-xl-0{margin:0!important}.mt-xl-0,.my-xl-0{margin-top:0!important}.mr-xl-0,.mx-xl-0{margin-right:0!important}.mb-xl-0,.my-xl-0{margin-bottom:0!important}.ml-xl-0,.mx-xl-0{margin-left:0!important}.m-xl-1{margin:.25rem!important}.mt-xl-1,.my-xl-1{margin-top:.25rem!important}.mr-xl-1,.mx-xl-1{margin-right:.25rem!important}.mb-xl-1,.my-xl-1{margin-bottom:.25rem!important}.ml-xl-1,.mx-xl-1{margin-left:.25rem!important}.m-xl-2{margin:.5rem!important}.mt-xl-2,.my-xl-2{margin-top:.5rem!important}.mr-xl-2,.mx-xl-2{margin-right:.5rem!important}.mb-xl-2,.my-xl-2{margin-bottom:.5rem!important}.ml-xl-2,.mx-xl-2{margin-left:.5rem!important}.m-xl-3{margin:1rem!important}.mt-xl-3,.my-xl-3{margin-top:1rem!important}.mr-xl-3,.mx-xl-3{margin-right:1rem!important}.mb-xl-3,.my-xl-3{margin-bottom:1rem!important}.ml-xl-3,.mx-xl-3{margin-left:1rem!important}.m-xl-4{margin:1.5rem!important}.mt-xl-4,.my-xl-4{margin-top:1.5rem!important}.mr-xl-4,.mx-xl-4{margin-right:1.5rem!important}.mb-xl-4,.my-xl-4{margin-bottom:1.5rem!important}.ml-xl-4,.mx-xl-4{margin-left:1.5rem!important}.m-xl-5{margin:3rem!important}.mt-xl-5,.my-xl-5{margin-top:3rem!important}.mr-xl-5,.mx-xl-5{margin-right:3rem!important}.mb-xl-5,.my-xl-5{margin-bottom:3rem!important}.ml-xl-5,.mx-xl-5{margin-left:3rem!important}.p-xl-0{padding:0!important}.pt-xl-0,.py-xl-0{padding-top:0!important}.pr-xl-0,.px-xl-0{padding-right:0!important}.pb-xl-0,.py-xl-0{padding-bottom:0!important}.pl-xl-0,.px-xl-0{padding-left:0!important}.p-xl-1{padding:.25rem!important}.pt-xl-1,.py-xl-1{padding-top:.25rem!important}.pr-xl-1,.px-xl-1{padding-right:.25rem!important}.pb-xl-1,.py-xl-1{padding-bottom:.25rem!important}.pl-xl-1,.px-xl-1{padding-left:.25rem!important}.p-xl-2{padding:.5rem!importan
t}.pt-xl-2,.py-xl-2{padding-top:.5rem!important}.pr-xl-2,.px-xl-2{padding-right:.5rem!important}.pb-xl-2,.py-xl-2{padding-bottom:.5rem!important}.pl-xl-2,.px-xl-2{padding-left:.5rem!important}.p-xl-3{padding:1rem!important}.pt-xl-3,.py-xl-3{padding-top:1rem!important}.pr-xl-3,.px-xl-3{padding-right:1rem!important}.pb-xl-3,.py-xl-3{padding-bottom:1rem!important}.pl-xl-3,.px-xl-3{padding-left:1rem!important}.p-xl-4{padding:1.5rem!important}.pt-xl-4,.py-xl-4{padding-top:1.5rem!important}.pr-xl-4,.px-xl-4{padding-right:1.5rem!important}.pb-xl-4,.py-xl-4{padding-bottom:1.5rem!important}.pl-xl-4,.px-xl-4{padding-left:1.5rem!important}.p-xl-5{padding:3rem!important}.pt-xl-5,.py-xl-5{padding-top:3rem!important}.pr-xl-5,.px-xl-5{padding-right:3rem!important}.pb-xl-5,.py-xl-5{padding-bottom:3rem!important}.pl-xl-5,.px-xl-5{padding-left:3rem!important}.m-xl-n1{margin:-.25rem!important}.mt-xl-n1,.my-xl-n1{margin-top:-.25rem!important}.mr-xl-n1,.mx-xl-n1{margin-right:-.25rem!important}.mb-xl-n1,.my-xl-n1{margin-bottom:-.25rem!important}.ml-xl-n1,.mx-xl-n1{margin-left:-.25rem!important}.m-xl-n2{margin:-.5rem!important}.mt-xl-n2,.my-xl-n2{margin-top:-.5rem!important}.mr-xl-n2,.mx-xl-n2{margin-right:-.5rem!important}.mb-xl-n2,.my-xl-n2{margin-bottom:-.5rem!important}.ml-xl-n2,.mx-xl-n2{margin-left:-.5rem!important}.m-xl-n3{margin:-1rem!important}.mt-xl-n3,.my-xl-n3{margin-top:-1rem!important}.mr-xl-n3,.mx-xl-n3{margin-right:-1rem!important}.mb-xl-n3,.my-xl-n3{margin-bottom:-1rem!important}.ml-xl-n3,.mx-xl-n3{margin-left:-1rem!important}.m-xl-n4{margin:-1.5rem!important}.mt-xl-n4,.my-xl-n4{margin-top:-1.5rem!important}.mr-xl-n4,.mx-xl-n4{margin-right:-1.5rem!important}.mb-xl-n4,.my-xl-n4{margin-bottom:-1.5rem!important}.ml-xl-n4,.mx-xl-n4{margin-left:-1.5rem!important}.m-xl-n5{margin:-3rem!important}.mt-xl-n5,.my-xl-n5{margin-top:-3rem!important}.mr-xl-n5,.mx-xl-n5{margin-right:-3rem!important}.mb-xl-n5,.my-xl-n5{margin-bottom:-3rem!important}.ml-xl-n5,.mx-xl-n5{margin-left:-3rem!im
portant}.m-xl-auto{margin:auto!important}.mt-xl-auto,.my-xl-auto{margin-top:auto!important}.mr-xl-auto,.mx-xl-auto{margin-right:auto!important}.mb-xl-auto,.my-xl-auto{margin-bottom:auto!important}.ml-xl-auto,.mx-xl-auto{margin-left:auto!important}}.text-monospace{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace!important}.text-justify{text-align:justify!important}.text-wrap{white-space:normal!important}.text-nowrap{white-space:nowrap!important}.text-truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.text-left{text-align:left!important}.text-right{text-align:right!important}.text-center{text-align:center!important}@media (min-width:576px){.text-sm-left{text-align:left!important}.text-sm-right{text-align:right!important}.text-sm-center{text-align:center!important}}@media (min-width:768px){.text-md-left{text-align:left!important}.text-md-right{text-align:right!important}.text-md-center{text-align:center!important}}@media (min-width:992px){.text-lg-left{text-align:left!important}.text-lg-right{text-align:right!important}.text-lg-center{text-align:center!important}}@media 
(min-width:1200px){.text-xl-left{text-align:left!important}.text-xl-right{text-align:right!important}.text-xl-center{text-align:center!important}}.text-lowercase{text-transform:lowercase!important}.text-uppercase{text-transform:uppercase!important}.text-capitalize{text-transform:capitalize!important}.font-weight-light{font-weight:300!important}.font-weight-lighter{font-weight:lighter!important}.font-weight-normal{font-weight:400!important}.font-weight-bold{font-weight:700!important}.font-weight-bolder{font-weight:bolder!important}.font-italic{font-style:italic!important}.text-white{color:#fff!important}.text-primary{color:#007bff!important}a.text-primary:focus,a.text-primary:hover{color:#0056b3!important}.text-secondary{color:#6c757d!important}a.text-secondary:focus,a.text-secondary:hover{color:#494f54!important}.text-success{color:#28a745!important}a.text-success:focus,a.text-success:hover{color:#19692c!important}.text-info{color:#17a2b8!important}a.text-info:focus,a.text-info:hover{color:#0f6674!important}.text-warning{color:#ffc107!important}a.text-warning:focus,a.text-warning:hover{color:#ba8b00!important}.text-danger{color:#dc3545!important}a.text-danger:focus,a.text-danger:hover{color:#a71d2a!important}.text-light{color:#f8f9fa!important}a.text-light:focus,a.text-light:hover{color:#cbd3da!important}.text-dark{color:#343a40!important}a.text-dark:focus,a.text-dark:hover{color:#121416!important}.text-body{color:#212529!important}.text-muted{color:#6c757d!important}.text-black-50{color:rgba(0,0,0,.5)!important}.text-white-50{color:rgba(255,255,255,.5)!important}.text-hide{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.text-decoration-none{text-decoration:none!important}.text-break{word-break:break-word!important;overflow-wrap:break-word!important}.text-reset{color:inherit!important}.visible{visibility:visible!important}.invisible{visibility:hidden!important}@media 
print{*,::after,::before{text-shadow:none!important;box-shadow:none!important}a:not(.btn){text-decoration:underline}abbr[title]::after{content:" (" attr(title) ")"}pre{white-space:pre-wrap!important}blockquote,pre{border:1px solid #adb5bd;page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}h2,h3,p{orphans:3;widows:3}h2,h3{page-break-after:avoid}@page{size:a3}body{min-width:992px!important}.container{min-width:992px!important}.navbar{display:none}.badge{border:1px solid #000}.table{border-collapse:collapse!important}.table td,.table th{background-color:#fff!important}.table-bordered td,.table-bordered th{border:1px solid #dee2e6!important}.table-dark{color:inherit}.table-dark tbody+tbody,.table-dark td,.table-dark th,.table-dark thead th{border-color:#dee2e6}.table .thead-dark th{color:inherit;border-color:#dee2e6}} -/*# sourceMappingURL=bootstrap.min.css.map */ \ No newline at end of file diff --git a/previews/PR44/cuda/index.html b/previews/PR44/cuda/index.html deleted file mode 100644 index 9d03bcc..0000000 --- a/previews/PR44/cuda/index.html +++ /dev/null @@ -1,211 +0,0 @@ - - - - - - - - - - - - - - - - - - - NVIDIA CUDA ⋅ JuliaGPU - - - - - - -
-
- - - - -

NVIDIA CUDA

- -

- - - - - - -

- -

The programming support for NVIDIA GPUs in Julia is provided by the CUDA.jl package. It is built on the CUDA toolkit, and aims to be as full-featured and offer the same performance as CUDA C. The toolchain is mature, has been under development since 2014 and can easily be installed on any current version of Julia using the integrated package manager.

-

CUDA.jl makes it possible to program NVIDIA GPUs at different abstraction levels:

- -

The documentation of CUDA.jl demonstrates each of these approaches.

-

Performance

-

Julia on the CPU is known for its good performance, approaching that of statically compiled languages like C. The same holds for programming NVIDIA GPUs with kernels written using CUDA.jl, where we have shown the performance to approach and even sometimes exceed that of CUDA C on a selection[1] of applications from the Rodinia benchmark suite:

- -
- - - -
-

- Relative performance of Rodinia benchmarks implemented in Julia with CUDA.jl. -

-
-
- -
-

- - - - -
[1]Since porting applications from one programming language to another is labour
-intensive, we only ported and analyzed the 10 smallest benchmarks from the suite. More details can be found in the paper.

- - -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/index.html b/previews/PR44/index.html deleted file mode 100644 index ec1d18e..0000000 --- a/previews/PR44/index.html +++ /dev/null @@ -1,227 +0,0 @@ - - - - - - - - - - - - - - - - - - JuliaGPU - - - - - - - -
-
- - - - - -
-

- JuliaGPU

-

- High-performance GPU programming in a high-level language. -

-
- -

JuliaGPU is a Github organization created to unify the many packages for programming GPUs in Julia. With its high-level syntax and flexible compiler, Julia is well positioned to productively program hardware accelerators like GPUs without sacrificing performance.

-

Several GPU platforms are supported, but there are large differences in features and stability. On this website, you can find a brief introduction of the supported platforms and links to the respective home pages.

-
- -
- -


-

Supported platforms

-

The best supported GPU platform in Julia is NVIDIA CUDA, with mature and full-featured packages for both low-level kernel programming as well as working with high-level operations on arrays. All versions of Julia are supported, on Linux and Windows, and the functionality is actively used by a variety of applications and libraries.

-

Similar, but much newer capabilities exist for Intel GPUs with oneAPI. Currently, full-featured kernel programming capabilities are available, but there is no support for vendor libraries such as oneMKL or oneDNN yet.

-

Maturing support exists for AMD GPUs running on the ROCm stack. These GPUs can again be programmed in Julia at the kernel level or using high-level operations on arrays. Latest versions of Julia are supported, and the functionality is increasingly used by a variety of applications and libraries.

-

Experimental support also exists for Apple GPUs. Array programming and kernel programming are both supported.

-

Applications

-

Almost 300 packages rely directly or indirectly on Julia's GPU capabilities. A few noteworthy examples are:

- -

Many other Julia applications and libraries can be used with GPUs, too: By means of GPU-specific array types like CuArray from CUDA.jl or ROCArray from AMDGPU.jl, existing software that uses the Julia array interfaces can often be executed as-is on a GPU.

-

Publications

-

Much of Julia's GPU support was developed as part of academic research. If you would like to help support it, please star the relevant repositories as such metrics may help us secure funding in the future. If you use our software as part of your research, teaching, or other activities, we would be grateful if you could cite our work:

- -

Community

-

If you need help, or have questions about GPU programming in Julia, you can find members of the community at:

- - -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/learn/index.html b/previews/PR44/learn/index.html deleted file mode 100644 index 4314484..0000000 --- a/previews/PR44/learn/index.html +++ /dev/null @@ -1,210 +0,0 @@ - - - - - - - - - - - - - - - - - - - Learn ⋅ JuliaGPU - - - - - - -
-
- - - - -

Learn

-

Currently, the Julia CUDA stack is the most mature, easiest to install, and full-featured. The CUDA.jl documentation is a central place for information on all relevant packages. Start with the instructions on how to install the stack, and follow with this introductory tutorial.

-

If you prefer videos, the presentations below highlight different aspects of the toolchain.

-

Concurrent GPU computing in CUDA.jl 3.0

-

Introduction to concurrent GPU computing:

- -
- -
- -
-

Effective CUDA GPU computing in Julia

- -
- -
- -
- -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/libs/feather/feather.min.js b/previews/PR44/libs/feather/feather.min.js deleted file mode 100644 index af22fcd..0000000 --- a/previews/PR44/libs/feather/feather.min.js +++ /dev/null @@ -1,13 +0,0 @@ -!function(e,n){"object"==typeof exports&&"object"==typeof module?module.exports=n():"function"==typeof define&&define.amd?define([],n):"object"==typeof exports?exports.feather=n():e.feather=n()}("undefined"!=typeof self?self:this,function(){return function(e){var n={};function i(l){if(n[l])return n[l].exports;var t=n[l]={i:l,l:!1,exports:{}};return e[l].call(t.exports,t,t.exports,i),t.l=!0,t.exports}return i.m=e,i.c=n,i.d=function(e,n,l){i.o(e,n)||Object.defineProperty(e,n,{configurable:!1,enumerable:!0,get:l})},i.r=function(e){Object.defineProperty(e,"__esModule",{value:!0})},i.n=function(e){var n=e&&e.__esModule?function(){return e.default}:function(){return e};return i.d(n,"a",n),n},i.o=function(e,n){return Object.prototype.hasOwnProperty.call(e,n)},i.p="",i(i.s=61)}([function(e,n,i){var l=i(20)("wks"),t=i(11),r=i(1).Symbol,o="function"==typeof r;(e.exports=function(e){return l[e]||(l[e]=o&&r[e]||(o?r:t)("Symbol."+e))}).store=l},function(e,n){var i=e.exports="undefined"!=typeof window&&window.Math==Math?window:"undefined"!=typeof self&&self.Math==Math?self:Function("return this")();"number"==typeof __g&&(__g=i)},function(e,n){var i=e.exports={version:"2.5.6"};"number"==typeof __e&&(__e=i)},function(e,n){var i={}.hasOwnProperty;e.exports=function(e,n){return i.call(e,n)}},function(e,n,i){e.exports=!i(27)(function(){return 7!=Object.defineProperty({},"a",{get:function(){return 7}}).a})},function(e,n,i){var l=i(13);e.exports=function(e){if(!l(e))throw TypeError(e+" is not an object!");return e}},function(e,n,i){var l=i(5),t=i(56),r=i(55),o=Object.defineProperty;n.f=i(4)?Object.defineProperty:function(e,n,i){if(l(e),n=r(n,!0),l(i),t)try{return o(e,n,i)}catch(e){}if("get"in i||"set"in i)throw 
TypeError("Accessors not supported!");return"value"in i&&(e[n]=i.value),e}},function(e,n,i){var l=i(6),t=i(12);e.exports=i(4)?function(e,n,i){return l.f(e,n,t(1,i))}:function(e,n,i){return e[n]=i,e}},function(e,n,i){"use strict";Object.defineProperty(n,"__esModule",{value:!0});var l=o(i(35)),t=o(i(33)),r=o(i(32));function o(e){return e&&e.__esModule?e:{default:e}}n.default=Object.keys(t.default).map(function(e){return new l.default(e,t.default[e],r.default[e])}).reduce(function(e,n){return e[n.name]=n,e},{})},function(e,n,i){var l=i(20)("keys"),t=i(11);e.exports=function(e){return l[e]||(l[e]=t(e))}},function(e,n){e.exports={}},function(e,n){var i=0,l=Math.random();e.exports=function(e){return"Symbol(".concat(void 0===e?"":e,")_",(++i+l).toString(36))}},function(e,n){e.exports=function(e,n){return{enumerable:!(1&e),configurable:!(2&e),writable:!(4&e),value:n}}},function(e,n){e.exports=function(e){return"object"==typeof e?null!==e:"function"==typeof e}},function(e,n){e.exports=function(e){if(void 0==e)throw TypeError("Can't call method on "+e);return e}},function(e,n){var i=Math.ceil,l=Math.floor;e.exports=function(e){return isNaN(e=+e)?0:(e>0?l:i)(e)}},function(e,n,i){var l; -/*! - Copyright (c) 2016 Jed Watson. - Licensed under the MIT License (MIT), see - http://jedwatson.github.io/classnames -*/ -/*! - Copyright (c) 2016 Jed Watson. 
- Licensed under the MIT License (MIT), see - http://jedwatson.github.io/classnames -*/ -!function(){"use strict";var i=function(){function e(){}function n(e,n){for(var i=n.length,l=0;l0?t(l(e),9007199254740991):0}},function(e,n){var i={}.toString;e.exports=function(e){return i.call(e).slice(8,-1)}},function(e,n,i){var l=i(48),t=i(14);e.exports=function(e){return l(t(e))}},function(e,n,i){var l=i(54);e.exports=function(e,n,i){if(l(e),void 0===n)return e;switch(i){case 1:return function(i){return e.call(n,i)};case 2:return function(i,l){return e.call(n,i,l)};case 3:return function(i,l,t){return e.call(n,i,l,t)}}return function(){return e.apply(n,arguments)}}},function(e,n,i){var l=i(1),t=i(7),r=i(3),o=i(11)("src"),a=Function.toString,c=(""+a).split("toString");i(2).inspectSource=function(e){return a.call(e)},(e.exports=function(e,n,i,a){var y="function"==typeof i;y&&(r(i,"name")||t(i,"name",n)),e[n]!==i&&(y&&(r(i,o)||t(i,o,e[n]?""+e[n]:c.join(String(n)))),e===l?e[n]=i:a?e[n]?e[n]=i:t(e,n,i):(delete e[n],t(e,n,i)))})(Function.prototype,"toString",function(){return"function"==typeof this&&this[o]||a.call(this)})},function(e,n,i){var l=i(13),t=i(1).document,r=l(t)&&l(t.createElement);e.exports=function(e){return r?t.createElement(e):{}}},function(e,n){e.exports=function(e){try{return!!e()}catch(e){return!0}}},function(e,n,i){var l=i(1),t=i(2),r=i(7),o=i(25),a=i(24),c=function(e,n,i){var y,p,h,x,s=e&c.F,u=e&c.G,d=e&c.S,f=e&c.P,v=e&c.B,g=u?l:d?l[n]||(l[n]={}):(l[n]||{}).prototype,m=u?t:t[n]||(t[n]={}),M=m.prototype||(m.prototype={});for(y in u&&(i=n),i)h=((p=!s&&g&&void 0!==g[y])?g:i)[y],x=v&&p?a(h,l):f&&"function"==typeof h?a(Function.call,h):h,g&&o(g,y,h,e&c.U),m[y]!=h&&r(m,y,x),f&&M[y]!=h&&(M[y]=h)};l.core=t,c.F=1,c.G=2,c.S=4,c.P=8,c.B=16,c.W=32,c.U=64,c.R=128,e.exports=c},function(e,n){e.exports=!1},function(e,n,i){"use strict";Object.defineProperty(n,"__esModule",{value:!0});var l=Object.assign||function(e){for(var n=1;n0&&void 
0!==arguments[0]?arguments[0]:{};if("undefined"==typeof document)throw new Error("`feather.replace()` only works in a browser environment.");var n=document.querySelectorAll("[data-feather]");Array.from(n).forEach(function(n){return function(e){var n=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{},i=function(e){return Array.from(e.attributes).reduce(function(e,n){return e[n.name]=n.value,e},{})}(e),o=i["data-feather"];delete i["data-feather"];var a=r.default[o].toSvg(l({},n,i,{class:(0,t.default)(n.class,i.class)})),c=(new DOMParser).parseFromString(a,"image/svg+xml").querySelector("svg");e.parentNode.replaceChild(c,e)}(n,e)})}},function(e,n,i){"use strict";Object.defineProperty(n,"__esModule",{value:!0});var l,t=i(8),r=(l=t)&&l.__esModule?l:{default:l};n.default=function(e){var n=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{};if(console.warn("feather.toSvg() is deprecated. Please use feather.icons[name].toSvg() instead."),!e)throw new Error("The required `key` (icon name) parameter is missing.");if(!r.default[e])throw new Error("No icon matching '"+e+"'. 
See the complete list of icons at https://feathericons.com");return r.default[e].toSvg(n)}},function(e){e.exports={activity:["pulse","health","action","motion"],airplay:["stream","cast","mirroring"],"alert-circle":["warning"],"alert-octagon":["warning"],"alert-triangle":["warning"],"at-sign":["mention"],award:["achievement","badge"],aperture:["camera","photo"],bell:["alarm","notification"],"bell-off":["alarm","notification","silent"],bluetooth:["wireless"],"book-open":["read"],book:["read","dictionary","booklet","magazine"],bookmark:["read","clip","marker","tag"],briefcase:["work","bag","baggage","folder"],clipboard:["copy"],clock:["time","watch","alarm"],"cloud-drizzle":["weather","shower"],"cloud-lightning":["weather","bolt"],"cloud-rain":["weather"],"cloud-snow":["weather","blizzard"],cloud:["weather"],codepen:["logo"],coffee:["drink","cup","mug","tea","cafe","hot","beverage"],command:["keyboard","cmd"],compass:["navigation","safari","travel"],copy:["clone","duplicate"],"corner-down-left":["arrow"],"corner-down-right":["arrow"],"corner-left-down":["arrow"],"corner-left-up":["arrow"],"corner-right-down":["arrow"],"corner-right-up":["arrow"],"corner-up-left":["arrow"],"corner-up-right":["arrow"],"credit-card":["purchase","payment","cc"],crop:["photo","image"],crosshair:["aim","target"],database:["storage"],delete:["remove"],disc:["album","cd","dvd","music"],"dollar-sign":["currency","money","payment"],droplet:["water"],edit:["pencil","change"],"edit-2":["pencil","change"],"edit-3":["pencil","change"],eye:["view","watch"],"eye-off":["view","watch"],"external-link":["outbound"],facebook:["logo"],"fast-forward":["music"],film:["movie","video"],"folder-minus":["directory"],"folder-plus":["directory"],folder:["directory"],frown:["emoji","face","bad","sad","emotion"],gift:["present","box","birthday","party"],"git-branch":["code","version control"],"git-commit":["code","version control"],"git-merge":["code","version control"],"git-pull-request":["code","version 
control"],github:["logo","version control"],gitlab:["logo","version control"],global:["world","browser","language","translate"],"hard-drive":["computer","server"],hash:["hashtag","number","pound"],headphones:["music","audio"],heart:["like","love"],"help-circle":["question mark"],home:["house"],image:["picture"],inbox:["email"],instagram:["logo","camera"],"life-bouy":["help","life ring","support"],linkedin:["logo"],lock:["security","password"],"log-in":["sign in","arrow"],"log-out":["sign out","arrow"],mail:["email"],"map-pin":["location","navigation","travel","marker"],map:["location","navigation","travel"],maximize:["fullscreen"],"maximize-2":["fullscreen","arrows"],meh:["emoji","face","neutral","emotion"],menu:["bars","navigation","hamburger"],"message-circle":["comment","chat"],"message-square":["comment","chat"],"mic-off":["record"],mic:["record"],minimize:["exit fullscreen"],"minimize-2":["exit fullscreen","arrows"],monitor:["tv"],moon:["dark","night"],"more-horizontal":["ellipsis"],"more-vertical":["ellipsis"],move:["arrows"],navigation:["location","travel"],"navigation-2":["location","travel"],octagon:["stop"],package:["box"],paperclip:["attachment"],pause:["music","stop"],"pause-circle":["music","stop"],play:["music","start"],"play-circle":["music","start"],plus:["add","new"],"plus-circle":["add","new"],"plus-square":["add","new"],pocket:["logo","save"],power:["on","off"],radio:["signal"],rewind:["music"],rss:["feed","subscribe"],save:["floppy disk"],send:["message","mail","paper 
airplane"],settings:["cog","edit","gear","preferences"],shield:["security"],"shield-off":["security"],"shopping-bag":["ecommerce","cart","purchase","store"],"shopping-cart":["ecommerce","cart","purchase","store"],shuffle:["music"],"skip-back":["music"],"skip-forward":["music"],slash:["ban","no"],sliders:["settings","controls"],smile:["emoji","face","happy","good","emotion"],speaker:["music"],star:["bookmark","favorite","like"],sun:["brightness","weather","light"],sunrise:["weather"],sunset:["weather"],tag:["label"],target:["bullseye"],terminal:["code","command line"],"thumbs-down":["dislike","bad"],"thumbs-up":["like","good"],"toggle-left":["on","off","switch"],"toggle-right":["on","off","switch"],trash:["garbage","delete","remove"],"trash-2":["garbage","delete","remove"],triangle:["delta"],truck:["delivery","van","shipping"],twitter:["logo"],umbrella:["rain","weather"],"video-off":["camera","movie","film"],video:["camera","movie","film"],voicemail:["phone"],volume:["music","sound","mute"],"volume-1":["music","sound"],"volume-2":["music","sound"],"volume-x":["music","sound","mute"],watch:["clock","time"],wind:["weather","air"],"x-circle":["cancel","close","delete","remove","times"],"x-square":["cancel","close","delete","remove","times"],x:["cancel","close","delete","remove","times"],youtube:["logo","video","play"],"zap-off":["flash","camera","lightning"],zap:["flash","camera","lightning"]}},function(e){e.exports={activity:'',airplay:'',"alert-circle":'',"alert-octagon":'',"alert-triangle":'',"align-center":'',"align-justify":'',"align-left":'',"align-right":'',anchor:'',aperture:'',archive:'',"arrow-down-circle":'',"arrow-down-left":'',"arrow-down-right":'',"arrow-down":'',"arrow-left-circle":'',"arrow-left":'',"arrow-right-circle":'',"arrow-right":'',"arrow-up-circle":'',"arrow-up-left":'',"arrow-up-right":'',"arrow-up":'',"at-sign":'',award:'',"bar-chart-2":'',"bar-chart":'',"battery-charging":'',battery:'',"bell-off":'',bell:'',bluetooth:'',bold:'',"book-open":''
,book:'',bookmark:'',box:'',briefcase:'',calendar:'',"camera-off":'',camera:'',cast:'',"check-circle":'',"check-square":'',check:'',"chevron-down":'',"chevron-left":'',"chevron-right":'',"chevron-up":'',"chevrons-down":'',"chevrons-left":'',"chevrons-right":'',"chevrons-up":'',chrome:'',circle:'',clipboard:'',clock:'',"cloud-drizzle":'',"cloud-lightning":'',"cloud-off":'',"cloud-rain":'',"cloud-snow":'',cloud:'',code:'',codepen:'',coffee:'',command:'',compass:'',copy:'',"corner-down-left":'',"corner-down-right":'',"corner-left-down":'',"corner-left-up":'',"corner-right-down":'',"corner-right-up":'',"corner-up-left":'',"corner-up-right":'',cpu:'',"credit-card":'',crop:'',crosshair:'',database:'',delete:'',disc:'',"dollar-sign":'',"download-cloud":'',download:'',droplet:'',"edit-2":'',"edit-3":'',edit:'',"external-link":'',"eye-off":'',eye:'',facebook:'',"fast-forward":'',feather:'',"file-minus":'',"file-plus":'',"file-text":'',file:'',film:'',filter:'',flag:'',"folder-minus":'',"folder-plus":'',folder:'',frown:'',gift:'',"git-branch":'',"git-commit":'',"git-merge":'',"git-pull-request":'',github:'',gitlab:'',globe:'',grid:'',"hard-drive":'',hash:'',headphones:'',heart:'',"help-circle":'',home:'',image:'',inbox:'',info:'',instagram:'',italic:'',layers:'',layout:'',"life-buoy":'',"link-2":'',link:'',linkedin:'',list:'',loader:'',lock:'',"log-in":'',"log-out":'',mail:'',"map-pin":'',map:'',"maximize-2":'',maximize:'',meh:'',menu:'',"message-circle":'',"message-square":'',"mic-off":'',mic:'',"minimize-2":'',minimize:'',"minus-circle":'',"minus-square":'',minus:'',monitor:'',moon:'',"more-horizontal":'',"more-vertical":'',move:'',music:'',"navigation-2":'',navigation:'',octagon:'',package:'',paperclip:'',"pause-circle":'',pause:'',percent:'',"phone-call":'',"phone-forwarded":'',"phone-incoming":'',"phone-missed":'',"phone-off":'',"phone-outgoing":'',phone:'',"pie-chart":'',"play-circle":'',play:'',"plus-circle":'',"plus-square":'',plus:'',pocket:'',power:'',printer:'',rad
io:'',"refresh-ccw":'',"refresh-cw":'',repeat:'',rewind:'',"rotate-ccw":'',"rotate-cw":'',rss:'',save:'',scissors:'',search:'',send:'',server:'',settings:'',"share-2":'',share:'',"shield-off":'',shield:'',"shopping-bag":'',"shopping-cart":'',shuffle:'',sidebar:'',"skip-back":'',"skip-forward":'',slack:'',slash:'',sliders:'',smartphone:'',smile:'',speaker:'',square:'',star:'',"stop-circle":'',sun:'',sunrise:'',sunset:'',tablet:'',tag:'',target:'',terminal:'',thermometer:'',"thumbs-down":'',"thumbs-up":'',"toggle-left":'',"toggle-right":'',"trash-2":'',trash:'',trello:'',"trending-down":'',"trending-up":'',triangle:'',truck:'',tv:'',twitter:'',type:'',umbrella:'',underline:'',unlock:'',"upload-cloud":'',upload:'',"user-check":'',"user-minus":'',"user-plus":'',"user-x":'',user:'',users:'',"video-off":'',video:'',voicemail:'',"volume-1":'',"volume-2":'',"volume-x":'',volume:'',watch:'',"wifi-off":'',wifi:'',wind:'',"x-circle":'',"x-square":'',x:'',youtube:'',"zap-off":'',zap:'',"zoom-in":'',"zoom-out":''}},function(e){e.exports={xmlns:"http://www.w3.org/2000/svg",width:24,height:24,viewBox:"0 0 24 24",fill:"none",stroke:"currentColor","stroke-width":2,"stroke-linecap":"round","stroke-linejoin":"round"}},function(e,n,i){"use strict";Object.defineProperty(n,"__esModule",{value:!0});var l=Object.assign||function(e){for(var n=1;n2&&void 0!==arguments[2]?arguments[2]:[];!function(e,n){if(!(e instanceof n))throw new TypeError("Cannot call a class as a function")}(this,e),this.name=n,this.contents=i,this.tags=t,this.attrs=l({},o.default,{class:"feather feather-"+n})}return t(e,[{key:"toSvg",value:function(){var e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};return""+this.contents+""}},{key:"toString",value:function(){return this.contents}}]),e}();n.default=c},function(e,n,i){"use strict";var l=o(i(8)),t=o(i(31)),r=o(i(30));function o(e){return e&&e.__esModule?e:{default:e}}e.exports={icons:l.default,toSvg:t.default,replace:r.default}},function(e,n,i){var 
l=i(0)("iterator"),t=!1;try{var r=[7][l]();r.return=function(){t=!0},Array.from(r,function(){throw 2})}catch(e){}e.exports=function(e,n){if(!n&&!t)return!1;var i=!1;try{var r=[7],o=r[l]();o.next=function(){return{done:i=!0}},r[l]=function(){return o},e(r)}catch(e){}return i}},function(e,n,i){var l=i(22),t=i(0)("toStringTag"),r="Arguments"==l(function(){return arguments}());e.exports=function(e){var n,i,o;return void 0===e?"Undefined":null===e?"Null":"string"==typeof(i=function(e,n){try{return e[n]}catch(e){}}(n=Object(e),t))?i:r?l(n):"Object"==(o=l(n))&&"function"==typeof n.callee?"Arguments":o}},function(e,n,i){var l=i(38),t=i(0)("iterator"),r=i(10);e.exports=i(2).getIteratorMethod=function(e){if(void 0!=e)return e[t]||e["@@iterator"]||r[l(e)]}},function(e,n,i){"use strict";var l=i(6),t=i(12);e.exports=function(e,n,i){n in e?l.f(e,n,t(0,i)):e[n]=i}},function(e,n,i){var l=i(10),t=i(0)("iterator"),r=Array.prototype;e.exports=function(e){return void 0!==e&&(l.Array===e||r[t]===e)}},function(e,n,i){var l=i(5);e.exports=function(e,n,i,t){try{return t?n(l(i)[0],i[1]):n(i)}catch(n){var r=e.return;throw void 0!==r&&l(r.call(e)),n}}},function(e,n,i){"use strict";var l=i(24),t=i(28),r=i(17),o=i(42),a=i(41),c=i(21),y=i(40),p=i(39);t(t.S+t.F*!i(37)(function(e){Array.from(e)}),"Array",{from:function(e){var n,i,t,h,x=r(e),s="function"==typeof this?this:Array,u=arguments.length,d=u>1?arguments[1]:void 0,f=void 0!==d,v=0,g=p(x);if(f&&(d=l(d,u>2?arguments[2]:void 0,2)),void 0==g||s==Array&&a(g))for(i=new s(n=c(x.length));n>v;v++)y(i,v,f?d(x[v],v):x[v]);else for(h=g.call(x),i=new s;!(t=h.next()).done;v++)y(i,v,f?o(h,d,[t.value,v],!0):t.value);return i.length=v,i}})},function(e,n,i){var l=i(3),t=i(17),r=i(9)("IE_PROTO"),o=Object.prototype;e.exports=Object.getPrototypeOf||function(e){return e=t(e),l(e,r)?e[r]:"function"==typeof e.constructor&&e instanceof e.constructor?e.constructor.prototype:e instanceof Object?o:null}},function(e,n,i){var 
l=i(1).document;e.exports=l&&l.documentElement},function(e,n,i){var l=i(15),t=Math.max,r=Math.min;e.exports=function(e,n){return(e=l(e))<0?t(e+n,0):r(e,n)}},function(e,n,i){var l=i(23),t=i(21),r=i(46);e.exports=function(e){return function(n,i,o){var a,c=l(n),y=t(c.length),p=r(o,y);if(e&&i!=i){for(;y>p;)if((a=c[p++])!=a)return!0}else for(;y>p;p++)if((e||p in c)&&c[p]===i)return e||p||0;return!e&&-1}}},function(e,n,i){var l=i(22);e.exports=Object("z").propertyIsEnumerable(0)?Object:function(e){return"String"==l(e)?e.split(""):Object(e)}},function(e,n,i){var l=i(3),t=i(23),r=i(47)(!1),o=i(9)("IE_PROTO");e.exports=function(e,n){var i,a=t(e),c=0,y=[];for(i in a)i!=o&&l(a,i)&&y.push(i);for(;n.length>c;)l(a,i=n[c++])&&(~r(y,i)||y.push(i));return y}},function(e,n,i){var l=i(49),t=i(19);e.exports=Object.keys||function(e){return l(e,t)}},function(e,n,i){var l=i(6),t=i(5),r=i(50);e.exports=i(4)?Object.defineProperties:function(e,n){t(e);for(var i,o=r(n),a=o.length,c=0;a>c;)l.f(e,i=o[c++],n[i]);return e}},function(e,n,i){var l=i(5),t=i(51),r=i(19),o=i(9)("IE_PROTO"),a=function(){},c=function(){var e,n=i(26)("iframe"),l=r.length;for(n.style.display="none",i(45).appendChild(n),n.src="javascript:",(e=n.contentWindow.document).open(),e.write(" - - - - - - - - diff --git a/previews/PR44/oneapi/index.html b/previews/PR44/oneapi/index.html deleted file mode 100644 index e141201..0000000 --- a/previews/PR44/oneapi/index.html +++ /dev/null @@ -1,195 +0,0 @@ - - - - - - - - - - - - - - - - - - - Intel oneAPI ⋅ JuliaGPU - - - - - - -
-
- - - - -

Intel oneAPI

- -

- - - -

- -

oneAPI is an open standard for programming hardware accelerators, originally designed by Intel. The oneAPI.jl package offers a Julia interface to this programming model. The package is in early development, but already provides most features for application development.

-

Similarly to other GPU support packages in Julia, oneAPI.jl makes it possible to work with accelerators at three distinct abstraction levels:

-
    -
  • high-level, using the oneArray array type and Julia's powerful array abstractions;

    -
  • -
  • by writing your own kernels and launching them using the @oneapi macro;

    -
  • -
  • using the low-level Level Zero wrappers in the oneL0 submodule.

    -
  • -
-

For more information, refer to the following blog posts:

- - -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/other/index.html b/previews/PR44/other/index.html deleted file mode 100644 index 6ee4159..0000000 --- a/previews/PR44/other/index.html +++ /dev/null @@ -1,174 +0,0 @@ - - - - - - - - - - - - - - - - - - - Other ⋅ JuliaGPU - - - - - - -
-
- - - - -

Other

-

Several other back-ends exist, not all of them with the same level of polish or support as the NVIDIA and AMD back-ends.

-

OpenCL

-

Programming OpenCL GPUs in Julia is much more limited than other supported platforms. On recent versions of Julia, only OpenCL.jl is available. This package can be used to compile and execute GPU kernels written in OpenCL C.

-

ArrayFire

-

ArrayFire is a general-purpose software library that targets CPUs, GPUs, and other accelerator hardware. The ArrayFire.jl package provides a Julia interface to this library, and makes it possible to program accelerators using an array abstraction built on the ArrayFire library.

-

SX-Aurora

-

The NEC SX-Aurora Tsubasa is a PCIe card which works as a Vector Computer. It can be programmed from Julia using the VectorEngine.jl package, which at the moment requires a custom Julia build using an LLVM fork. Support is expected to improve due to NEC's involvement.

- -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2019-12-12-new_site/index.html b/previews/PR44/post/2019-12-12-new_site/index.html deleted file mode 100644 index 0f2c8b9..0000000 --- a/previews/PR44/post/2019-12-12-new_site/index.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - - - - - - - - - - - - - New website for JuliaGPU ⋅ JuliaGPU - - - - - - -
-
- - - - - -

New website for JuliaGPU

- -
- -Tim Besard - - -

- - - -

This post is located at /new_site/

-

Welcome to the new landing page for the JuliaGPU organization. This website serves as an introduction to the several packages for programming GPUs in Julia, with pointers to relevant resources for new users.

-

The sources for this website are hosted at GitHub and generated using Hugo; feel free to open an issue or pull request if you think it could be improved.

- -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2020-03-25-cudanative_3.0-cuarrays_2.0/index.html b/previews/PR44/post/2020-03-25-cudanative_3.0-cuarrays_2.0/index.html deleted file mode 100644 index 0d0ebba..0000000 --- a/previews/PR44/post/2020-03-25-cudanative_3.0-cuarrays_2.0/index.html +++ /dev/null @@ -1,238 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDAnative.jl 3.0 and CuArrays.jl 2.0 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDAnative.jl 3.0 and CuArrays.jl 2.0

- -
- -Tim Besard - - -

- - - -

This post is located at /cudanative_3.0-cuarrays_2.0/

-

This release of the Julia CUDA stack contains some exciting new features: automatic installation of CUDA using artifacts, full support for GPU method redefinitions, and experimental support for multitasking and multithreading. The release is technically breaking, but most end-users should not be affected.

-

API changes

-

Changes to certain APIs require these releases to be breaking, however, most users should not be affected and chances are you can just bump your Compat entries without any additional changes. Flux.jl users will have to wait a little longer though, as the package uses non-public APIs that have changed and requires an update.

-

Artifacts

-

CUDA and its dependencies will now be automatically installed using artifacts generated by BinaryBuilder.jl. This greatly improves usability, and only requires a functioning NVIDIA driver:

-
julia> ENV["JULIA_DEBUG"] = "CUDAnative"
-
-julia> using CUDAnative
-
-julia> CUDAnative.version()
-┌ Debug: Trying to use artifacts...
-└ @ CUDAnative CUDAnative/src/bindeps.jl:52
-┌ Debug: Using CUDA 10.2.89 from an artifact at /depot/artifacts/...
-└ @ CUDAnative CUDAnative/src/bindeps.jl:108
-v"10.2.89"
-

Use of a local installation is still possible by setting the environment variable JULIA_CUDA_USE_BINARYBUILDER to false. For more details, refer to the documentation.

-

Relevant PRs: CUDAnative.jl#492 and CuArrays.jl#490

-

Method redefinitions

-

CUDAnative 3.0 now fully supports method redefinitions, commonly referred to as Julia issue #265, and makes it possible to use interactive programming tools like Revise.jl:

-
julia> child() = 0
-julia> parent() = (@cuprintln(child()); return)
-julia> @cuda parent()
-0
-
-julia> parent() = (@cuprintln(child() + 1); return)
-julia> @cuda parent()
-1
-
-
-julia> child() = 1
-julia> @cuda parent()
-2
-

Relevant PRs: CUDAnative.jl#581

-

Experimental: Multitasking and multithreading

-

With CUDAnative 3.0 and CuArrays 2.0 you can now use Julia tasks and threads to organize your code. In combination with CUDA streams, this makes it possible to execute kernels and other GPU operations in parallel:

-
@sync begin
-    function my_expensive_kernel()
-        return
-    end
-    @async @cuda stream=CuStream() my_expensive_kernel()
-    @async @cuda stream=CuStream() my_expensive_kernel()
-end
-

Every task, whether it runs on a separate thread or not, can work with a different device, as well as independently work with CUDA libraries like CUBLAS and CUFFT.

-

Note that this support is experimental, and lacks certain features to be fully effective. For one, the CuArrays memory allocator is not device-aware, and it is currently not possible to configure the CUDA stream for operations like map or broadcast.

-

Relevant PRs: CUDAnative.jl#609 and CuArrays.jl#645

-

Minor changes

-

GPU kernels are now name-mangled like C++, which offers better integration with NVIDIA tools (CUDAnative.jl#559).

-

A better N-dimensional mapreducedim! kernel, properly integrating with all Base interfaces (CuArrays.jl#602 and GPUArrays#246).

-

A CuIterator type for batching arrays to the GPU (by @jrevels, CuArrays.jl#467).

-

Integration with Base's 5-arg mul! (by @haampie, CuArrays.jl#641 and GPUArrays#253).

-

Integration with Cthulhu.jl for interactive inspection of generated code (CUDAnative.jl#597).

-

Known issues

-

With a release as big as this one there's bound to be some bugs, e.g., with the installation of artifacts on exotic systems, or due to the many changes to make the libraries thread-safe. If you need absolute stability, please wait for a point release.

-

There are also some known issues. CUDAnative is currently not compatible with Julia 1.5 due to Base compiler changes (julia#34993), the new mapreducedim! kernel appears to be slower in some cases (CuArrays.jl#611), and there are some remaining thread-safety issues when using the non-default memory pool (CuArrays.jl#647).

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2020-07-07-cuda_1.1/imgwarp.png b/previews/PR44/post/2020-07-07-cuda_1.1/imgwarp.png deleted file mode 100644 index 8879e2d..0000000 Binary files a/previews/PR44/post/2020-07-07-cuda_1.1/imgwarp.png and /dev/null differ diff --git a/previews/PR44/post/2020-07-07-cuda_1.1/index.html b/previews/PR44/post/2020-07-07-cuda_1.1/index.html deleted file mode 100644 index 83f653b..0000000 --- a/previews/PR44/post/2020-07-07-cuda_1.1/index.html +++ /dev/null @@ -1,297 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 1.1 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 1.1

- -
- -Tim Besard - - -

- - - -

CUDA.jl 1.1 marks the first feature release after merging several CUDA packages into one. It raises the minimal Julia version to 1.4, and comes with support for the impending 1.5 release.

-

CUDA.jl replacing CuArrays/CUDAnative.jl

-

As announced a while back, CUDA.jl is now the new package for programming CUDA GPUs in Julia, replacing CuArrays.jl, CUDAnative.jl, CUDAdrv.jl and CUDAapi.jl. The merged package should be a drop-in replacement: All existing functionality has been ported, and almost all exported functions are still there. Applications like Flux.jl or the DiffEq.jl stack are being updated to support this change.

-

CUDA 11 support

-

With CUDA.jl 1.1, we support the upcoming release of the CUDA toolkit. This only applies to locally-installed versions of the toolkit, i.e., you need to specify JULIA_CUDA_USE_BINARYBUILDER=false in your environment to pick up the locally-installed release candidate of the CUDA toolkit. New features, like the third-generation tensor cores and its extended type support, or any new APIs, are not yet natively supported by Julia code.

-

NVIDIA Management Library (NVML)

-

CUDA.jl now integrates with the NVIDIA Management Library, or NVML. With this library, it's possible to query information about the system, any GPU devices, their topology, etc.:

-
julia> using CUDA
-
-julia> dev = first(NVML.devices())
-CUDA.NVML.Device(Ptr{Nothing} @0x00007f987c7c6e38)
-
-julia> NVML.uuid(dev)
-UUID("b8d5e790-ea4d-f962-e0c3-0448f69f2e23")
-
-julia> NVML.name(dev)
-"Quadro RTX 5000"
-
-julia> NVML.power_usage(dev)
-37.863
-
-julia> NVML.energy_consumption(dev)
-65330.292
-

Experimental: Texture support

-

It is now also possible to use the GPU's hardware texture support from Julia, albeit using a fairly low-level and still experimental API (many thanks to @cdsousa for the initial development). As a demo, let's start with loading a sample image:

-
julia> using Images, TestImages, ColorTypes, FixedPointNumbers
-julia> img = RGBA{N0f8}.(testimage("lighthouse"))
-

We use RGBA since CUDA's texture hardware only supports 1, 2 or 4 channels. This support is also currently limited to "plain" types, so let's reinterpret the image:

-
julia> img′ = reinterpret(NTuple{4,UInt8}, img)
-

Now we can upload this image to the array, using the CuTextureArray type for optimized storage (normal CuArrays are supported too), and bind it to a CuTexture object that we can pass to a kernel:

-
julia> texturearray = CuTextureArray(img′)
-
-julia> texture = CuTexture(texturearray; normalized_coordinates=true)
-512×768 4-channel CuTexture(::CuTextureArray) with eltype NTuple{4,UInt8}
-

Let's write a kernel that warps this image. Since we specified normalized_coordinates=true, we index the texture using values in [0,1]:

-
function warp(dst, texture)
-    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    I = CartesianIndices(dst)
-    @inbounds if tid <= length(I)
-        i,j = Tuple(I[tid])
-        u = Float32(i-1) / Float32(size(dst, 1)-1)
-        v = Float32(j-1) / Float32(size(dst, 2)-1)
-        x = u + 0.02f0 * CUDA.sin(30v)
-        y = v + 0.03f0 * CUDA.sin(20u)
-        dst[i,j] = texture[x,y]
-    end
-    return
-end
-

The size of the output image determines how many elements we need to process. This needs to be translated to a number of threads and blocks, keeping in mind device and kernel characteristics. We automate this using the occupancy API:

-
julia> outimg_d = CuArray{eltype(img′)}(undef, 500, 1000);
-
-julia> function configurator(kernel)
-           config = launch_configuration(kernel.fun)
-
-           threads = Base.min(length(outimg_d), config.threads)
-           blocks = cld(length(outimg_d), threads)
-
-           return (threads=threads, blocks=blocks)
-       end
-
-julia> @cuda config=configurator warp(outimg_d, texture)
-

Finally, we fetch and visualize the output:

-
julia> outimg = Array(outimg_d)
-
-julia> save("imgwarp.png", reinterpret(eltype(img), outimg))
-
- Warped lighthouse -
- -

Minor features

-

The test-suite is now parallelized, using up to JULIA_NUM_THREADS processes:

-
$ JULIA_NUM_THREADS=4 julia -e 'using Pkg; Pkg.test("CUDA");'
-
-                                     |          | ---------------- GPU ---------------- | ---------------- CPU ---------------- |
-Test                        (Worker) | Time (s) | GC (s) | GC % | Alloc (MB) | RSS (MB) | GC (s) | GC % | Alloc (MB) | RSS (MB) |
-initialization                   (2) |     2.52 |   0.00 |  0.0 |       0.00 |   115.00 |   0.05 |  1.8 |     153.13 |   546.27 |
-apiutils                         (4) |     0.55 |   0.00 |  0.0 |       0.00 |   115.00 |   0.02 |  4.0 |      75.86 |   522.36 |
-codegen                          (4) |    14.81 |   0.36 |  2.5 |       0.00 |   157.00 |   0.62 |  4.2 |    1592.28 |   675.15 |
-...
-gpuarrays/mapreduce essentials   (2) |   113.52 |   0.01 |  0.0 |       3.19 |   641.00 |   2.61 |  2.3 |    8232.84 |  2449.35 |
-gpuarrays/mapreduce (old tests)  (5) |   138.35 |   0.01 |  0.0 |     130.20 |   507.00 |   2.94 |  2.1 |    8615.15 |  2353.62 |
-gpuarrays/mapreduce derivatives  (3) |   180.52 |   0.01 |  0.0 |       3.06 |   229.00 |   3.44 |  1.9 |   12262.67 |  1403.39 |
-
-Test Summary: |  Pass  Broken  Total
-  Overall     | 11213       3  11216
-    SUCCESS
-    Testing CUDA tests passed
-

A copy of Base.versioninfo() is available to report on the CUDA toolchain and any devices:

-
julia> CUDA.versioninfo()
-CUDA toolkit 10.2.89, artifact installation
-CUDA driver 11.0.0
-NVIDIA driver 450.36.6
-
-Libraries:
-- CUBLAS: 10.2.2
-- CURAND: 10.1.2
-- CUFFT: 10.1.2
-- CUSOLVER: 10.3.0
-- CUSPARSE: 10.3.1
-- CUPTI: 12.0.0
-- NVML: 11.0.0+450.36.6
-- CUDNN: 7.6.5 (for CUDA 10.2.0)
-- CUTENSOR: 1.1.0 (for CUDA 10.2.0)
-
-Toolchain:
-- Julia: 1.5.0-rc1.0
-- LLVM: 9.0.1
-- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4
-- Device support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75
-
-1 device(s):
-- Quadro RTX 5000 (sm_75, 14.479 GiB / 15.744 GiB available)
-

CUTENSOR artifacts have been upgraded to version 1.1.0.

-

Benchmarking infrastructure based on the Codespeed project has been set up at speed.juliagpu.org to keep track of the performance of various operations.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2020-07-18-cuda_1.3/index.html b/previews/PR44/post/2020-07-18-cuda_1.3/index.html deleted file mode 100644 index b45c6c1..0000000 --- a/previews/PR44/post/2020-07-18-cuda_1.3/index.html +++ /dev/null @@ -1,268 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 1.3 - Multi-device programming ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 1.3 - Multi-device programming

- -
- -Tim Besard - - -

- - - -

Today we're releasing CUDA.jl 1.3, with several new features. The most prominent change is support for multiple GPUs within a single process.

-

Multi-GPU programming

-

With CUDA.jl 1.3, you can finally use multiple CUDA GPUs within a single process. To switch devices you can call device!, query the current device with device(), or reset it using device_reset!():

-
julia> collect(devices())
-9-element Array{CuDevice,1}:
- CuDevice(0): Tesla V100-PCIE-32GB
- CuDevice(1): Tesla V100-PCIE-32GB
- CuDevice(2): Tesla V100-PCIE-32GB
- CuDevice(3): Tesla V100-PCIE-32GB
- CuDevice(4): Tesla V100-PCIE-16GB
- CuDevice(5): Tesla P100-PCIE-16GB
- CuDevice(6): Tesla P100-PCIE-16GB
- CuDevice(7): GeForce GTX 1080 Ti
- CuDevice(8): GeForce GTX 1080 Ti
-
-julia> device!(5)
-
-julia> device()
-CuDevice(5): Tesla P100-PCIE-16GB
-

Let's define a kernel to show this really works:

-
julia> function kernel()
-           dev = Ref{Cint}()
-           CUDA.cudaGetDevice(dev)
-           @cuprintln("Running on device $(dev[])")
-           return
-       end
-
-julia> @cuda kernel()
-Running on device 5
-
-julia> device!(0)
-
-julia> device()
-CuDevice(0): Tesla V100-PCIE-32GB
-
-julia> @cuda kernel()
-Running on device 0
-

Memory allocations, like CuArrays, are implicitly bound to the device they were allocated on. That means you should take care to only use an array when the owning device is active, or you will run into errors:

-
julia> device()
-CuDevice(0): Tesla V100-PCIE-32GB
-
-julia> a = CUDA.rand(1)
-1-element CuArray{Float32,1}:
- 0.6322775
-
-julia> device!(1)
-
-julia> a
-ERROR: CUDA error: an illegal memory access was encountered
-

Future improvements might make the array type device-aware.

-

Multitasking and multithreading

-

Dovetailing with the support for multiple GPUs is the ability to use these GPUs on separate Julia tasks and threads:

-
julia> device!(0)
-
-julia> @sync begin
-         @async begin
-           device!(1)
-           println("Working with $(device()) on $(current_task())")
-           yield()
-           println("Back to device $(device()) on $(current_task())")
-         end
-         @async begin
-           device!(2)
-           println("Working with $(device()) on $(current_task())")
-         end
-       end
-Working with CuDevice(1) on Task @0x00007fc9e6a48010
-Working with CuDevice(2) on Task @0x00007fc9e6a484f0
-Back to device CuDevice(1) on Task @0x00007fc9e6a48010
-
-julia> device()
-CuDevice(0): Tesla V100-PCIE-32GB
-

Each task has its own local GPU state, such as the device it was bound to, handles to libraries like CUBLAS or CUDNN (which means that each task can configure libraries independently), etc.

-

Minor features

-

CUDA.jl 1.3 also features some minor changes:

-
    -
  • Reinstated compatibility with Julia 1.3

    -
  • -
  • Support for CUDA 11.0 Update 1

    -
  • -
  • Support for CUDNN 8.0.2

    -
  • -
-

Known issues

-

Several operations on sparse arrays have been broken since CUDA.jl 1.2, due to the deprecations that were part of CUDA 11. The next version of CUDA.jl will drop support for CUDA 10.0 or older, which will make it possible to use new cuSPARSE APIs and add back missing functionality.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2020-09-28-gemmkernels/index.html b/previews/PR44/post/2020-09-28-gemmkernels/index.html deleted file mode 100644 index fd78bf8..0000000 --- a/previews/PR44/post/2020-09-28-gemmkernels/index.html +++ /dev/null @@ -1,206 +0,0 @@ - - - - - - - - - - - - - - - - - - - - Paper: Flexible Performant GEMM Kernels on GPUs ⋅ JuliaGPU - - - - - - -
-
- - - - - -

Paper: Flexible Performant GEMM Kernels on GPUs

- -
- -Thomas Faingnaert, Tim Besard, Bjorn De Sutter - - -

- - - -

General Matrix Multiplication or GEMM kernels take center place in high performance computing and machine learning. Recent NVIDIA GPUs include GEMM accelerators, such as NVIDIA's Tensor Cores. In this paper we show how it is possible to program these accelerators from Julia, and present abstractions and interfaces that allow to do so efficiently without sacrificing performance.

-

A pre-print of the paper has been published on arXiv: arXiv:2009.12263.
The source code can be found on GitHub: thomasfaingnaert/GemmKernels.jl.

-

With the APIs from GemmKernels.jl, it is possible to instantiate GEMM kernels that perform in the same ballpark as, and sometimes even outperform, state-of-the-art libraries like CUBLAS and CUTLASS. For example, performing a mixed-precision multiplication of two 16-bit matrices into a 32-bit accumulator (on different combinations of layouts):

-
- Performance of mixed-precision GEMM -
- -

The APIs are also highly flexible and allow customization of each step, e.g., to apply the activation function max(x, 0) for implementing a rectified linear unit (ReLU):

-
a = CuArray(rand(Float16, (M, K)))
-b = CuArray(rand(Float16, (K, N)))
-c = CuArray(rand(Float32, (M, N)))
-d = similar(c)
-
-conf = GemmKernels.get_config(
-    gemm_shape = (M = M, N = N, K = K),
-    operator = Operator.WMMAOp{16, 16, 16},
-    global_a_layout = Layout.AlignedColMajor{Float16},
-    global_c_layout = Layout.AlignedColMajor{Float32})
-
-GemmKernels.matmul(
-    a, b, c, d, conf;
-    transform_regs_to_shared_d = Transform.Elementwise(x -> max(x, 0)))
-

The GemmKernels.jl framework is written entirely in Julia, demonstrating the high-performance GPU programming capabilities of this language, but at the same time keeping the research accessible and easy to modify or repurpose by other Julia developers.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2020-09-28-gemmkernels/mixed_precision.png b/previews/PR44/post/2020-09-28-gemmkernels/mixed_precision.png deleted file mode 100644 index ad7a981..0000000 Binary files a/previews/PR44/post/2020-09-28-gemmkernels/mixed_precision.png and /dev/null differ diff --git a/previews/PR44/post/2020-10-02-cuda_2.0/index.html b/previews/PR44/post/2020-10-02-cuda_2.0/index.html deleted file mode 100644 index e10efa7..0000000 --- a/previews/PR44/post/2020-10-02-cuda_2.0/index.html +++ /dev/null @@ -1,346 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 2.0 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 2.0

- -
- -Tim Besard - - -

- - - -

Today we're releasing CUDA.jl 2.0, a breaking release with several new features. Highlights include initial support for Float16, a switch to CUDA's new stream model, a much-needed rework of the sparse array support and support for CUDA 11.1.

-

The release now requires Julia 1.5, and assumes a GPU with compute capability 5.0 or higher (although most of the package will still work with an older GPU).

-

Low- and mixed-precision operations

-

With NVIDIA's latest GPUs featuring more and more low-precision operations, CUDA.jl now starts to support these data types. For example, the CUBLAS wrappers can be used with (B)Float16 inputs (running under JULIA_DEBUG=CUBLAS to illustrate the called methods) thanks to the cublasGemmEx API call:

-
julia> mul!(CUDA.zeros(Float32,2,2),
-            cu(rand(Float16,2,2)),
-            cu(rand(Float16,2,2)))
-
-I! cuBLAS (v11.0) function cublasStatus_t cublasGemmEx(...) called:
-i!  Atype: type=cudaDataType_t; val=CUDA_R_16F(2)
-i!  Btype: type=cudaDataType_t; val=CUDA_R_16F(2)
-i!  Ctype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  computeType: type=cublasComputeType_t; val=CUBLAS_COMPUTE_32F(68)
-
-2×2 CuArray{Float32,2}:
- 0.481284  0.561241
- 1.12923   1.04541
-
julia> using BFloat16s
-
-julia> mul!(CUDA.zeros(BFloat16,2,2),
-            cu(BFloat16.(rand(2,2))),
-            cu(BFloat16.(rand(2,2))))
-
-I! cuBLAS (v11.0) function cublasStatus_t cublasGemmEx(...) called:
-i!  Atype: type=cudaDataType_t; val=CUDA_R_16BF(14)
-i!  Btype: type=cudaDataType_t; val=CUDA_R_16BF(14)
-i!  Ctype: type=cudaDataType_t; val=CUDA_R_16BF(14)
-i!  computeType: type=cublasComputeType_t; val=CUBLAS_COMPUTE_32F(68)
-
-2×2 CuArray{BFloat16,2}:
- 0.300781   0.71875
- 0.0163574  0.0241699
-

Alternatively, CUBLAS can be configured to automatically down-cast 32-bit inputs to Float16. This is now exposed through a task-local CUDA.jl math mode:

-
julia> CUDA.math_mode!(CUDA.FAST_MATH; precision=:Float16)
-
-julia> mul!(CuArray(zeros(Float32,2,2)),
-            CuArray(rand(Float32,2,2)),
-            CuArray(rand(Float32,2,2)))
-
-I! cuBLAS (v11.0) function cublasStatus_t cublasGemmEx(...) called:
-i!  Atype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  Btype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  Ctype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  computeType: type=cublasComputeType_t; val=CUBLAS_COMPUTE_32F_FAST_16F(74)
-
-2×2 CuArray{Float32,2}:
- 0.175258  0.226159
- 0.511893  0.331351
-

As part of these changes, CUDA.jl now defaults to using tensor cores. This may affect accuracy; use math mode PEDANTIC if you want the old behavior.

-

Work is under way to extend these capabilities to the rest of CUDA.jl, e.g., the CUDNN wrappers, or the native kernel programming capabilities.

-

New default stream semantics

-

In CUDA.jl 2.0 we're switching to CUDA's simplified stream programming model. This simplifies working with multiple streams, and opens up more possibilities for concurrent execution of GPU operations.

-

Multi-stream programming

-

In the old model, the default stream (used by all GPU operations unless specified otherwise) was a special stream whose commands could not be executed concurrently with commands on regular, explicitly-created streams. For example, if we interleave kernels executed on a dedicated stream with ones on the default one, execution was serialized:

-
using CUDA
-
-N = 1 << 20
-
-function kernel(x, n)
-    tid = threadIdx().x + (blockIdx().x-1) * blockDim().x
-    for i = tid:blockDim().x*gridDim().x:n
-        x[i] = CUDA.sqrt(CUDA.pow(3.14159f0, i))
-    end
-    return
-end
-
-num_streams = 8
-
-for i in 1:num_streams
-    stream = CuStream()
-
-    data = CuArray{Float32}(undef, N)
-
-    @cuda blocks=1 threads=64 stream=stream kernel(data, N)
-
-    @cuda kernel(data, 0)
-end
-
- Multi-stream programming (old) -
- -

In the new model, default streams are regular streams and commands issued on them can execute concurrently with those on other streams:

-
- Multi-stream programming (new) -
- -

Multi-threading

-

Another consequence of the new stream model is that each thread gets its own default stream (accessible as CuStreamPerThread()). Together with Julia's threading capabilities, this makes it trivial to group independent work in tasks, benefiting from concurrent execution on the GPU where possible:

-
using CUDA
-
-N = 1 << 20
-
-function kernel(x, n)
-    tid = threadIdx().x + (blockIdx().x-1) * blockDim().x
-    for i = tid:blockDim().x*gridDim().x:n
-        x[i] = CUDA.sqrt(CUDA.pow(3.14159f0, i))
-    end
-    return
-end
-
-Threads.@threads for i in 1:Threads.nthreads()
-    data = CuArray{Float32}(undef, N)
-    @cuda blocks=1 threads=64 kernel(data, N)
-    synchronize(CuDefaultStream())
-end
-
- Multi-threading (new) -
- -

With the old model, execution would have been serialized because the default stream was the same across threads:

-
- Multi-threading (old) -
- -

Future improvements will make this behavior configurable, such that users can use a different default stream per task.

-

Sparse array clean-up

-

As part of CUDA.jl 2.0, the sparse array support has been refactored, bringing them in line with other array types and their expected behavior. For example, the custom switch2 methods have been removed in favor of calls to convert and array constructors:

-
julia> using SparseArrays
-julia> using CUDA, CUDA.CUSPARSE
-
-julia> CuSparseMatrixCSC(CUDA.rand(2,2))
-2×2 CuSparseMatrixCSC{Float32} with 4 stored entries:
-  [1, 1]  =  0.124012
-  [2, 1]  =  0.791714
-  [1, 2]  =  0.487905
-  [2, 2]  =  0.752466
-
-julia> CuSparseMatrixCOO(sprand(2,2, 0.5))
-2×2 CuSparseMatrixCOO{Float64} with 3 stored entries:
-  [1, 1]  =  0.183183
-  [2, 1]  =  0.966466
-  [2, 2]  =  0.064101
-
-julia> CuSparseMatrixCSR(ans)
-2×2 CuSparseMatrixCSR{Float64} with 3 stored entries:
-  [1, 1]  =  0.183183
-  [2, 1]  =  0.966466
-  [2, 2]  =  0.064101
-

Initial support for the COO sparse matrix type has also been added, along with better support for sparse matrix-vector multiplication.

-

Support for CUDA 11.1

-

This release also features support for the brand-new CUDA 11.1. As there is no compatible release of CUDNN or CUTENSOR yet, CUDA.jl won't automatically select this version, but you can force it to by setting the JULIA_CUDA_VERSION environment variable to 11.1:

-
julia> ENV["JULIA_CUDA_VERSION"] = "11.1"
-
-julia> using CUDA
-
-julia> CUDA.versioninfo()
-CUDA toolkit 11.1.0, artifact installation
-
-Libraries:
-- CUDNN: missing
-- CUTENSOR: missing
-

Minor changes

-

Many other changes are part of this release:

-
    -
  • Views, reshapes and array reinterpretations are now represented by the Base array wrappers, simplifying the CuArray type definition.

    -
  • -
  • Various optimizations to CUFFT and CUDNN library wrappers.

    -
  • -
  • Support for LinearAlgebra.reflect! and rotate!

    -
  • -
  • Initial support for calling CUDA libraries with strided inputs

    -
  • -
- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2020-10-02-cuda_2.0/multistream_after.png b/previews/PR44/post/2020-10-02-cuda_2.0/multistream_after.png deleted file mode 100644 index 691f7c8..0000000 Binary files a/previews/PR44/post/2020-10-02-cuda_2.0/multistream_after.png and /dev/null differ diff --git a/previews/PR44/post/2020-10-02-cuda_2.0/multistream_before.png b/previews/PR44/post/2020-10-02-cuda_2.0/multistream_before.png deleted file mode 100644 index 0b1079c..0000000 Binary files a/previews/PR44/post/2020-10-02-cuda_2.0/multistream_before.png and /dev/null differ diff --git a/previews/PR44/post/2020-10-02-cuda_2.0/multithread_after.png b/previews/PR44/post/2020-10-02-cuda_2.0/multithread_after.png deleted file mode 100644 index 27a699c..0000000 Binary files a/previews/PR44/post/2020-10-02-cuda_2.0/multithread_after.png and /dev/null differ diff --git a/previews/PR44/post/2020-10-02-cuda_2.0/multithread_before.png b/previews/PR44/post/2020-10-02-cuda_2.0/multithread_before.png deleted file mode 100644 index 4816d06..0000000 Binary files a/previews/PR44/post/2020-10-02-cuda_2.0/multithread_before.png and /dev/null differ diff --git a/previews/PR44/post/2020-10-30-cuda_2.1/index.html b/previews/PR44/post/2020-10-30-cuda_2.1/index.html deleted file mode 100644 index 52e7f64..0000000 --- a/previews/PR44/post/2020-10-30-cuda_2.1/index.html +++ /dev/null @@ -1,216 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 2.1 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 2.1

- -
- -Tim Besard - - -

- - - -

CUDA.jl v2.1 is a bug-fix release, with one new feature: support for cubic texture interpolations. The release also partly reverts a change from v2.0: reshape, reinterpret and contiguous views now return a CuArray again.

-

Generalized texture interpolations

-

CUDA's texture hardware only supports nearest-neighbour and linear interpolation; for other modes one is required to perform the interpolation by hand. In CUDA.jl v2.1 we are generalizing the texture interpolation API so that it is possible to use both hardware-backed and software-implemented interpolation modes in exactly the same way:

-
# N is the dimensionality (1, 2 or 3)
-# T is the element type (needs to be supported by the texture hardware)
-
-# source array
-src = rand(T, fill(10, N)...)
-
-# indices we want to interpolate
-idx = [tuple(rand(1:0.1:10, N)...) for _ in 1:10]
-
-# upload to the GPU
-gpu_src = CuArray(src)
-gpu_idx = CuArray(idx)
-
-# create a texture array for optimized fetching
-# this is required for N=1, optional for N=2 and N=3
-gpu_src = CuTextureArray(gpu_src)
-
-# interpolate using a texture
-gpu_dst = CuArray{T}(undef, size(gpu_idx))
-gpu_tex = CuTexture(gpu_src; interpolation=CUDA.NearestNeighbour())
-broadcast!(gpu_dst, gpu_idx, Ref(gpu_tex)) do idx, tex
-    tex[idx...]
-end
-
-# back to the CPU
-dst = Array(gpu_dst)
-

Here, we can change the interpolation argument to CuTexture to either NearestNeighbour or LinearInterpolation, both supported by the hardware, or CubicInterpolation which is implemented in software (building on the hardware-supported linear interpolation).

-

Partial revert of array wrapper changes

-

In CUDA.jl v2.0, we changed the behavior of several important array operations to reuse available wrappers in Base: reshape started returning a ReshapedArray, view now returned a SubArray, and reinterpret was reworked to use ReinterpretArray. These changes were made to ensure maximal compatibility with Base's array type, and to simplify the implementation in CUDA.jl and GPUArrays.jl.

-

However, this change turned out to regress the time to precompile and load CUDA.jl. Consequently, the change has been reverted, and these wrappers are now implemented as part of the CuArray type again. Note however that we intend to revisit this change in the future. It is therefore recommended to use the DenseCuArray type alias for methods that need a CuArray backed by contiguous GPU memory. For strided CuArrays, i.e. non-contiguous views, you should use the StridedCuArray alias.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2020-11-05-oneapi_0.1/index.html b/previews/PR44/post/2020-11-05-oneapi_0.1/index.html deleted file mode 100644 index 1cdf4a8..0000000 --- a/previews/PR44/post/2020-11-05-oneapi_0.1/index.html +++ /dev/null @@ -1,291 +0,0 @@ - - - - - - - - - - - - - - - - - - - - Introducing: oneAPI.jl ⋅ JuliaGPU - - - - - - -
-
- - - - - -

Introducing: oneAPI.jl

- -
- -Tim Besard - - -

- - - -

We're proud to announce the first version of oneAPI.jl, a Julia package for programming accelerators with the oneAPI programming model. It is currently available for select Intel GPUs, including common integrated ones, and offers a similar experience to CUDA.jl.

-

The initial version of this package, v0.1, consists of three key components:

-
    -
  • wrappers for the oneAPI Level Zero interfaces;

    -
  • -
  • a compiler for Julia source code to SPIR-V IR;

    -
  • -
  • and an array interface for convenient data-parallel programming.

    -
  • -
-

In this post, I'll briefly describe each of these. But first, some essentials.

-

Installation

-

oneAPI.jl is currently only supported on 64-bit Linux, using a sufficiently recent kernel, and requires Julia 1.5. Furthermore, it currently only supports a limited set of Intel GPUs: Gen9 (Skylake, Kaby Lake, Coffee Lake), Gen11 (Ice Lake), and Gen12 (Tiger Lake).

-

If your Intel CPU has an integrated GPU supported by oneAPI, you can just go ahead and install the oneAPI.jl package:

-
pkg> add oneAPI
-

That's right, no additional drivers required! oneAPI.jl ships its own copy of the Intel Compute Runtime, which works out of the box on any (sufficiently recent) Linux kernel. The initial download, powered by Julia's artifact subsystem, might take a while to complete. After that, you can import the package and start using its functionality:

-
julia> using oneAPI
-
-julia> oneAPI.versioninfo()
-Binary dependencies:
-- NEO_jll: 20.42.18209+0
-- libigc_jll: 1.0.5186+0
-- gmmlib_jll: 20.3.2+0
-- SPIRV_LLVM_Translator_jll: 9.0.0+1
-- SPIRV_Tools_jll: 2020.2.0+1
-
-Toolchain:
-- Julia: 1.5.2
-- LLVM: 9.0.1
-
-1 driver:
-- 00007fee-06cb-0a10-1642-ca9f01000000 (v1.0.0, API v1.0.0)
-
-1 device:
-- Intel(R) Graphics Gen9
-

The oneArray type

-

Similar to CUDA.jl's CuArray type, oneAPI.jl provides an array abstraction that you can use to easily perform data parallel operations on your GPU:

-
julia> a = oneArray(zeros(2,3))
-2×3 oneArray{Float64,2}:
- 0.0  0.0  0.0
- 0.0  0.0  0.0
-
-julia> a .+ 1
-2×3 oneArray{Float64,2}:
- 1.0  1.0  1.0
- 1.0  1.0  1.0
-
-julia> sum(ans; dims=2)
-2×1 oneArray{Float64,2}:
- 3.0
- 3.0
-

This functionality builds on the GPUArrays.jl package, which means that a lot of operations are supported out of the box. Some are still missing, of course, and we haven't carefully optimized for performance either.

-

Kernel programming

-

The above array operations are made possible by a compiler that transforms Julia source code into SPIR-V IR for use with oneAPI. Most of this work is part of GPUCompiler.jl. In oneAPI.jl, we use this compiler to provide a kernel programming model:

-
julia> function vadd(a, b, c)
-           i = get_global_id()
-           @inbounds c[i] = a[i] + b[i]
-           return
-       end
-
-julia> a = oneArray(rand(10));
-
-julia> b = oneArray(rand(10));
-
-julia> c = similar(a);
-
-julia> @oneapi items=10 vadd(a, b, c)
-
-julia> @test Array(a) .+ Array(b) == Array(c)
-Test Passed
-

Again, the @oneapi macro resembles @cuda from CUDA.jl. One of the differences with the CUDA stack is that we use OpenCL-style built-ins, like get_global_id instead of threadIdx and barrier instead of sync_threads. Other familiar functionality, e.g. to reflect on the compiler, is available as well:

-
julia> @device_code_spirv @oneapi vadd(a, b, c)
-; CompilerJob of kernel vadd(oneDeviceArray{Float64,1,1},
-;                            oneDeviceArray{Float64,1,1},
-;                            oneDeviceArray{Float64,1,1})
-; for GPUCompiler.SPIRVCompilerTarget
-
-; SPIR-V
-; Version: 1.0
-; Generator: Khronos LLVM/SPIR-V Translator; 14
-; Bound: 46
-; Schema: 0
-               OpCapability Addresses
-               OpCapability Linkage
-               OpCapability Kernel
-               OpCapability Float64
-               OpCapability Int64
-               OpCapability Int8
-          %1 = OpExtInstImport "OpenCL.std"
-               OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel
-               ...
-               OpReturn
-               OpFunctionEnd
-

Level Zero wrappers

-

To interface with the oneAPI driver, we use the Level Zero API. Wrappers for this API are available under the oneL0 submodule of oneAPI.jl:

-
julia> using oneAPI.oneL0
-
-julia> drv = first(drivers())
-ZeDriver(00000000-0000-0000-1642-ca9f01000000, version 1.0.0)
-
-julia> dev = first(devices(drv))
-ZeDevice(GPU, vendor 0x8086, device 0x1912): Intel(R) Graphics Gen9
-

This is a low-level interface, and importing this submodule should not be required for the vast majority of users. It is only useful when you want to perform very specific operations, like submitting certain operations to the command queue, working with events, etc. In that case, you should refer to the upstream specification; the wrappers in the oneL0 module closely mimic the C APIs.

-

Status

-

Version 0.1 of oneAPI.jl forms a solid base for future oneAPI developments in Julia. Thanks to the continued effort of generalizing the Julia GPU support in packages like GPUArrays.jl and GPUCompiler.jl, this initial version is already much more usable than early versions of CUDA.jl or AMDGPU.jl ever were.

-

That said, there are crucial parts missing. For one, oneAPI.jl does not integrate with any of the vendor libraries like oneMKL or oneDNN. That means several important operations, e.g. matrix-matrix multiplication, will be slow. Hardware support is also limited, and the package currently only works on Linux.

-

If you want to contribute to oneAPI.jl, or run into problems, check out the GitHub repository at JuliaGPU/oneAPI.jl. For questions, please use the Julia Discourse forum under the GPU domain and/or in the #gpu channel of the Julia Slack.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2021-01-08-cuda_2.4_2.5/index.html b/previews/PR44/post/2021-01-08-cuda_2.4_2.5/index.html deleted file mode 100644 index f7824b3..0000000 --- a/previews/PR44/post/2021-01-08-cuda_2.4_2.5/index.html +++ /dev/null @@ -1,210 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 2.4 and 2.5 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 2.4 and 2.5

- -
- -Tim Besard - - -

- - - -

CUDA.jl v2.4 and v2.5 are two almost-identical feature releases, respectively for Julia 1.5 and 1.6. These releases feature greatly improved findmin and findmax kernels, an improved interface for kernel introspection, support for CUDA 11.2, and of course many bug fixes.

-

Improved findmin and findmax kernels

-

Thanks to @tkf and @Ellipse0934, CUDA.jl now uses a single-pass kernel for finding the minimum or maximum item in a CuArray. This fixes compatibility with NaN-valued elements, while on average improving performance. Depending on the rank, shape and size of the array these improvements vary from a minor regression to order-of-magnitude improvements.

-

New kernel introspection interface

-

It is now possible to obtain a compiled-but-not-launched kernel by passing the launch=false keyword to @cuda. This is useful when you want to reflect, e.g., query the amount of registers, or other kernel properties:

-
julia> kernel = @cuda launch=false identity(nothing)
-CUDA.HostKernel{identity,Tuple{Nothing}}(...)
-
-julia> CUDA.registers(kernel)
-4
-

The old API is still available, and will even be extended in future versions of CUDA.jl for the purpose of compiling device functions (not kernels):

-
julia> kernel = cufunction(identity, Tuple{Nothing})
-CUDA.HostKernel{identity,Tuple{Nothing}}(...)
-

Support for CUDA 11.2

-

CUDA.jl now supports the latest version of CUDA, version 11.2. Because CUDNN and CUTENSOR are not compatible with this release yet, CUDA.jl won't automatically switch to it unless you explicitly request so:

-
julia> ENV["JULIA_CUDA_VERSION"] = "11.2"
-"11.2"
-
-julia> using CUDA
-
-julia> CUDA.versioninfo()
-CUDA toolkit 11.2.0, artifact installation
-CUDA driver 11.2.0
-NVIDIA driver 460.27.4
-

Alternatively, if you disable use of artifacts through JULIA_CUDA_USE_BINARYBUILDER=false, CUDA 11.2 can be picked up from your local system.

-

Future developments

-

Due to upstream compiler changes, CUDA.jl 2.4 is expected to be the last release compatible with Julia 1.5. Patch releases are still possible, but are not automatic: If you need a specific bugfix from a future CUDA.jl release, create an issue or PR to backport the change.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2021-04-09-cuda_3.0/index.html b/previews/PR44/post/2021-04-09-cuda_3.0/index.html deleted file mode 100644 index bc62c6e..0000000 --- a/previews/PR44/post/2021-04-09-cuda_3.0/index.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 3.0 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 3.0

- -
- -Tim Besard - - -

- - - -

CUDA.jl 3.0 is a significant, semi-breaking release that features greatly improved multi-tasking and multi-threading, support for CUDA 11.2 and its new memory allocator, compiler tooling for GPU method overrides, device-side random number generation and a completely revamped cuDNN interface.

-

Improved multi-tasking and multi-threading

-

Before this release, CUDA operations were enqueued on a single global stream, and many of these operations (like copying memory, or synchronizing execution) were fully blocking. This posed difficulties when using multiple tasks to perform independent operations: Blocking operations prevent all tasks from making progress, and using the same stream introduces unintended dependencies on otherwise independent operations. CUDA.jl now uses private streams for each Julia task, and avoids blocking operations where possible, enabling task-based concurrent execution. It is also possible to use different devices on each task, and there is experimental support for executing those tasks from different threads.

-

A ~~picture~~ snippet of code is worth a thousand words, so let's demonstrate using a computation that uses both a library function (GEMM from CUBLAS) and a native Julia broadcast kernel:

-
using CUDA, LinearAlgebra
-
-function compute(a,b,c)
-    mul!(c, a, b)
-    broadcast!(sin, c, c)
-    synchronize()
-    c
-end
-

To execute multiple invocations of this function concurrently, we can simply use Julia's task-based programming interfaces and wrap each call to compute in an @async block. Then, we synchronize execution again by wrapping in a @sync block:

-
function iteration(a,b,c)
-    results = Vector{Any}(undef, 2)
-    NVTX.@range "computation" @sync begin
-        @async begin
-            results[1] = compute(a,b,c)
-        end
-        @async begin
-            results[2] = compute(a,b,c)
-        end
-    end
-    NVTX.@range "comparison" Array(results[1]) == Array(results[2])
-end
-

The calls to the @range macro from NVTX, a submodule of CUDA.jl, will visualize the different phases of execution when we profile our program. We now invoke our function using some random data:

-
function main(N=1024)
-    a = CUDA.rand(N,N)
-    b = CUDA.rand(N,N)
-    c = CUDA.rand(N,N)
-
-    # make sure this data can be used by other tasks!
-    synchronize()
-
-    # warm-up
-    iteration(a,b,c)
-    GC.gc(true)
-
-    NVTX.@range "main" iteration(a,b,c)
-end
-

The snippet above illustrates one breaking aspect of this release: Because each task uses its own stream, you now need to synchronize when re-using data in another task. Although it is unlikely that any user code was relying on the old behavior, it is technically a breaking change, and as such we are bumping the major version of the CUDA.jl package.

-

If we profile our program using NSight Systems, we can see how the execution of both calls to compute was overlapped:

-
- Overlapping execution on the GPU using task-based concurrency -
- -

The region highlighted in green was spent enqueueing operations from the CPU, which includes the call to synchronize(). This used to be a blocking operation, whereas now it only synchronizes the task-local stream while yielding to the Julia scheduler so that it can continue execution on another task. For synchronizing the entire device, use the new device_synchronize() function.

-

The remainder of computation was then spent executing kernels. Here, execution was overlapped, but that obviously depends on the exact characteristics of the computations and your GPU. Also note that copying to and from the CPU is always going to block for some time, unless the memory was page-locked. CUDA.jl now supports locking memory like that using the pin function; for more details refer to the CUDA.jl documentation on tasks and threads.

-

CUDA 11.2 and stream-ordered allocations

-

CUDA.jl now also fully supports CUDA 11.2, and it will default to using that version of the toolkit if your driver supports it. The release came with several new features, such as the new stream-ordered memory allocator. Without going into details, it is now possible to asynchronously allocate memory, obviating much of the need to cache those allocations in a memory pool. Initial benchmarks have shown nice speed-ups from using this allocator, while lowering memory pressure and thus reducing invocations of the Julia garbage collector.

-

When using CUDA 11.2, CUDA.jl will default to the CUDA-backed memory pool and disable its own caching layer. If you want to compare performance, you can still use the old allocator and caching memory pool by setting the JULIA_CUDA_MEMORY_POOL environment variable to, e.g. binned. On older versions of CUDA, the binned pool is still used by default.

-

GPU method overrides

-

With the new AbstractInterpreter functionality in Julia 1.6, it is now much easier to further customize the Base compiler. This has enabled us to develop a mechanism for overriding methods with GPU-specific counterparts. It used to be required to explicitly pick CUDA-specific versions, e.g. CUDA.sin, because the Base version performed some GPU-incompatible operation. This was problematic as it did not compose with generic code, and the CUDA-specific versions often lacked support for specific combinations of argument types (for example, CUDA.sin(::Complex) was not supported).

-

With CUDA 3.0, it is possible to define GPU-specific methods that override an existing definition, without requiring a new function type. For now, this functionality is private to CUDA.jl, but we expect to make it available to other packages starting with Julia 1.7.

-

This functionality has unblocked many issues, as can be seen in the corresponding pull request. It is now no longer needed to prefix a call with the CUDA module to ensure a GPU-compatible version is used. Furthermore, it also protects users from accidentally calling GPU intrinsics, as doing so will now result in an error instead of a crash:

-
julia> CUDA.saturate(1f0)
-ERROR: This function is not intended for use on the CPU
-Stacktrace:
- [1] error(s::String)
-   @ Base ./error.jl:33
- [2] saturate(x::Float32)
-   @ CUDA ~/Julia/pkg/CUDA/src/device/intrinsics.jl:23
- [3] top-level scope
-   @ REPL[10]:1
-

Device-side random number generation

-

As an illustration of the value of GPU method overrides, CUDA.jl now provides a device-side random number generator that is accessible by simply calling rand() from a kernel:

-
julia> function kernel()
-         @cushow rand()
-         return
-       end
-kernel (generic function with 1 method)
-
-julia> @cuda kernel()
-rand() = 0.668274
-

This works by overriding the Random.default_rng() method, and providing a GPU-compatible random number generator: Building on exploratory work by @S-D-R, the current generator is a maximally equidistributed combined Tausworthe RNG that shares 32-bytes of random state across threads in a warp for performance. The generator performs well, but does not pass the Crush battery of tests, so PRs are welcome here to improve the implementation!

-

Note that for host-side operations, e.g. rand!(::CuArray), the generator is not yet used by default. Instead, we use CURAND whenever possible, and fall back to the slower but more full-featured GPUArrays.jl-generator in other cases.

-

Revamped cuDNN interface

-

Finally, the cuDNN wrappers have been completely revamped by @denizyuret. The goal of the redesign is to more faithfully map the cuDNN API to more natural Julia functions, so that packages like Knet.jl or NNlib.jl can more easily use advanced cuDNN features without having to resort to low-level C calls. For more details, refer to the design document. As part of this redesign, the high-level wrappers of CUDNN have been moved to a subpackage of NNlib.jl.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2021-04-09-cuda_3.0/task_based_concurrency.png b/previews/PR44/post/2021-04-09-cuda_3.0/task_based_concurrency.png deleted file mode 100644 index 25e672c..0000000 Binary files a/previews/PR44/post/2021-04-09-cuda_3.0/task_based_concurrency.png and /dev/null differ diff --git a/previews/PR44/post/2021-06-10-cuda_3.3/index.html b/previews/PR44/post/2021-06-10-cuda_3.3/index.html deleted file mode 100644 index c7f0cde..0000000 --- a/previews/PR44/post/2021-06-10-cuda_3.3/index.html +++ /dev/null @@ -1,303 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 3.3 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 3.3

- -
- -Tim Besard - - -

- - - -

There have been several releases of CUDA.jl in the past couple of months, with many bugfixes and many exciting new features to improve GPU programming in Julia: CuArray now supports isbits Unions, CUDA.jl can emit debug info for use with NVIDIA tools, and changes to the compiler make it even easier to use the latest version of the CUDA toolkit.

-

CuArray support for isbits Unions

-

Unions are a way to represent values of one type or another, e.g., a value that can be an integer or a floating point. If all possible element types of a Union are so-called bitstypes, which can be stored contiguously in memory, the Union of these types can be stored contiguously too. This kind of optimization is implemented by the Array type, which can store such "isbits Unions" inline, as opposed to storing a pointer to a heap-allocated box. For more details, refer to the Julia documentation.

-

With CUDA.jl 3.3, the CuArray GPU array type now supports this optimization too. That means you can safely allocate CuArrays with isbits union element types and perform GPU-accelerated operations on them:

-
julia> a = CuArray([1, nothing, 3])
-3-element CuArray{Union{Nothing, Int64}, 1}:
- 1
-  nothing
- 3
-
-julia> findfirst(isnothing, a)
-2
-

It is also safe to pass these CuArrays to a kernel and use unions there:

-
julia> function kernel(a)
-         i = threadIdx().x
-         if a[i] !== nothing
-           a[i] += 1
-         end
-         return
-       end
-
-julia> @cuda threads=3 kernel(a)
-
-julia> a
-3-element CuArray{Union{Nothing, Int64}, 1}:
- 2
-  nothing
- 4
-

This feature is especially valuable to represent missing values, and is an important step towards GPU support for DataFrames.jl.

-

Debug and location information

-

Another noteworthy addition is the support for emitting debug and location information. The debug level, set by passing -g <level> to the julia executable, determines how much info is emitted. The default of level 1 only enables location information instructions which should not impact performance. Passing -g0 disables this, while passing -g2 also enables the output of DWARF debug information and compiles in debug mode.

-

Location information is useful for a variety of reasons. Many tools, like the NVIDIA profilers, use it to correlate instructions to source code:

-
- NVIDIA Visual Profiler with source-code location information -
- -

Debug information can be used to debug compiled code using cuda-gdb:

-
$ cuda-gdb --args julia -g2 examples/vadd.jl
-(cuda-gdb) set cuda break_on_launch all
-(cuda-gdb) run
-[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
-macro expansion () at .julia/packages/LLVM/hHQuD/src/interop/base.jl:74
-74                  Base.llvmcall(($ir,$fn), $rettyp, $argtyp, $(args.args...))
-
-(cuda-gdb) bt
-#0  macro expansion () at .julia/packages/LLVM/hHQuD/src/interop/base.jl:74
-#1  macro expansion () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:6
-#2  _index () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:6
-#3  blockIdx_x () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:56
-#4  blockIdx () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:76
-#5  julia_vadd<<<(1,1,1),(12,1,1)>>> (a=..., b=..., c=...) at .julia/dev/CUDA/examples/vadd.jl:6
-
-(cuda-gdb) f 5
-#5  julia_vadd<<<(1,1,1),(12,1,1)>>> (a=..., b=..., c=...) at .julia/dev/CUDA/examples/vadd.jl:6
-6           i = (blockIdx().x-1) * blockDim().x + threadIdx().x
-
-(cuda-gdb) l
-1       using Test
-2
-3       using CUDA
-4
-5       function vadd(a, b, c)
-6           i = (blockIdx().x-1) * blockDim().x + threadIdx().x
-7           c[i] = a[i] + b[i]
-8           return
-9       end
-10
-

Improved CUDA compatibility support

-

As always, new CUDA.jl releases come with updated support for the CUDA toolkit. CUDA.jl is now compatible with CUDA 11.3, as well as CUDA 11.3 Update 1. Users don't have to do anything to update to these versions, as CUDA.jl will automatically select and download the latest supported version.

-

Of course, for CUDA.jl to use the latest versions of the CUDA toolkit, a sufficiently recent version of the NVIDIA driver is required. Before CUDA 11.0, the driver's CUDA compatibility was a strict lower bound, and every minor CUDA release required a driver update. CUDA 11.0 comes with an enhanced compatibility option that follows semantic versioning, e.g., CUDA 11.3 can be used on an NVIDIA driver that only supports up to CUDA 11.0. CUDA.jl now follows semantic versioning when selecting a compatible toolkit, making it easier to use the latest version of the CUDA toolkit in Julia.

-

For those interested: Implementing semantic versioning required the CUDA.jl compiler to use ptxas instead of the driver's embedded JIT to generate GPU machine code. At the same time, many parts of CUDA.jl still use the CUDA driver APIs, so it's always recommended to keep your NVIDIA driver up-to-date.

-

High-level graph APIs

-

To overcome the cost of launching kernels, CUDA makes it possible to build computational graphs, and execute those graphs with less overhead than the underlying operations. In CUDA.jl we provide easy access to the APIs to record and execute these graphs:

-
A = CUDA.zeros(Int, 1)
-
-# ensure the operation is compiled
-A .+= 1
-
-# capture
-graph = capture() do
-    A .+= 1
-end
-@test Array(A) == [1]   # didn't change anything
-
-# instantiate and launch
-exec = instantiate(graph)
-CUDA.launch(exec)
-@test Array(A) == [2]
-
-# update and instantiate/launch again
-graph′ = capture() do
-    A .+= 2
-end
-update(exec, graph′)
-CUDA.launch(exec)
-@test Array(A) == [4]
-

This sequence of operations is common enough that we provide a high-level @captured macro that automatically records, updates, instantiates and launches the graph:

-
A = CUDA.zeros(Int, 1)
-
-for i in 1:2
-    @captured A .+= 1
-end
-@test Array(A) == [2]
-

Minor changes and features

- - -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2021-06-10-cuda_3.3/nvvp.png b/previews/PR44/post/2021-06-10-cuda_3.3/nvvp.png deleted file mode 100644 index 8e2b2a1..0000000 Binary files a/previews/PR44/post/2021-06-10-cuda_3.3/nvvp.png and /dev/null differ diff --git a/previews/PR44/post/2021-08-13-cuda_3.4/index.html b/previews/PR44/post/2021-08-13-cuda_3.4/index.html deleted file mode 100644 index 509b85e..0000000 --- a/previews/PR44/post/2021-08-13-cuda_3.4/index.html +++ /dev/null @@ -1,310 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 3.4 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 3.4

- -
- -Tim Besard - - -

- - - -

The latest version of CUDA.jl brings several new features, from improved atomic operations to initial support for arrays with unified memory. The native random number generator introduced in CUDA.jl 3.0 is now the default fallback, and support for memory pools other than the CUDA stream-ordered one has been removed.

-

Streamlined atomic operations

-

In preparation for integrating with the new standard @atomic macro introduced in Julia 1.7, we have streamlined the capabilities of atomic operations in CUDA.jl. The API is now split into two levels: low-level atomic_ methods for atomic functionality that's directly supported by the hardware, and a high-level @atomic macro that tries to perform operations natively or falls back to a loop with compare-and-swap. This fall-back implementation makes it possible to use more complex operations that do not map onto a single atomic operation:

-
julia> a = CuArray([1]);
-
-julia> function kernel(a)
-         CUDA.@atomic a[] <<= 1
-         return
-       end
-
-julia> @cuda threads=16 kernel(a)
-
-julia> a
-1-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
- 65536
-
-julia> 1<<16
-65536
-

The only requirement is that the types being used are supported by CUDA.atomic_cas!. This includes common types like 32 and 64-bit integers and floating-point numbers, as well as 16-bit numbers on devices with compute capability 7.0 or higher.

-

Note that on Julia 1.7 and higher, CUDA.jl does not export the @atomic macro anymore to avoid conflicts with the version in Base. That means it is recommended to always fully specify uses of the macro, i.e., use CUDA.@atomic as in the example above.

-

Arrays with unified memory

-

You may have noticed that the CuArray type in the example above included an additional parameter, Mem.DeviceBuffer. This has been introduced to support arrays backed by different kinds of buffers. By default, we will use an ordinary device buffer, but it's now possible to allocate arrays backed by unified buffers that can be used on multiple devices:

-
julia> a = cu([0]; unified=true)
-1-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 0
-
-julia> a .+= 1
-1-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 1
-
-julia> device!(1)
-
-julia> a .+= 1
-1-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 2
-

Although all operations should work equally well with arrays backed by unified memory, they have not been optimized yet. For example, copying memory to the device could be avoided as the driver can automatically page in unified memory on-demand.

-

New default random number generator

-

CUDA.jl 3.0 introduced a new random number generator, and starting with CUDA.jl 3.2 the performance and quality of this generator were improved to the point that it could be used by applications. A couple of features were still missing though, such as generating normally-distributed random numbers, or support for complex numbers. These features have been added in CUDA.jl 3.3, and the generator is now used as the default fallback when CURAND does not support the requested element types.

-

Both the performance and quality of this generator are much better than the previous, GPUArrays.jl-based one:

-
julia> using BenchmarkTools
-julia> cuda_rng = CUDA.RNG();
-julia> gpuarrays_rng = GPUArrays.default_rng(CuArray);
-julia> a = CUDA.zeros(1024,1024);
-
-julia> @benchmark CUDA.@sync rand!($cuda_rng, $a)
-BenchmarkTools.Trial: 10000 samples with 1 evaluation.
- Range (min … max):  17.040 μs …  2.430 ms  ┊ GC (min … max): 0.00% … 99.04%
- Time  (median):     18.500 μs              ┊ GC (median):    0.00%
- Time  (mean ± σ):   20.604 μs ± 34.734 μs  ┊ GC (mean ± σ):  1.17% ±  0.99%
-
-         ▃▆█▇▇▅▄▂▁
-  ▂▂▂▃▄▆███████████▇▆▆▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂ ▄
-  17 μs           Histogram: frequency by time        24.1 μs <
-
-julia> @benchmark CUDA.@sync rand!($gpuarrays_rng, $a)
-BenchmarkTools.Trial: 10000 samples with 1 evaluation.
- Range (min … max):  72.489 μs …  2.790 ms  ┊ GC (min … max): 0.00% … 98.44%
- Time  (median):     74.479 μs              ┊ GC (median):    0.00%
- Time  (mean ± σ):   81.211 μs ± 61.598 μs  ┊ GC (mean ± σ):  0.67% ±  1.40%
-
-  █                                                           ▁
-  █▆▃▁▃▃▅▆▅▁▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▄▆▁▁▁▁▁▁▁▁▄▄▃▄▃▁▁▁▁▁▁▁▁▁▃▃▄▆▄▁▄▃▆ █
-  72.5 μs      Histogram: log(frequency) by time       443 μs <
-
julia> using RNGTest
-julia> test_cuda_rng = RNGTest.wrap(cuda_rng, UInt32);
-julia> test_gpuarrays_rng = RNGTest.wrap(gpuarrays_rng, UInt32);
-
-julia> RNGTest.smallcrushTestU01(test_cuda_rng)
- All tests were passed
-
-julia> RNGTest.smallcrushTestU01(test_gpuarrays_rng)
- The following tests gave p-values outside [0.001, 0.9990]:
-
-       Test                          p-value
- ----------------------------------------------
-  1  BirthdaySpacings                 eps
-  2  Collision                        eps
-  3  Gap                              eps
-  4  SimpPoker                       1.0e-4
-  5  CouponCollector                  eps
-  6  MaxOft                           eps
-  7  WeightDistrib                    eps
- 10  RandomWalk1 M                   6.0e-4
- ----------------------------------------------
- (eps  means a value < 1.0e-300):
-

Removal of old memory pools

-

With the new stream-ordered allocator, caching memory allocations at the CUDA library level, much of the need for memory pools to cache memory allocations has disappeared. To simplify the allocation code, we have removed support for those Julia-managed memory pools (i.e., binned, split and simple). You can now only use the cuda memory pool, or use no pool at all by setting the JULIA_CUDA_MEMORY_POOL environment variable to none.

-

Not using a memory pool degrades performance, so if you are stuck on an NVIDIA driver that does not support CUDA 11.2, it is advised to remain on CUDA.jl 3.3 until you can upgrade.

-

Also note that the new stream-ordered allocator has turned out to be incompatible with legacy cuIpc APIs as used by OpenMPI. If that applies to you, consider disabling the memory pool or reverting to CUDA.jl 3.3 if your application's allocation pattern benefits from a memory pool.

-

Because of this, we will be maintaining CUDA.jl 3.3 longer than usual. All bug fixes in CUDA.jl 3.4 have already been backported to the previous release, which is currently at version 3.3.6.

-

Device capability-dependent kernel code

-

Some of the improvements in this release depend on the ability to write generic code that only uses certain hardware features when they are available. To facilitate writing such code, the compiler now embeds metadata in the generated code that can be used to branch on.

-

Currently, the device capability and PTX ISA version are embedded and made available using respectively the compute_capability and ptx_isa_version functions. A simplified version number type, constructable using the sv"..." string macro, can be used to test against these properties. For example:

-
julia> function kernel(a)
-           a[] = compute_capability() >= sv"6.0" ? 1 : 2
-           return
-       end
-kernel (generic function with 1 method)
-
-julia> CUDA.code_llvm(kernel, Tuple{CuDeviceVector{Float32, AS.Global}})
-define void @julia_kernel_1({ i8 addrspace(1)*, i64, [1 x i64] }* %0) {
-top:
-  %1 = bitcast { i8 addrspace(1)*, i64, [1 x i64] }* %0 to float addrspace(1)**
-  %2 = load float addrspace(1)*, float addrspace(1)** %1, align 8
-  store float 1.000000e+00, float addrspace(1)* %2, align 4
-  ret void
-}
-
-julia> capability(device!(1))
-v"3.5.0"
-
-julia> CUDA.code_llvm(kernel, Tuple{CuDeviceVector{Float32, AS.Global}})
-define void @julia_kernel_2({ i8 addrspace(1)*, i64, [1 x i64] }* %0) {
-top:
-  %1 = bitcast { i8 addrspace(1)*, i64, [1 x i64] }* %0 to float addrspace(1)**
-  %2 = load float addrspace(1)*, float addrspace(1)** %1, align 8
-  store float 2.000000e+00, float addrspace(1)* %2, align 4
-  ret void
-}
-

The branch on the compute capability is completely optimized away. At the same time, this does not require re-inferring the function as the optimization happens at the LLVM level.

-

Other changes

- - -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2022-01-28-cuda_3.5_3.8/index.html b/previews/PR44/post/2022-01-28-cuda_3.5_3.8/index.html deleted file mode 100644 index e3a1ad8..0000000 --- a/previews/PR44/post/2022-01-28-cuda_3.5_3.8/index.html +++ /dev/null @@ -1,339 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 3.5-3.8 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 3.5-3.8

- -
- -Tim Besard - - -

- - - -

CUDA.jl versions 3.5 to 3.8 have brought several new features to improve performance and productivity. This blog post will highlight a couple: direct copies between devices, better performance by preserving array index types and changing the memory pool, and a much-improved interface to the compute sanitizer utility.

-

Copies between devices

-

Typically, when sending data between devices you need to stage through the CPU. CUDA.jl now does this automatically, making it possible to directly copy between CuArrays on different devices:

-
julia> device!(0);
-
-julia> a = CUDA.rand(2,2)
-2×2 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
- 0.440147  0.986939
- 0.622901  0.698119
-
-julia> device!(1);
-
-julia> b = CUDA.zeros(2,2);
-
-julia> copyto!(b, a)
-2×2 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
- 0.440147  0.986939
- 0.622901  0.698119
-

When your hardware supports it, CUDA.jl will automatically enable so-called peer-to-peer mode, making it possible to copy data directly without going through the CPU. This can result in significant bandwidth and latency improvements. You can check if this mode of communication is possible:

-
julia> src = CuDevice(0)
-CuDevice(0): NVIDIA A100-PCIE-40GB
-
-julia> dst = CuDevice(1)
-CuDevice(1): Tesla V100-PCIE-32GB
-
-julia> can_access_peer(src, dst)
-false
-

In this case, peer-to-peer communication is not possible because the devices have a different compute capability major revision number. With a compatible device, the function reports true:

-
julia> src = CuDevice(1)
-CuDevice(1): Tesla V100-PCIE-32GB
-
-julia> dst = CuDevice(2)
-CuDevice(2): Tesla V100-PCIE-16GB
-
-julia> can_access_peer(src, dst)
-true
-

Thanks to @kshyatt for help with this change!

-

Helper function to use compute-sanitizer

-

The CUDA toolkit comes with a powerful tool to check GPU kernels for common issues like memory errors and race conditions: the compute sanitizer. To make it easier to use this tool, CUDA.jl now ships the binary as part of its artifacts, and provides a helper function to restart Julia under the compute-sanitizer. Let's demonstrate, and trigger a memory error to show what the compute sanitizer can detect:

-
julia> using CUDA
-
-julia> CUDA.run_compute_sanitizer()
-Re-starting your active Julia session...
-
-========= COMPUTE-SANITIZER
-julia> using CUDA
-
-julia> unsafe_wrap(CuArray, pointer(CuArray([1])), 2) .= 1
-========= Invalid __global__ write of size 8 bytes
-=========     at 0x2a0 in LLVM/src/interop/base.jl:45:julia_broadcast_kernel_1892(CuKernelContext, CuDeviceArray<Int64, (int)1, (int)1>, Broadcasted<void, Tuple<OneTo<Int64>>, _identity, Broadcasted<Int64>>, Int64)
-=========     by thread (1,0,0) in block (0,0,0)
-=========     Address 0xa64000008 is out of bounds
-=========     and is 1 bytes after the nearest allocation at 0xa64000000 of size 8 bytes
-

Other tools are available too, e.g. racecheck for detecting races or synccheck for finding synchronization issues. These tools can be selected using the tool keyword argument to run_compute_sanitizer.

-

Updated binary dependencies

-

As is common with every release, CUDA.jl now supports newer versions of NVIDIA's tools and libraries:

- -

The update to CUDA toolkit 11.6 comes with improved debug info compatibility. If you need to debug Julia GPU code with tools like compute-sanitizer or cuda-gdb, and you need debug info (the equivalent of nvcc -G), ensure CUDA.jl can use the latest version of the CUDA toolkit.

-

To make it easier to use the latest supported toolkit, CUDA.jl now implements CUDA's so-called Forward Compatibility mode: When your driver is outdated, CUDA.jl will attempt to load a newer version of the CUDA driver library, enabling use of a newer CUDA toolkit and libraries. Note that this is only supported on select hardware, refer to the NVIDIA documentation for more details.

-

Preserving array indices

-

Julia's integers are typically 64 bits wide, which can be wasteful when dealing with GPU indexing intrinsics that are typically only 32 bits wide. CUDA.jl's device array type now carefully preserves the type of indices so that 32-bit indices aren't unnecessarily promoted to 64 bits. With some careful kernel programming (note the use of 0x1 instead of 1 below), this makes it possible to significantly reduce the register pressure surrounding indexing operations, which may be useful in register-constrained situations:

-
julia> function memset(arr, val)
-           i = (blockIdx().x-0x1) * blockDim().x + threadIdx().x
-           @inbounds arr[i] = val
-           return
-       end
-
-julia> CUDA.code_ptx(memset, Tuple{CuDeviceArray{Float32,1,AS.Global},Float32})
-.func julia_memset(.param .b64 arr, .param .b32 val) {
-        .reg .f32       %f<2>;
-        .reg .b32       %r<5>;
-        .reg .b64       %rd<5>;
-
-        ld.param.u64    %rd1, [arr];
-        ld.param.f32    %f1, [val];
-        mov.u32         %r1, %ctaid.x;
-        mov.u32         %r2, %ntid.x;
-        mov.u32         %r3, %tid.x;
-        mad.lo.s32      %r4, %r2, %r1, %r3;
-        ld.u64          %rd2, [%rd1];
-        mul.wide.s32    %rd3, %r4, 4;
-        add.s64         %rd4, %rd2, %rd3;
-        st.global.f32   [%rd4], %f1;
-        ret;
-}
-

On CUDA.jl 3.4, this simple function used 3 more 64-bit registers:

-
.func julia_memset(.param .b64 arr, .param .b32 val) {
-        .reg .f32       %f<2>;
-        .reg .b32       %r<5>;
-        .reg .b64       %rd<8>;
-
-        ld.param.u64    %rd1, [arr];
-        ld.param.f32    %f1, [val];
-        mov.u32         %r1, %ctaid.x;
-        mov.u32         %r2, %ntid.x;
-        mul.wide.u32    %rd2, %r2, %r1;
-        mov.u32         %r3, %tid.x;
-        add.s32         %r4, %r3, 1;
-        cvt.u64.u32     %rd3, %r4;
-        ld.u64          %rd4, [%rd1];
-        add.s64         %rd5, %rd2, %rd3;
-        shl.b64         %rd6, %rd5, 2;
-        add.s64         %rd7, %rd4, %rd6;
-        st.global.f32   [%rd7+-4], %f1;
-        ret;
-}
-

More aggressive memory management

-

Starting with CUDA.jl 3.8, the memory pool used to allocate CuArrays will be configured differently: The pool will now be allowed to use all available GPU memory, whereas previously all cached memory was released at each synchronization point. This can significantly improve performance, and makes synchronization much cheaper.

-

This behavior can be observed by calling the memory_status() function:

-
julia> CUDA.memory_status()
-Effective GPU memory usage: 13.57% (2.001 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (0 bytes reserved)
-
-julia> a = CuArray{Float32}(undef, (1024, 1024, 1024));
-julia> Base.format_bytes(sizeof(a))
-"4.000 GiB"
-
-julia> a = nothing
-julia> GC.gc()
-
-julia> CUDA.memory_status()
-Effective GPU memory usage: 40.59% (5.988 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (4.000 GiB reserved)
-

So far nothing new. On previous versions of CUDA.jl however, any subsequent synchronization of the GPU (e.g., by copying memory to the CPU) would have resulted in a release of this reserved memory. This is not the case anymore:

-
julia> synchronize()
-
-julia> CUDA.memory_status()
-Effective GPU memory usage: 40.59% (5.988 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (4.000 GiB reserved)
-

If you still want to release this memory, you can call the reclaim() function:

-
julia> CUDA.reclaim()
-
-julia> CUDA.memory_status()
-Effective GPU memory usage: 13.48% (1.988 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (0 bytes reserved)
-

With interactive Julia sessions, this function is called periodically so that the GPU's memory isn't held on to unnecessarily. Otherwise it shouldn't be necessary to call this function, as memory is freed automatically when it is needed.

-

Minor changes and improvements

- - -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2022-04-06-oneapi_update/index.html b/previews/PR44/post/2022-04-06-oneapi_update/index.html deleted file mode 100644 index b1c4d7a..0000000 --- a/previews/PR44/post/2022-04-06-oneapi_update/index.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - - - - - - - - - - - - - oneAPI.jl status update ⋅ JuliaGPU - - - - - - -
-
- - - - - -

oneAPI.jl status update

- -
- -Tim Besard - - -

- - - -

It has been over a year since the last update on oneAPI.jl, the Julia package for programming Intel GPUs (and other accelerators) using the oneAPI toolkit. Since then, the package has been under steady development, and several new features have been added to improve the developer experience and usability of the package.

-

@atomic intrinsics

-

oneAPI.jl now supports atomic operations, which are required to implement a variety of parallel algorithms. Low-level atomic functions (atomic_add!, atomic_xchg!, etc) are available as unexported methods in the oneAPI module:

-
a = oneArray(Int32[0])
-
-function kernel(a)
-    oneAPI.atomic_add!(pointer(a), Int32(1))
-    return
-end
-
-@oneapi items=256 kernel(a)
-@test Array(a)[1] == 256
-

Note that these methods are only available for those types that are supported by the underlying OpenCL intrinsics. For example, the atomic_add! from above can only be used with Int32 and UInt32 inputs.

-

Most users will instead rely on the higher-level @atomic macro, which can be easily put in front of many array operations to make them behave atomically. To avoid clashing with the new @atomic macro in Julia 1.7, this macro is also unexported:

-
a = oneArray(Int32[0])
-
-function kernel(a)
-    oneAPI.@atomic a[1] += Int32(1)
-    return
-end
-
-@oneapi items=512 kernel(a)
-@test Array(a)[1] == 512
-

When used with operations that are supported by OpenCL, this macro will lower to calls like atomic_add!. For other operations, a compare-and-exchange loop will be used. Note that for now, this is still restricted to 32-bit operations, as we do not support the cl_khr_int64_base_atomics extension for 64-bit atomics.

-

Initial integration with vendor libraries

-

One significant missing feature is the integration with vendor libraries like oneMKL. These integrations are required to ensure good performance for important operations like matrix multiplication, which currently fall back to generic implementations in Julia that may not always perform as well.

-

To improve this situation, we are working on a wrapper library that allows us to integrate with oneMKL and other oneAPI and SYCL libraries. Currently, only matrix multiplication is supported, but once the infrastructural issues are worked out we expect to quickly support many more operations.

-

If you need support for specific libraries, please have a look at this PR. As the API surface is significant, we will need help to extend the wrapper library and integrate it with high-level Julia libraries like LinearAlgebra.jl.

-

Correctness issues

-

In porting existing Julia GPU applications to oneAPI.jl, we fixed several bugs that caused correctness issues when executing code on Intel GPUs:

-
    -
  • when the garbage collector frees GPU memory, it now blocks until all outstanding commands (which may include uses of said memory) have completed

    -
  • -
  • the barrier function to synchronize threads is now marked as convergent to avoid LLVM miscompilations

    -
  • -
-

Note that if you are using Tiger Lake hardware, there is currently a known issue in the back-end Intel compiler that affects oneAPI.jl, causing correctness issues that can be spotted by running the oneAPI.jl test suite.

-

Future work

-

To significantly improve usability of oneAPI.jl, we will add support to the KernelAbstractions.jl package. This library is used by many other packages for adding GPU acceleration to algorithms that cannot be easily expressed using only array operations. As such, support for oneAPI.jl will make it possible to use your oneAPI GPUs with all of these packages.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2022-06-24-metal/index.html b/previews/PR44/post/2022-06-24-metal/index.html deleted file mode 100644 index 57ca8ad..0000000 --- a/previews/PR44/post/2022-06-24-metal/index.html +++ /dev/null @@ -1,352 +0,0 @@ - - - - - - - - - - - - - - - - - - - - Technical preview: Programming Apple M1 GPUs in Julia with Metal.jl ⋅ JuliaGPU - - - - - - -
-
- - - - - -

Technical preview: Programming Apple M1 GPUs in Julia with Metal.jl

- -
- -Tim Besard - - -

- - - -

Julia has gained a new GPU back-end: Metal.jl, for working with Apple's M1 GPUs. The back-end is built on the same foundations that make up existing GPU packages like CUDA.jl and AMDGPU.jl, so it should be familiar to anybody who's already programmed GPUs in Julia. In the following post I'll demonstrate some of that functionality and explain how it works.

-

But first, note that Metal.jl is under heavy development: The package is considered experimental for now, as we're still working on squashing bugs and adding essential functionality. We also haven't optimized for performance yet. If you're interested in using Metal.jl, please consider contributing to its development! Most of the package is written in Julia, and checking-out the source code is a single Pkg.develop away :-)

-

Quick start

-

Start by getting a hold of the upcoming Julia 1.8, launch it, and enter the package manager by pressing ]:

-
julia> ]
-
-pkg> add Metal
-  Installed Metal
-

Installation is as easy as that, and we'll automatically download the necessary binary artifacts (a C wrapper for the Metal APIs, and an LLVM back-end). Then, leave the package manager by pressing backspace, import the Metal package, and e.g. call the versioninfo() method for some details on the toolchain:

-
julia> using Metal
-
-julia> Metal.versioninfo()
-macOS 13.0.0, Darwin 21.3.0
-
-Toolchain:
-- Julia: 1.8.0-rc1
-- LLVM: 13.0.1
-
-1 device:
-- Apple M1 Pro (64.000 KiB allocated)
-

And there we go! You'll note here that I'm using the upcoming macOS 13 (Ventura); this is currently the only supported operating system. We also only support M-series GPUs, even though Metal does support other GPUs. These choices were made to simplify development, and aren't technical limitations. In fact, Metal.jl does work on e.g. macOS Monterey with an Intel GPU, but it's an untested combination that may suffer from bugs.

-

Array programming

-

Just like our other GPU back-ends, Metal.jl offers an array abstraction that greatly simplifies GPU programming. The abstraction centers around the MtlArray type that can be used to manage memory and perform GPU computations:

-
# allocate + initialize
-julia> a = MtlArray(rand(Float32, 2, 2))
-2×2 MtlArray{Float32, 2}:
- 0.158752  0.836366
- 0.535798  0.153554
-
-# perform some GPU-accelerated operations
-julia> b = a * a
-2×2 MtlArray{Float32, 2}:
- 0.473325  0.261202
- 0.167333  0.471702
-
-# back to the CPU
-julia> Array(b)
-2×2 Matrix{Float32}:
- 0.473325  0.261202
- 0.167333  0.471702
-

Beyond these simple operations, Julia's higher-order array abstractions can be used to express more complex operations without ever having to write a kernel:

-
julia> mapreduce(sin, +, a; dims=1)
-1×2 MtlArray{Float32, 2}:
- 1.15276  0.584146
-
-julia> cos.(a .+ 2) .* 3
-2×2 MtlArray{Float32, 2}:
- -2.0472   -1.25332
- -2.96594  -2.60351
-

Much of this functionality comes from the GPUArrays.jl package, which provides vendor-neutral implementations of common array operations. As a result, MtlArray is already pretty capable, and should be usable with realistic array-based applications.

-

Kernel programming

-

Metal.jl's array operations are implemented in Julia, using our native kernel programming capabilities and accompanying JIT-compiler. A small demonstration:

-
# a simple kernel that sets elements of an array to a value
-function memset_kernel(array, value)
-  i = thread_position_in_grid_1d()
-  if i <= length(array)
-    @inbounds array[i] = value
-  end
-  return
-end
-
-a = MtlArray{Float32}(undef, 512)
-@metal threads=512 grid=2 memset_kernel(a, 42)
-
-# verify
-@assert all(isequal(42), Array(a))
-

As can be seen here, we've opted to deviate slightly from the Metal Shading Language, instead providing a programming experience that's similar to Julia's existing back-ends. Some key differences:

-
    -
  • we use intrinsic functions instead of special kernel function arguments to access properties like the thread position, grid size, ...;

    -
  • -
  • all types of arguments (buffers, indirect buffers, value-typed inputs) are transparently converted to a GPU-compatible structure[1];

    -
  • -
  • global (task-bound) state is used to keep track of the active device and a queue;

    -
  • -
  • compute pipeline set-up and command encoding is hidden behind a single macro.

    -
  • -
-

Behind the scenes, we compile Julia to LLVM IR and use a tiny LLVM back-end (based on @a2flo's libfloor) that (re)writes the bitcode to a Metal-compatible library containing LLVM 5 bitcode. You can inspect the generated IR using @device_code_metal:

-
julia> @device_code_metal @metal threads=512 grid=2 memset_kernel(a, 42)
-
[header]
-program_count: 1
-...
-
-[program]
-name: julia_memset_kernel
-type: kernel
-...
-
target datalayout = "..."
-target triple = "air64-apple-macosx13.0.0"
-
-; the (rewritten) kernel function:
-;  - %value argument passed by reference
-;  - %thread_position_in_grid argument added
-;  - sitofp rewritten to AIR-specific intrinsic
-define void @julia_memset_kernel(
-    { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %array,
-    i64 addrspace(1)* %value,
-    i32 %thread_position_in_grid) {
-  ...
-  %9 = tail call float @air.convert.f.f32.s.i64(i64 %7)
-  ...
-  ret void
-}
-
-; minimal required argument metadata
-!air.kernel = !{!10}
-!10 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*,
-              i64 addrspace(1)*, i32)* @julia_memset_kernel, !11, !12}
-!12 = !{!13, !14, !15}
-!13 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1,
-       !"air.read_write", !"air.address_space", i32 1,
-       !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8}
-!14 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1,
-       !"air.read_write", !"air.address_space", i32 1,
-       !"air.arg_type_size", i32 8, !"air.arg_type_align_size", i32 8}
-!15 = !{i32 0, !"air.thread_position_in_grid"}
-
-; other metadata not shown, for brevity
-

Shout-out to @max-Hawkins for exploring Metal code generation during his internship at Julia Computing!

-

Metal APIs in Julia

-

Lacking an Objective C or C++ FFI, we interface with the Metal libraries using a shim C library. Most users won't have to interface with Metal directly – the array abstraction is sufficient for many – but more experienced developers can make use of the high-level wrappers that we've designed for the Metal APIs:

-
julia> dev = MtlDevice(1)
-MtlDevice:
-  name:             Apple M1 Pro
-  lowpower:         false
-  headless:         false
-  removable:        false
-  unified memory:   true
-
-julia> desc = MtlHeapDescriptor()
-MtlHeapDescriptor:
-  type:             MtHeapTypeAutomatic
-  storageMode:      MtStorageModePrivate
-  size:             0
-
-julia> desc.size = 16384
-16384
-
-julia> heap = MtlHeap(dev, desc)
-MtlHeap:
-  type:                 MtHeapTypeAutomatic
-  size:                 16384
-  usedSize:             0
-  currentAllocatedSize: 16384
-
-# etc
-

These wrappers are based on @PhilipVinc's excellent work on MetalCore.jl, which formed the basis for (and has been folded into) Metal.jl.

-

What's next?

-

The current release of Metal.jl focusses on code generation capabilities, and is meant as a preview for users and developers to try out on their system or with their specific GPU application. It is not production-ready yet, and is lacking some crucial features:

-
    -
  • performance optimization

    -
  • -
  • integration with Metal Performance Shaders

    -
  • -
  • integration / documentation for use with Xcode tools

    -
  • -
  • fleshing out the array abstraction based on user feedback

    -
  • -
-

Please consider helping out with any of these! Since Metal.jl and its dependencies are almost entirely implemented in Julia, any experience with the language is sufficient to contribute. If you're not certain, or have any questions, please drop by the #gpu channel on the JuliaLang Slack, ask questions on our Discourse, or chat to us during the GPU office hours every other Monday.

-

If you encounter any bugs, feel free to let us know on the Metal.jl issue tracker. For information on upcoming releases, subscribe to this website's blog where we post about significant developments in Julia's GPU ecosystem.

-
-

- - - - -
[1]This relies on Metal 3 from macOS 13, which introduced bindless argument
- buffers, as we didn't fully figure out how to reliably encode arbitrarily-nested indirect buffers in argument encoder metadata.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2023-02-01-cuda_4.0/index.html b/previews/PR44/post/2023-02-01-cuda_4.0/index.html deleted file mode 100644 index 6a54893..0000000 --- a/previews/PR44/post/2023-02-01-cuda_4.0/index.html +++ /dev/null @@ -1,302 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 4.0 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 4.0

- -
- -Tim Besard - - -

- - - -

CUDA.jl 4.0 is a breaking release that introduces the use of JLLs to provide the CUDA toolkit. This makes it possible to compile other binary libraries against the CUDA runtime, and use them together with CUDA.jl. The release also brings CUSPARSE improvements, the ability to limit memory use, and many bug fixes and performance improvements.

-

JLLs for CUDA artifacts

-

While CUDA.jl has been using binary artifacts for a while, it was manually managing installation and selection of them, i.e., not by using standardised JLL packages. This complicated use of the artifacts by other packages, and made it difficult to build other binary packages against the CUDA runtime.

-

With CUDA.jl 4.0, we now use JLLs to load the CUDA driver and runtime. Specifically, there are two JLLs in play: CUDA_Driver_jll and CUDA_Runtime_jll. The former is responsible for loading the CUDA driver library (possibly upgrading it using a forward-compatible version), and determining the CUDA version that your set-up supports:

-
❯ JULIA_DEBUG=CUDA_Driver_jll julia
-julia> using CUDA_Driver_jll
-┌ System CUDA driver found at libcuda.so.1, detected as version 12.0.0
-└ @ CUDA_Driver_jll
-┌ System CUDA driver is recent enough; not using forward-compatible driver
-└ @ CUDA_Driver_jll
-

With the driver identified and loaded, CUDA_Runtime_jll can select a compatible toolkit. By default, it uses the latest supported toolkit that is compatible with the driver:

-
julia> using CUDA_Runtime_jll
-
-julia> CUDA_Runtime_jll.cuda_toolkits
-10-element Vector{VersionNumber}:
- v"10.2.0"
- v"11.0.0"
- v"11.1.0"
- v"11.2.0"
- v"11.3.0"
- v"11.4.0"
- v"11.5.0"
- v"11.6.0"
- v"11.7.0"
- v"11.8.0"
-
-julia> CUDA_Runtime_jll.host_platform
-Linux x86_64 {cuda=11.8}
-

As you can see, the selected CUDA runtime is encoded in the host platform. This makes it possible for Julia to automatically select compatible versions of other binary packages. For example, if we install and load SuiteSparse_GPU_jll, which right now provides builds for CUDA 10.2, 11.0 and 12.0, the artifact resolution code knows to load the build for CUDA 11.0 which is compatible with the selected CUDA 11.8 runtime:

-
julia> using SuiteSparse_GPU_jll
-
-julia> SuiteSparse_GPU_jll.best_wrapper
-"~/.julia/packages/SuiteSparse_GPU_jll/.../x86_64-linux-gnu-cuda+11.0.jl"
-

The change to JLLs requires a breaking change: the JULIA_CUDA_VERSION and JULIA_CUDA_USE_BINARYBUILDER environment variables have been removed, and are replaced by preferences that are set in the current environment. For convenience, you can set these preferences by calling CUDA.set_runtime_version!:

-
❯ julia --project
-julia> using CUDA
-julia> CUDA.runtime_version()
-v"11.8.0"
-
-julia> CUDA.set_runtime_version!(v"11.7")
-┌ Set CUDA Runtime version preference to 11.7,
-└ please re-start Julia for this to take effect.
-
-❯ julia --project
-julia> using CUDA
-julia> CUDA.runtime_version()
-v"11.7.0"
-
-julia> using CUDA_Runtime_jll
-julia> CUDA_Runtime_jll.host_platform
-Linux x86_64 {cuda=11.7}
-

The changed preference is reflected in the host platform, which means that you can use this mechanism to load different builds of other binary packages. For example, if you rely on a package or JLL that does not yet have a build for CUDA 12, you could set the preference to v"11.x" to load an available build.

-

For discovering a local runtime, you can set the version to "local", which will replace the use of CUDA_Runtime_jll by CUDA_Runtime_Discovery.jl, an API-compatible package that replaces the JLL with a local runtime discovery mechanism:

-
❯ julia --project
-julia> CUDA.set_runtime_version!("local")
-┌ Set CUDA Runtime version preference to local,
-└ please re-start Julia for this to take effect.
-
-❯ JULIA_DEBUG=CUDA_Runtime_Discovery julia --project
-julia> using CUDA
-┌ Looking for CUDA toolkit via environment variables CUDA_PATH
-└ @ CUDA_Runtime_Discovery
-┌ Looking for binary ptxas in /opt/cuda
-│   all_locations =
-│    2-element Vector{String}:
-│     "/opt/cuda"
-│     "/opt/cuda/bin"
-└ @ CUDA_Runtime_Discovery
-┌ Debug: Found ptxas at /opt/cuda/bin/ptxas
-└ @ CUDA_Runtime_Discovery
-...
-

Memory limits

-

By popular demand, support for memory limits has been reinstated. This functionality had been removed after the switch to CUDA memory pools, as the memory pool allocator does not yet support memory limits. Awaiting improvements by NVIDIA, we have added functionality to impose memory limits from the Julia side, in the form of two environment variables:

-
    -
  • JULIA_CUDA_SOFT_MEMORY_LIMIT: This is an advisory limit, used to configure the memory pool, which will result in the pool being shrunk down to the requested limit at every synchronization point. That means that the pool may temporarily grow beyond the limit. This limit is unavailable when disabling memory pools (with JULIA_CUDA_MEMORY_POOL=none).

    -
  • -
  • JULIA_CUDA_HARD_MEMORY_LIMIT: This is a hard limit, checked before every allocation. Doing so is relatively expensive, so it is recommended to use the soft limit instead.

    -
  • -
-

The value of these variables can be formatted as a number of bytes, optionally followed by a unit, or as a percentage of the total device memory. Examples: 100M, 50%, 1.5GiB, 10000.

-

CUSPARSE improvements

-

Thanks to the work of @amontoison, the CUSPARSE interface has undergone many improvements:

-
    -
  • Better support of the CuSparseMatrixCOO format with, in particular, the addition of CuSparseMatrixCOO * CuVector and CuSparseMatrixCOO * CuMatrix products;

    -
  • -
  • Routines specialized for -, +, * operations between sparse matrices (CuSparseMatrixCOO, CuSparseMatrixCSC and CuSparseMatrixCSR) have been interfaced;

    -
  • -
  • New generic routines for backward and forward sweeps with sparse triangular matrices are now used by \;

    -
  • -
  • CuMatrix * CuSparseVector and CuMatrix * CuSparseMatrix products have been added;

    -
  • -
  • Conversions between sparse and dense matrices have been updated for using more recent and optimized routines;

    -
  • -
  • High-level Julia functions for the new set of sparse BLAS 1 routines such as dot products between CuSparseVector;

    -
  • -
  • Add missing dispatches for mul! and ldiv! functions;

    -
  • -
  • Interfacing of almost all new CUSPARSE routines added by the CUDA toolkits v"11.x".

    -
  • -
-

Other changes

-
    -
  • Removal of the CUDNN, CUTENSOR, CUTENSORNET and CUSTATEVEC submodules: These have been moved into their own packages, respectively cuDNN.jl, cuTENSOR.jl, cuTensorNet.jl and cuStateVec.jl (note the change in capitalization, now following NVIDIA's naming scheme);

    -
  • -
  • Removal of the NVTX submodule: NVTX.jl should be used instead, which is a more complete implementation of the NVTX API;

    -
  • -
  • Support for CUDA 11.8 (support for CUDA 12.0 is being worked on);

    -
  • -
  • Support for Julia 1.9.

    -
  • -
-

Backport releases

-

Because CUDA.jl 4.0 is a breaking release, two additional releases have been made that backport bugfixes and select features:

-
    -
  • CUDA.jl 3.12.1 and 3.12.2: backports of bugfixes since 3.12

    -
  • -
  • CUDA.jl 3.13.0: additionally adding the memory limit functionality

    -
  • -
- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2023-02-08-oneapi_1.0/index.html b/previews/PR44/post/2023-02-08-oneapi_1.0/index.html deleted file mode 100644 index 4b0f280..0000000 --- a/previews/PR44/post/2023-02-08-oneapi_1.0/index.html +++ /dev/null @@ -1,255 +0,0 @@ - - - - - - - - - - - - - - - - - - - - oneAPI.jl 1.0: oneMKL, Intel Arc and Julia 1.9 ⋅ JuliaGPU - - - - - - -
-
- - - - - -

oneAPI.jl 1.0: oneMKL, Intel Arc and Julia 1.9

- -
- -Tim Besard - - -

- - - -

The release of oneAPI.jl 1.0 adds integration with the oneAPI Math Kernel Library (oneMKL) to accelerate linear algebra operations on Intel GPUs. It also brings support for Julia 1.9 and Intel Arc GPUs.

-

oneMKL integration

-

oneAPI.jl now uses the Intel oneAPI Math Kernel Library (oneMKL), automatically downloaded as part of oneAPI_Support_jll.jl, to accelerate a great number of BLAS and LAPACK operations on Intel GPUs. Similar to how it is implemented in our other GPU back-ends, these wrappers are available at different levels of abstraction.

-

At the lowest level, we use a C library that wraps the oneMKL C++ APIs. For example, the oneapi::mkl::blas::column_major::gemm function for matrix-matrix multiplication is wrapped by the C functions onemklSgemm, onemklDgemm, etc. These wrappers are used to implement low-level methods like oneMKL.gemm!:

-
julia> using oneAPI
-
-julia> A = oneArray(rand(Float32, 2, 3));
-2×3 oneMatrix{Float32, oneAPI.oneL0.DeviceBuffer}:
- 0.44302   0.125576  0.859145
- 0.674291  0.428346  0.0400119
-julia> B = oneArray(rand(Float32, 3, 4))
-3×4 oneMatrix{Float32, oneAPI.oneL0.DeviceBuffer}:
- 0.592748   0.529413   0.0323396  0.659528
- 0.22489    0.0872259  0.253291   0.376519
- 0.0121506  0.591135   0.706755   0.751686
-julia> C = similar(B, (2, 4));
-
-julia> oneMKL.gemm!('N', 'N', true, A, B, true, C)
-2×4 oneMatrix{Float32, oneAPI.oneL0.DeviceBuffer}:
- 0.301279  0.753365  0.65334   0.985274
- 0.496501  0.417994  0.158581  0.63607
-
-julia> Array(C) ≈ Array(A) * Array(B)
-true
-

Of course, these low-level functions aren't very user-friendly, so we also integrate with Julia's standard libraries where possible:

-
julia> A = oneArray(rand(Float32, 2, 3));
-julia> B = oneArray(rand(Float32, 3, 4));
-
-julia> using LinearAlgebra
-julia> C = A * B;
-
-julia> Array(C) ≈ Array(A) * Array(B)
-true
-

The most frequently used oneMKL BLAS functions have been wrapped and integrated with Julia’s standard linear algebra libraries. If you run into a missing function, please file a request to add it, or take a look at the source and contribute to oneAPI.jl! The current state of the wrappers should make it easy to extend their functionality, as well as form a good basis for integrating with other libraries like oneDNN.

-

Intel Arc support

-

The new Arc series of discrete Intel GPUs are now fully supported by oneAPI.jl. These GPUs offer a significant performance improvement over their integrated predecessors:

-
julia> using oneAPI
-julia> oneAPI.versioninfo()
-1 device:
-- Intel(R) Arc(TM) A770 Graphics [0x56a0]
-
-julia> T = Float32;
-julia> n = p = m = 2048;
-julia> a = oneArray(rand(T, n, p));
-julia> b = oneArray(rand(T, p, m));
-julia> c = oneArray(zeros(T, n, m));
-
-julia> using BenchmarkTools, LinearAlgebra
-julia> bench = @benchmark oneAPI.@sync mul!(c, a, b)
-BenchmarkTools.Trial: 1510 samples with 1 evaluation.
- Range (min … max):  3.233 ms …  3.791 ms  ┊ GC (min … max): 0.00% … 0.00%
- Time  (median):     3.298 ms              ┊ GC (median):    0.00%
- Time  (mean ± σ):   3.308 ms ± 48.426 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%
-
-        ▁▃▄▇█▅▄▃▂   ▁▁▁
-  ▁▁▃▃▅▇██████████████████▇▇▇▅▆▄▅▅▄▂▃▂▂▂▂▂▂▁▂▂▂▁▂▁▂▁▂▂▂▂▁▁▂▂ ▃
-  3.23 ms        Histogram: frequency by time        3.47 ms <
-
- Memory estimate: 272 bytes, allocs estimate: 11.
-
-julia> flops = n*m*(2p-1)
-17175674880
-
-julia> flops / (minimum(bench.times)/1e9)
-5.3131281169900205e12
-

For example, here we're getting over 5 TFlops of Float32 performance, which is over 10x faster than the Intel Xe Graphics G7 we had been previously using for oneAPI.jl development. At the same time, the A770 used above should be able to deliver close to 20 TFlops, so there's still room for improvement in our software stack.

-

To use oneAPI.jl with an Arc series GPU, you need to run Linux 6.2. At the time of writing, that kernel is still in beta, so refer to your distribution's documentation for how to install it. For example, on Arch Linux you can use the linux-mainline package from the AUR, Ubuntu has the kernel-ppa archive, Fedora provides the stable-rc repository, etc.

-

Other changes

-
    -
  • Support for Julia 1.9 has been added.

    -
  • -
- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2023-03-03-metal_0.2/index.html b/previews/PR44/post/2023-03-03-metal_0.2/index.html deleted file mode 100644 index 9317bcb..0000000 --- a/previews/PR44/post/2023-03-03-metal_0.2/index.html +++ /dev/null @@ -1,250 +0,0 @@ - - - - - - - - - - - - - - - - - - - - Metal.jl 0.2: Metal Performance Shaders ⋅ JuliaGPU - - - - - - -
-
- - - - - -

Metal.jl 0.2: Metal Performance Shaders

- -
- -Tim Besard - - -

- - - -

Metal.jl 0.2 marks a significant milestone in the development of the Metal.jl package. The release comes with initial support for the Metal Performance Shaders (MPS) framework for accelerating common operations like matrix multiplications, as well as various improvements for writing Metal kernels in Julia.

-

Metal Performance Shaders

-

Quoting the Apple documentation, The Metal Performance Shaders (MPS) framework contains a collection of highly optimized compute and graphics shaders for use in Metal applications. With Metal.jl 0.2, we have added initial support for this framework, and used it to accelerate the matrix multiplication operation:

-
julia> using Metal, LinearAlgebra, BenchmarkTools
-julia> n = p = m = 2048
-julia> flops = n*m*(2p-1)
-17175674880
-
-julia> a = MtlArray(rand(Float32, n, p));
-julia> b = MtlArray(rand(Float32, p, m));
-julia> c = MtlArray(zeros(Float32, n, m));
-
-julia> using LinearAlgebra
-julia> bench = @benchmark Metal.@sync mul!(c, a, b)
-BenchmarkTools.Trial: 518 samples with 1 evaluation.
- Range (min … max):  9.366 ms …  13.354 ms  ┊ GC (min … max): 0.00% … 0.00%
- Time  (median):     9.629 ms               ┊ GC (median):    0.00%
- Time  (mean ± σ):   9.646 ms ± 192.169 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%
-
-               ▃▂▅▅▆▆▆▇█▇▇▆▅▄▄▁▁ ▁
-  ▄▁▄▄▄▄▆▆▆▄▄▁▇█████████████████▄█▄▁▆▁▄▁▆▁▇▁▄▄▁▁▄▄▇▁▄▆▄▁▁▁▁▁▄ █
-  9.37 ms      Histogram: log(frequency) by time      10.1 ms <
-
- Memory estimate: 352 bytes, allocs estimate: 12.
-
-julia> flops / (minimum(bench.times)/1e9)
-1.83e12
-

The benchmark above shows that on an 8-core M1 Pro matrix multiplication now reaches 1.8 TFLOPS (out of the 2.6 TFLOPS of theoretical performance). The accelerated matrix multiplication is available for a variety of input types, including mixed-mode operations, and as shown above is integrated with the LinearAlgebra.jl mul! interface.

-

Of course, the MPS framework offers more than just matrix multiplication, and we expect to support more of it in the future. If you have a specific operation you would like to use from Julia, please let us know by opening an issue on the Metal.jl repository.

-

GPU profiling support

-

To support the development of Metal kernels, Max Hawkins has added support for GPU profiling. Similar to how this works in CUDA.jl, you can run code under the Metal.@profile macro to record its execution. However, this does first require setting the METAL_CAPTURE_ENABLED environment flag before importing Metal.jl:

-
julia> ENV["METAL_CAPTURE_ENABLED"] = 1
-
-julia> using Metal
-
-julia> a = mtl(rand(1024, 1024))
-julia> Metal.@profile sum(a)
-[ Info: GPU frame capture saved to jl_metal.gputrace/
-

The resulting capture can be opened with Xcode, presenting a timeline that's similar to other profilers:

-
- XCode viewing a Metal.jl capture trace -
- -

Other improvements

-
    -
  • Julia 1.9 is supported, but requires an up-to-date macOS version (issues have been encountered on macOS 12.4);

    -
  • -
  • An mtl function has been added for converting Julia arrays to Metal arrays, similar to the cu function in CUDA.jl;

    -
  • -
  • Multiple GPUs are supported, and the device! function can be used to select one;

    -
  • -
  • Coverage for SIMD Group functions has been improved, so it is now possible to use simdgroup_load, simdgroup_store, simdgroup_multiply, and simdgroup_multiply_accumulate in kernel functions.

    -
  • -
-

Future work

-

Although Metal.jl is now usable for a variety of applications, there is still work to be done before it can be considered production-ready. In particular:

-
    -
  • there are known performance issues with mapreduce, and other operations that rely on CartesianIndices;

    -
  • -
  • the libcmt wrapper library for interfacing with the Metal APIs is cumbersome to use and improve, and we are looking into native ObjectiveC FFI instead;

    -
  • -
  • the MPS wrappers are incomplete, and, similar to the Metal APIs, require a replacement for libcmt to be improved;

    -
  • -
  • support for atomic operations is missing, which is required to implement a full-featured KernelAbstractions.jl back-end.

    -
  • -
-

Once (most of) these issues are addressed, we should be able to release Metal.jl 1.0.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2023-03-03-metal_0.2/xcode.png b/previews/PR44/post/2023-03-03-metal_0.2/xcode.png deleted file mode 100644 index 3a59350..0000000 Binary files a/previews/PR44/post/2023-03-03-metal_0.2/xcode.png and /dev/null differ diff --git a/previews/PR44/post/2023-07-19-oneapi_profiling/index.html b/previews/PR44/post/2023-07-19-oneapi_profiling/index.html deleted file mode 100644 index 7d3054e..0000000 --- a/previews/PR44/post/2023-07-19-oneapi_profiling/index.html +++ /dev/null @@ -1,283 +0,0 @@ - - - - - - - - - - - - - - - - - - - - Profiling oneAPI.jl applications with VTune ⋅ JuliaGPU - - - - - - -
-
- - - - - -

Profiling oneAPI.jl applications with VTune

- -
- -Tim Besard - - -

- - - -

Profiling GPU applications is hard, so this post shows how to use Intel's VTune Profiler to profile GPU applications written in Julia with oneAPI.jl.

-

Because of the asynchronous nature of GPU execution, profiling GPU applications with Julia's tried and tested tools like @profile or even @time can be misleading: They will only show the time spent on the CPU, and will likely report that your application is spending most of its time waiting for the GPU.

-

To get a better understanding of what is happening on the GPU, we need specialized tools. In this post, we'll show how to use Intel's VTune Profiler to profile GPU applications written in Julia using oneAPI.jl.

-

Set-up

-

Start by downloading and installing the Intel VTune Profiler. This does not require administrative permissions, and will install in your home folder under the intel directory. On Linux, binaries will appear in ~/intel/oneapi/vtune/latest/bin64. There are three that are particularly important:

-
    -
  • vtune: a command-line tool to profile applications;

    -
  • -
  • vtune-gui: a graphical user interface to profile applications, or to visualize the results of a command-line profiling session;

    -
  • -
  • vtune-backend: a daemon that creates a web interface for VTune, which you can use to profile applications both locally and remotely.

    -
  • -
-

Hello VTune!

-

Let's start with a simple example: A Julia program that computes the sum of two arrays (i.e., the vadd example from the oneAPI repository):

-
using oneAPI
-
-function kernel(a, b, c)
-    i = get_global_id()
-    @inbounds c[i] = a[i] + b[i]
-    return
-end
-
-function vadd(a, b)
-    d_a = oneArray(a)
-    d_b = oneArray(b)
-    d_c = similar(d_a)
-
-    @oneapi items=size(d_c) kernel(d_a, d_b, d_c)
-    Array(d_c)
-end
-
-function main(N=256)
-    a = round.(rand(Float32, N) * 100)
-    b = round.(rand(Float32, N) * 100)
-    c = vadd(a, b)
-end
-main()
-

We've tweaked this example to make it more suited for profiling: We've enclosed the main application in a function so that it gets compiled, and we've increased the array sizes to make the GPU work harder.

-

There are several ways to profile this application. We'll start by demonstrating the command-line interface:

-
$ vtune -collect gpu-offload julia vadd.jl
-
-vtune: Collection started.
-vtune: Collection stopped.
-
-vtune: Using result path `/home/tim/Julia/pkg/oneAPI/r000gh'
-    GPU Time: 0.002s
-EU Array Stalled/Idle: 100.0% of Elapsed time with GPU busy
- | The percentage of time when the EUs were stalled or idle is high, which has a
- | negative impact on compute-bound applications.
-FPU Utilization: 0.0% of Elapsed time with GPU busy
-...
-

This will run the application, and collect a number of GPU-related metrics. A summary is shown in the terminal, and a more detailed report will be written to a directory in the current working directory. You can open that report with the graphical user interface, possibly even on a different machine:

-
$ vtune-gui r000gh
-

Instrumenting the application

-

The trace we just collected includes the time spent compiling our application, making it difficult to analyze what is happening. To refine the trace, we can instrument our application with Intel's Instrumentation and Tracing Technology (ITT) APIs:

-
    -
  • only start the profiler when we're running code of interest;

    -
  • -
  • add markers to the trace to indicate what is happening.

    -
  • -
-

We can interface with the ITT APIs using the IntelITT.jl package. Let's update our example:

-
using oneAPI, IntelITT
-
-# same as before
-
-function main(N=256)
-    a = round.(rand(Float32, N) * 100)
-    b = round.(rand(Float32, N) * 100)
-    c = IntelITT.@task "vadd" oneAPI.@sync vadd(a, b)
-end
-
-# warm-up
-main()
-
-# actual profile
-IntelITT.@collect main()
-

Here, the IntelITT.@collect macro will start and stop the collection, so we should launch VTune with the -start-paused option:

-
$ vtune -collect gpu-offload -start-paused julia vadd.jl
-

In the GUI, we can now clearly see a nicely packed stream of API calls, grouped under the vadd task we added. Note that because API calls are asynchronous, i.e. they return immediately before the GPU has executed them, I grouped them under a oneAPI.@sync call so that the task not only captures the time spent on the CPU, but also the time spent on the GPU. This may not be wanted for your application.

-

VTune timeline

-

Kernel details

-

The timeline view is great for getting an application-level overview of what is happening, but once you've isolated a kernel that doesn't perform as expected, you may want to switch from the GPU Offload to the GPU Compute Hotspots analysis. Here, you get a more detailed view of what's happening during execution on the GPU, including the memory bandwidth and execution properties:

-
$ vtune -collect gpu-hotspots -start-paused julia vadd.jl
-

VTune timeline

-

Many of these analyses can be configured to collect more or less data, at the cost of more or less overhead.

-

Working remotely

-

In many cases, your local system will not have a GPU, and you will want to profile an application running on a remote system. As shown above, you can use the vtune CLI to create a trace and open that locally using vtune-gui, however there is an easier way: The vtune-backend daemon.

-

Start by launching the VTune back-end on the remote system:

-
$ vtune-backend --enable-server-profiling --web-port 8443 --log-to-console
-

If your remote system is directly reachable, you want to add --allow-remote-access --base-url "https://remoteServer:8443". However, most people will need to set up an SSH tunnel:

-
$ ssh -L 8443:localhost:8443 remoteServer
-

You can now access the VTune GUI at https://localhost:8443/. Note that the first time you connect, you will need to do so using the one-time URL that is shown in the terminal where you launched the vtune-backend daemon.

-

The web interface that vtune-backend provides is identical to the GUI from vtune-gui: Start by creating a new project, and configuring an analysis: Select the local VTune profile server, enter the path to the Julia executable along with arguments and a working directory, and select the GPU Offload analysis type:

-

VTune WebUI

-

To start the analysis, click the big blue play button. If you use IntelITT.@collect to restrict the trace to the code of interest, use the second button with the pause symbol.

-

Give it a try!

-

Hopefully, this guide has shed some light on how to accurately profile oneAPI.jl applications using Intel's VTune Profiler. It turns out that one package could significantly benefit from some rigorous profiling: oneAPI.jl! Until now, development has focussed on correctness and usability, leaving considerable room for performance enhancements.

-

If you have access to an Intel GPU and want to gain experience profiling GPU applications with VTune, we encourage you to get involved! A good starting point would be analyzing some of oneAPI.jl's array operations like mapreduce or broadcast to identify potential bottlenecks. For more information or any queries, feel free to open an issue on GitHub, or join the discussion on Slack or Discourse. Your help could make a significant difference!

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_gpu_hotspots.png b/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_gpu_hotspots.png deleted file mode 100644 index 0e5e116..0000000 Binary files a/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_gpu_hotspots.png and /dev/null differ diff --git a/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_timeline.png b/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_timeline.png deleted file mode 100644 index 4321dce..0000000 Binary files a/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_timeline.png and /dev/null differ diff --git a/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_webui.png b/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_webui.png deleted file mode 100644 index b6793ea..0000000 Binary files a/previews/PR44/post/2023-07-19-oneapi_profiling/vtune_webui.png and /dev/null differ diff --git a/previews/PR44/post/2023-09-19-cuda_5.0/index.html b/previews/PR44/post/2023-09-19-cuda_5.0/index.html deleted file mode 100644 index 9f528ea..0000000 --- a/previews/PR44/post/2023-09-19-cuda_5.0/index.html +++ /dev/null @@ -1,270 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 5.0: Integrated profiler and task synchronization changes ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 5.0: Integrated profiler and task synchronization changes

- -
- -Tim Besard - - -

- - - -

CUDA.jl 5.0 is a major release that adds an integrated profiler to CUDA.jl, and reworks how tasks are synchronized. The release is slightly breaking, as it changes how local toolkits are handled and raises the minimum Julia and CUDA versions.

-

Integrated profiler

-

The most exciting new feature in CUDA.jl 5.0 is the new integrated profiler, which is similar to the @profile macro from the Julia standard library. The profiler can be used by simply prefixing any code that uses the CUDA libraries with CUDA.@profile:

-
julia> CUDA.@profile CUDA.rand(1).+1
-Profiler ran for 268.46 µs, capturing 21 events.
-
-Host-side activity: calling CUDA APIs took 230.79 µs (85.97% of the trace)
-┌──────────┬───────────┬───────┬───────────┬───────────┬───────────┬─────────────────────────┐
-│ Time (%) │      Time │ Calls │  Avg time │  Min time │  Max time │ Name                    │
-├──────────┼───────────┼───────┼───────────┼───────────┼───────────┼─────────────────────────┤
-│   76.47% │ 205.28 µs │     1 │ 205.28 µs │ 205.28 µs │ 205.28 µs │ cudaLaunchKernel        │
-│    5.42% │  14.54 µs │     2 │   7.27 µs │   5.01 µs │   9.54 µs │ cuMemAllocFromPoolAsync │
-│    2.93% │   7.87 µs │     1 │   7.87 µs │   7.87 µs │   7.87 µs │ cuLaunchKernel          │
-│    0.36% │ 953.67 ns │     2 │ 476.84 ns │    0.0 ns │ 953.67 ns │ cudaGetLastError        │
-└──────────┴───────────┴───────┴───────────┴───────────┴───────────┴─────────────────────────┘
-
-Device-side activity: GPU was busy for 2.15 µs (0.80% of the trace)
-┌──────────┬───────────┬───────┬───────────┬───────────┬───────────┬──────────────────────────────
-│ Time (%) │      Time │ Calls │  Avg time │  Min time │  Max time │ Name                        ⋯
-├──────────┼───────────┼───────┼───────────┼───────────┼───────────┼──────────────────────────────
-│    0.44% │   1.19 µs │     1 │   1.19 µs │   1.19 µs │   1.19 µs │ _Z13gen_sequencedI17curandS ⋯
-│    0.36% │ 953.67 ns │     1 │ 953.67 ns │ 953.67 ns │ 953.67 ns │ _Z16broadcast_kernel15CuKer ⋯
-└──────────┴───────────┴───────┴───────────┴───────────┴───────────┴──────────────────────────────
-                                                                                  1 column omitted
-1-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
- 1.7242923
-

The output shown above is a summary of what happened during the execution of the code. It is split into two sections: host-side activity, i.e., API calls to the CUDA libraries, and the resulting device-side activity. As part of each section, the output shows the time spent and the ratio to the total execution time. These ratios are important, and a good tool to quickly assess the performance of your code. For example, in the above output, we see that most of the time is spent on the host calling the CUDA libraries, and only very little time is actually spent computing things on the GPU. This indicates that the GPU is severely underutilized, which can be solved by increasing the problem size.

-

Instead of a summary, it is also possible to view a chronological trace by passing the trace=true keyword argument:

-
julia> CUDA.@profile trace=true CUDA.rand(1).+1;
-Profiler ran for 262.98 µs, capturing 21 events.
-
-Host-side activity: calling CUDA APIs took 227.21 µs (86.40% of the trace)
-┌────┬───────────┬───────────┬─────────────────────────┬────────────────────────┐
-│ ID │     Start │      Time │                    Name │ Details                │
-├────┼───────────┼───────────┼─────────────────────────┼────────────────────────┤
-│  5 │   6.44 µs │   9.06 µs │ cuMemAllocFromPoolAsync │ 4 bytes, device memory │
-│  7 │  19.31 µs │ 715.26 ns │        cudaGetLastError │ -                      │
-│  8 │  22.41 µs │ 204.09 µs │        cudaLaunchKernel │ -                      │
-│  9 │ 227.21 µs │    0.0 ns │        cudaGetLastError │ -                      │
-│ 14 │  232.7 µs │   3.58 µs │ cuMemAllocFromPoolAsync │ 4 bytes, device memory │
-│ 18 │ 250.34 µs │   7.39 µs │          cuLaunchKernel │ -                      │
-└────┴───────────┴───────────┴─────────────────────────┴────────────────────────┘
-
-Device-side activity: GPU was busy for 2.38 µs (0.91% of the trace)
-┌────┬───────────┬─────────┬─────────┬────────┬──────┬────────────────────────────────────────────
-│ ID │     Start │    Time │ Threads │ Blocks │ Regs │ Name                                      ⋯
-├────┼───────────┼─────────┼─────────┼────────┼──────┼────────────────────────────────────────────
-│  8 │ 225.31 µs │ 1.19 µs │      64 │     64 │   38 │ _Z13gen_sequencedI17curandStateXORWOWfiXa ⋯
-│ 18 │ 257.73 µs │ 1.19 µs │       1 │      1 │   18 │ _Z16broadcast_kernel15CuKernelContext13Cu ⋯
-└────┴───────────┴─────────┴─────────┴────────┴──────┴────────────────────────────────────────────
-                                                                                  1 column omitted
-

Here, we can see a list of events that the profiler captured. Each event has a unique ID, which can be used to correlate host-side and device-side events. For example, we can see that event 8 on the host is a call to cudaLaunchKernel, which corresponds to the execution of a CURAND kernel on the device.

-

The integrated profiler is a great tool to quickly assess the performance of your GPU application, identify bottlenecks, and find opportunities for optimization. For complex applications, however, it is still recommended to use NVIDIA's NSight Systems or Compute profilers, which provide a more detailed, graphical view of what is happening on the GPU.

-

Synchronization on worker threads

-

Another noteworthy change affects how tasks are synchronized. To enable concurrent execution, i.e., to make it possible for other Julia tasks to execute while waiting for the GPU to finish, CUDA.jl used to rely on so-called stream callbacks. These callbacks were a significant source of latency, at least 25us per invocation but sometimes much longer, and have also been slated for deprecation and eventual removal from the CUDA toolkit.

-

Instead, on Julia 1.9 and later, CUDA.jl now uses worker threads to wait for GPU operations to finish. This mechanism is significantly faster, taking around 5us per invocation, but more importantly offers a much more reliable and predictable latency. You can observe this mechanism using the integrated profiler:

-
julia> a = CUDA.rand(1024, 1024, 1024)
-julia> CUDA.@profile trace=true CUDA.@sync a .+ a
-Profiler ran for 12.29 ms, capturing 527 events.
-
-Host-side activity: calling CUDA APIs took 11.75 ms (95.64% of the trace)
-┌─────┬───────────┬───────────┬────────┬─────────────────────────┐
-│  ID │     Start │      Time │ Thread │                    Name │
-├─────┼───────────┼───────────┼────────┼─────────────────────────┤
-│   5 │   6.91 µs │  13.59 µs │      1 │ cuMemAllocFromPoolAsync │
-│   9 │  36.72 µs │ 199.56 µs │      1 │          cuLaunchKernel │
-│ 525 │ 510.69 µs │  11.75 ms │      2 │     cuStreamSynchronize │
-└─────┴───────────┴───────────┴────────┴─────────────────────────┘
-

For some users, this may still be too slow, so we have added two mechanisms that disable nonblocking synchronization and simply block the calling thread until the GPU operation finishes. The first is a global setting, which can be enabled by setting the nonblocking_synchronization preference to false, which can be done using Preferences.jl. The second is a fine-grained flag to pass to synchronization functions: synchronize(x; blocking=true), CUDA.@sync blocking=true -..., etc. Both these mechanisms should not be used widely, and are only intended for use in latency-critical code, e.g., when benchmarking or profiling.

-

Local toolkit discovery

-

One of the breaking changes involves how local toolkits are discovered, when opting out of the use of artifacts. Previously, this could be enabled by calling CUDA.set_runtime_version!("local"), which generated a version = "local" preference. We are now changing this into two separate preferences, version and local, where the version preference overrides the version of the CUDA toolkit, and the local preference independently indicates whether to use a local CUDA toolkit or not.

-

Concretely, this means that you will now need to call CUDA.set_runtime_version!(local_toolkit=true) to enable the use of a local toolkit. The toolkit version will be auto-detected, but can be overridden by also passing a version: CUDA.set_runtime_version!(version; local_toolkit=true). This may be necessary when CUDA is not available during precompilation, e.g., on the log-in node of a cluster, or when building a container image.

-

Raised minimum requirements

-

Finally, CUDA.jl 5.0 raises the minimum Julia and CUDA versions. The minimum Julia version is now 1.8, which should be enforced by the Julia package manager. The minimum CUDA toolkit version is now 11.4, but this cannot be enforced by the package manager. As a result, if you need to use an older version of the CUDA toolkit, you will need to pin CUDA.jl to v4.4 or below. The README will maintain a table of supported CUDA toolkit versions.

-

Most users will not be affected by this change: If you use the artifact-provided CUDA toolkit, you will automatically get the latest version supported by your CUDA driver.

-

Other changes

- - -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2023-11-07-cuda_5.1/index.html b/previews/PR44/post/2023-11-07-cuda_5.1/index.html deleted file mode 100644 index b27e727..0000000 --- a/previews/PR44/post/2023-11-07-cuda_5.1/index.html +++ /dev/null @@ -1,300 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 5.1: Unified memory and cooperative groups ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 5.1: Unified memory and cooperative groups

- -
- -Tim Besard - - -

- - - -

CUDA.jl 5.1 greatly improves the support of two important parts of the CUDA toolkit: unified memory, for accessing GPU memory on the CPU and vice-versa, and cooperative groups which offer a more modular approach to kernel programming.

-

Unified memory

-

Unified memory is a feature of CUDA that allows the programmer to access memory from both the CPU and GPU, relying on the driver to move data between the two. This can be useful for a variety of reasons: to avoid explicit memory copies, to use more memory than the GPU has available, or to be able to incrementally port code to the GPU and still have parts of the application run on the CPU.

-

CUDA.jl did already support unified memory, but only for the most basic use cases. With CUDA.jl 5.1, it is now easier to allocate unified memory, and more convenient to use that memory from the CPU:

-
julia> gpu = cu([1., 2.]; unified=true)
-2-element CuArray{Float32, 1, CUDA.Mem.UnifiedBuffer}:
- 1.0
- 2.0
-
-julia> # accessing GPU memory from the CPU
-       gpu[1] = 3;
-
-julia> gpu
-2-element CuArray{Float32, 1, CUDA.Mem.UnifiedBuffer}:
- 3.0
- 2.0
-

Accessing GPU memory like this used to throw an error, but with CUDA.jl 5.1 it is safe and efficient to perform scalar iteration on CuArrays backed by unified memory. This greatly simplifies porting applications to the GPU, as it no longer is a problem when code uses AbstractArray fallbacks from Base that process element by element.

-

In addition, CUDA.jl 5.1 also makes it easier to convert CuArrays to Array objects. This is important when wanting to use high-performance CPU libraries like BLAS or LAPACK which do not support CuArrays:

-
julia> cpu = unsafe_wrap(Array, gpu)
-2-element Vector{Float32}:
- 3.0
- 2.0
-
-julia> LinearAlgebra.BLAS.scal!(2f0, cpu);
-
-julia> gpu
-2-element CuArray{Float32, 1, CUDA.Mem.UnifiedBuffer}:
- 6.0
- 4.0
-

The reverse is also possible: CPU-based Arrays can now trivially be converted to CuArray objects for use on the GPU, without the need to explicitly allocate unified memory. This further simplifies memory management, as it makes it possible to use the GPU inside of an existing application without having to copy data into a CuArray:

-
julia> gpu = unsafe_wrap(CuArray, cpu)
-2-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 1
- 2
-
-julia> CUDA.@sync gpu .+= 1;
-
-julia> cpu
-2-element Vector{Int64}:
- 2
- 3
-

Note that the above methods are prefixed unsafe because of how they require careful management of object lifetimes: When creating an Array from a CuArray, the CuArray must be kept alive for as long as the Array is used, and vice-versa when creating a CuArray from an Array. Explicit synchronization (i.e. waiting for the GPU to finish computing) is also required, as CUDA.jl cannot synchronize automatically when accessing GPU memory through a CPU pointer.

-

For now, CUDA.jl still defaults to device memory for unspecified allocations. This can be changed using the default_memory preference of the CUDA.jl module, which can be set to either "device", "unified" or "host". When these changes have been sufficiently tested, and the remaining rough edges have been smoothed out, we may consider switching the default allocator.

-

Cooperative groups

-

Another major improvement in CUDA.jl 5.1 is the greatly expanded set of wrappers for the CUDA cooperative groups API. Cooperative groups are a low-level feature of CUDA that makes it possible to write kernels that are more flexible than the traditional approach of differentiating computations based on thread and block indices. Instead, cooperative groups allow the programmer to use objects representing groups of threads, pass those around, and differentiate computations based on queries on those objects.

-

For example, let's port the example from the introductory NVIDIA blog post, which provides a function to compute the sum of an array in parallel:

-
function reduce_sum(group, temp, val)
-    lane = CG.thread_rank(group)
-
-    # Each iteration halves the number of active threads
-    # Each thread adds its partial sum[i] to sum[lane+i]
-    i = CG.num_threads(group) ÷ 2
-    while i > 0
-        temp[lane] = val
-        CG.sync(group)
-        if lane <= i
-            val += temp[lane + i]
-        end
-        CG.sync(group)
-        i ÷= 2
-    end
-
-    return val  # note: only thread 1 will return full sum
-end
-

When the threads of a group call this function, they cooperatively compute the sum of the values passed by each thread in the group. For example, let's write a kernel that calls this function using a group representing the current thread block:

-
function sum_kernel_block(sum::AbstractArray{T},
-                          input::AbstractArray{T}) where T
-    # have each thread compute a partial sum
-    my_sum = thread_sum(input)
-
-    # perform a cooperative summation
-    temp = CuStaticSharedArray(T, 256)
-    g = CG.this_thread_block()
-    block_sum = reduce_sum(g, temp, my_sum)
-
-    # combine the block sums
-    if CG.thread_rank(g) == 1
-        CUDA.@atomic sum[] += block_sum
-    end
-
-    return
-end
-
-function thread_sum(input::AbstractArray{T}) where T
-    sum = zero(T)
-
-    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
-    stride = blockDim().x * gridDim().x
-    while i <= length(input)
-        sum += input[i]
-        i += stride
-    end
-
-    return sum
-end
-
-n = 1<<24
-threads = 256
-blocks = cld(n, threads)
-
-data = CUDA.rand(n)
-sum = CUDA.fill(zero(eltype(data)), 1)
-@cuda threads=threads blocks=blocks sum_kernel_block(sum, data)
-

This style of programming makes it possible to write kernels that are safer and more modular than traditional kernels. Some CUDA features also require the use of cooperative groups, for example, asynchronous memory copies between global and shared memory are done using the CG.memcpy_async function.

-

With CUDA.jl 5.1, it is now possible to use a large part of these APIs from Julia. Support has been added for implicit groups (with the exception of cluster groups and the deprecated multi-grid groups), all relevant queries on these groups, as well as the many important collective functions, such as shuffle, vote, and memcpy_async. Support for explicit groups is still missing, as are collectives like reduce and invoke. For more information, refer to the CUDA.jl documentation.

-

Other updates

-

Apart from these two major features, CUDA.jl 5.1 also includes a number of smaller fixes and improvements:

-
    -
  • Support for CUDA 12.3

    -
  • -
  • Performance improvements related to memory copies, which regressed in CUDA.jl 5.0

    -
  • -
  • Improvements to the native profiler (CUDA.@profile), now also showing local memory usage, supporting more NVTX metadata, and with better support for Pluto.jl and Jupyter

    -
  • -
  • Many CUSOLVER and CUSPARSE improvements by @amontoison

    -
  • -
- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2024-04-26-cuda_5.2_5.3/index.html b/previews/PR44/post/2024-04-26-cuda_5.2_5.3/index.html deleted file mode 100644 index 9bc8c46..0000000 --- a/previews/PR44/post/2024-04-26-cuda_5.2_5.3/index.html +++ /dev/null @@ -1,312 +0,0 @@ - - - - - - - - - - - - - - - - - - - - CUDA.jl 5.2 and 5.3: Maintenance releases ⋅ JuliaGPU - - - - - - -
-
- - - - - -

CUDA.jl 5.2 and 5.3: Maintenance releases

- -
- -Tim Besard - - -

- - - -

CUDA.jl 5.2 and 5.3 are two minor releases of CUDA.jl that mostly focus on bug fixes and minor improvements, but also come with a number of interesting new features. This blog post summarizes the changes in these releases.

-

Profiler improvements

-

CUDA.jl 5.1 introduced a new native profiler, which can be used to profile Julia GPU applications without having to use NSight Systems or other external tools. The tool has seen continued development, mostly improving its robustness, but CUDA.jl now also provides a @bprofile equivalent that runs your application multiple times and reports on the time distribution of individual events:

-
julia> CUDA.@bprofile CuArray([1]) .+ 1
-Profiler ran for 1.0 s, capturing 1427349 events.
-
-Host-side activity: calling CUDA APIs took 792.95 ms (79.29% of the trace)
-┌──────────┬────────────┬────────┬───────────────────────────────────────┬─────────────────────────┐
-│ Time (%) │ Total time │  Calls │ Time distribution                     │ Name                    │
-├──────────┼────────────┼────────┼───────────────────────────────────────┼─────────────────────────┤
-│   19.27% │  192.67 ms │ 109796 │   1.75 µs ± 10.19  (  0.95 ‥ 1279.83) │ cuMemAllocFromPoolAsync │
-│   17.08% │   170.8 ms │  54898 │   3.11 µs ± 0.27   (  2.15 ‥ 23.84)   │ cuLaunchKernel          │
-│   16.77% │  167.67 ms │  54898 │   3.05 µs ± 0.24   (  0.48 ‥ 16.69)   │ cuCtxSynchronize        │
-│   14.11% │  141.12 ms │  54898 │   2.57 µs ± 0.79   (  1.67 ‥ 70.57)   │ cuMemcpyHtoDAsync       │
-│    1.70% │   17.04 ms │  54898 │ 310.36 ns ± 132.89 (238.42 ‥ 5483.63) │ cuStreamSynchronize     │
-└──────────┴────────────┴────────┴───────────────────────────────────────┴─────────────────────────┘
-
-Device-side activity: GPU was busy for 87.38 ms (8.74% of the trace)
-┌──────────┬────────────┬───────┬───────────────────────────────────────┬────────────────────┐
-│ Time (%) │ Total time │ Calls │ Time distribution                     │ Name               │
-├──────────┼────────────┼───────┼───────────────────────────────────────┼────────────────────┤
-│    6.66% │   66.61 ms │ 54898 │   1.21 µs ± 0.16   (  0.95 ‥ 1.67)    │ kernel             │
-│    2.08% │   20.77 ms │ 54898 │ 378.42 ns ± 147.66 (238.42 ‥ 1192.09) │ [copy to device]   │
-└──────────┴────────────┴───────┴───────────────────────────────────────┴────────────────────┘
-
-NVTX ranges:
-┌──────────┬────────────┬───────┬────────────────────────────────────────┬─────────────────────┐
-│ Time (%) │ Total time │ Calls │ Time distribution                      │ Name                │
-├──────────┼────────────┼───────┼────────────────────────────────────────┼─────────────────────┤
-│   98.99% │  989.94 ms │ 54898 │  18.03 µs ± 49.88  ( 15.26 ‥ 10731.22) │ @bprofile.iteration │
-└──────────┴────────────┴───────┴────────────────────────────────────────┴─────────────────────┘
-

By default, CUDA.@bprofile runs the application for 1 second, but this can be adjusted using the time keyword argument.

-

Display of the time distribution isn't limited to CUDA.@bprofile, and will also be used by CUDA.@profile when any operation is called more than once. For example, with the broadcasting example from above we allocate both the input CuArray and the broadcast result, which results in two calls to the allocator:

-
julia> CUDA.@profile CuArray([1]) .+ 1
-
-Host-side activity:
-┌──────────┬────────────┬───────┬─────────────────────────────────────┬─────────────────────────┐
-│ Time (%) │ Total time │ Calls │ Time distribution                   │ Name                    │
-├──────────┼────────────┼───────┼─────────────────────────────────────┼─────────────────────────┤
-│   99.92% │   99.42 ms │     1 │                                     │ cuMemcpyHtoDAsync       │
-│    0.02% │   21.22 µs │     2 │  10.61 µs ± 6.57   (  5.96 ‥ 15.26) │ cuMemAllocFromPoolAsync │
-│    0.02% │   17.88 µs │     1 │                                     │ cuLaunchKernel          │
-│    0.00% │  953.67 ns │     1 │                                     │ cuStreamSynchronize     │
-└──────────┴────────────┴───────┴─────────────────────────────────────┴─────────────────────────┘
-

It is also not required anymore to specify external=true when using CUDA.@profile in combination with a tool like NSight Systems, as CUDA.jl will automatically detect the presence of an external profiler:

-
shell> nsys launch julia
-
-# warm-up
-julia> CuArray([1]).+1
-1-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
- 2
-
-julia> CUDA.@profile CuArray([1]).+1
-[ Info: This Julia session is already being profiled; defaulting to the external profiler.
-Capture range started in the application.
-Capture range ended in the application.
-Generating '/tmp/nsys-report-c42f.qdstrm'
-[1/1] [========================100%] report1.nsys-rep
-

In case that detection fails, the external keyword argument remains available (but do file an issue).

-

Kernel launch debugging

-

A common issue with CUDA programming is that kernel launches may fail when exhausting certain resources, such as shared memory or registers. This typically results in a cryptic error message, but CUDA.jl will now try to diagnose launch failures and provide a more helpful error message, as suggested by @simonbyrne:

-

For example, when using more parameter memory than allowed by the architecture:

-
julia> kernel(x) = nothing
-julia> @cuda kernel(ntuple(_->UInt64(1), 2^13))
-ERROR: Kernel invocation uses too much parameter memory.
-64.016 KiB exceeds the 31.996 KiB limit imposed by sm_89 / PTX v8.2.
-

Or when using an invalid launch configuration, violating a device limit:

-
julia> @cuda threads=2000 identity(nothing)
-ERROR: Number of threads in x-dimension exceeds device limit (2000 > 1024).
-caused by: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
-

We also diagnose launch failures that involve kernel-specific limits, such as exceeding the number of threads that are allowed in a block (e.g., because of register use):

-
julia> @cuda threads=1024 heavy_kernel()
-ERROR: Number of threads per block exceeds kernel limit (1024 > 512).
-caused by: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
-

Sorting improvements

-

Thanks to @xaellison, our bitonic sorting implementation now supports sorting specific dimensions, making it possible to implement sortperm for multi-dimensional arrays:

-
julia> A = cu([8 7; 5 6])
-2×2 CuArray{Int64, 2, Mem.DeviceBuffer}:
- 8  7
- 5  6
-
-julia> sortperm(A, dims = 1)
-2×2 CuArray{Int64, 2, Mem.DeviceBuffer}:
- 2  4
- 1  3
-
-julia> sortperm(A, dims = 2)
-2×2 CuArray{Int64, 2, Mem.DeviceBuffer}:
- 3  1
- 2  4
-

The bitonic kernel is now used for all sorting operations, in favor of the often slower quicksort implementation:

-
# before (quicksort)
-julia> @btime CUDA.@sync sort($(CUDA.rand(1024, 1024)); dims=1)
-  2.760 ms (30 allocations: 1.02 KiB)
-
-# after (bitonic sort)
-julia> @btime CUDA.@sync sort($(CUDA.rand(1024, 1024)); dims=1)
-  246.386 μs (567 allocations: 13.66 KiB)
-
-# reference CPU time
-julia> @btime sort($(rand(Float32, 1024, 1024)); dims=1)
-  4.795 ms (1030 allocations: 5.07 MiB)
-

Unified memory fixes

-

CUDA.jl 5.1 greatly improved support for unified memory, and this has continued in CUDA.jl 5.2 and 5.3. Most notably, when broadcasting CuArrays we now correctly preserve the memory type of the input arrays. This means that if you broadcast a CuArray that is allocated as unified memory, the result will also be allocated as unified memory. In case of a conflict, e.g. broadcasting a unified CuArray with one backed by device memory, we will prefer unified memory:

-
julia> cu([1]; host=true) .+ 1
-1-element CuArray{Int64, 1, Mem.HostBuffer}:
- 2
-
-julia> cu([1]; host=true) .+ cu([2]; device=true)
-1-element CuArray{Int64, 1, Mem.UnifiedBuffer}:
- 3
-

Software updates

-

Finally, we also did routine updates of the software stack, supporting the latest and greatest from NVIDIA. This includes support for CUDA 12.4 (Update 1), cuDNN 9, and cuTENSOR 2.0. This latest release of cuTENSOR is noteworthy as it revamps the API in a backwards-incompatible way, and CUDA.jl has opted to follow this change. For more details, refer to the cuTENSOR 2 migration guide by NVIDIA.

-

Of course, cuTENSOR.jl also provides a high-level Julia API which has been mostly unaffected by these changes:

-
using CUDA
-A = CUDA.rand(7, 8, 3, 2)
-B = CUDA.rand(3, 2, 2, 8)
-C = CUDA.rand(3, 3, 7, 2)
-
-using cuTENSOR
-tA = CuTensor(A, ['a', 'f', 'b', 'e'])
-tB = CuTensor(B, ['c', 'e', 'd', 'f'])
-tC = CuTensor(C, ['b', 'c', 'a', 'd'])
-
-using LinearAlgebra
-mul!(tC, tA, tB)
-

This API is still quite underdeveloped, so if you are a user of cuTENSOR.jl and have to adapt to the new API, now is a good time to consider improving the high-level interface instead!

-

Future releases

-

The next release of CUDA.jl is gearing up to be a much larger release, with significant changes to both the API and internals of the package. Although the intent is to keep these changes non-breaking, it is always possible that some code will be affected in unexpected ways, so we encourage users to test the upcoming release by simply running ] add CUDA#master and report any issues.

- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/2024-05-24-oneapi_1.5/index.html b/previews/PR44/post/2024-05-24-oneapi_1.5/index.html deleted file mode 100644 index 741405f..0000000 --- a/previews/PR44/post/2024-05-24-oneapi_1.5/index.html +++ /dev/null @@ -1,242 +0,0 @@ - - - - - - - - - - - - - - - - - - - - oneAPI.jl 1.5: Ponte Vecchio support and oneMKL improvements ⋅ JuliaGPU - - - - - - -
-
- - - - - -

oneAPI.jl 1.5: Ponte Vecchio support and oneMKL improvements

- -
- -Tim Besard - - -

- - - -

oneAPI.jl v1.5 is a significant release that brings many new features, from extended hardware support to greatly improved wrappers of the oneMKL math library.

-

Intel Ponte Vecchio

-

In oneAPI.jl v1.5 we introduce support for the Intel Ponte Vecchio (PVC) architecture, which powers the Xe HPC GPUs as found in the Aurora supercomputer:

-
julia> oneAPI.versioninfo()
-Binary dependencies:
-- NEO: 24.13.29138+0
-- libigc: 1.0.16510+0
-- gmmlib: 22.3.18+0
-- SPIRV_LLVM_Translator_unified: 0.4.0+0
-- SPIRV_Tools: 2023.2.0+0
-
-Toolchain:
-- Julia: 1.10.3
-- LLVM: 15.0.7
-
-1 driver:
-- 00000000-0000-0000-17d2-6b1e010371d2 (v1.3.29138, API v1.3.0)
-
-16 devices:
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-

Apart from a handful of MKL-related issues, oneAPI.jl is fully functional on PVC, and passes all tests.

-

oneMKL wrappers

-

Thanks to the work of @amontoison, oneAPI.jl now provides greatly improved wrappers of the oneMKL library. This includes support for:

-
    -
  • LAPACK: geqrf(_batched), orgqr(_batched), ormqr, potrf(_batched), potrs(_batched), getrf(_batched), getri(_batched), gebrd, gesvd, syevd, heevd, sygvd, hegvd

    -
  • -
  • Sparse arrays: sparse_gemm, sparse_gemv, sparse_symv, sparse_trmv, sparse_trsv, sparse_optimize_gemv, sparse_optimize_trsv

    -
  • -
-

Where possible, these functions are integrated with standard library interfaces, e.g., making it possible to simply call eigen, or to multiply two oneSparseMatrixCSRs.

-

Minor changes

-

There have of course been many other changes and improvements in oneAPI.jl v1.5. For a full list, please refer to the release notes, but some highlights include:

-
    -
  • a new launch configuration heuristic that should generally improve performance;

    -
  • -
  • broadcast now preserves the buffer type (host, device, or shared);

    -
  • -
  • support for very large arrays that exceed the default device memory limit;

    -
  • -
  • several toolchain bumps, with v1.5 using oneAPI 2024.1.0 with driver 24.13.29138.7;

    -
  • -
  • minimal support for native Windows (next to WSL, which is fully supported).

    -
  • -
- -
-
- - - - - - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/index.html b/previews/PR44/post/index.html deleted file mode 100644 index 6fbc0fa..0000000 --- a/previews/PR44/post/index.html +++ /dev/null @@ -1,437 +0,0 @@ - - - - - - - - - - - - - - - - - - - Blog ⋅ JuliaGPU - - - - - - -
-
- - - - - -

- Blog - -

- -

- - CUDA.jl 5.4: Memory management mayhem -  ↗ -
- -
-

-

- - oneAPI.jl 1.5: Ponte Vecchio support and oneMKL improvements -  ↗ -
- -
-

-

- - CUDA.jl 5.2 and 5.3: Maintenance releases -  ↗ -
- -
-

-

- - CUDA.jl 5.1: Unified memory and cooperative groups -  ↗ -
- -
-

-

- - CUDA.jl 5.0: Integrated profiler and task synchronization changes -  ↗ -
- -
-

-

- - Profiling oneAPI.jl applications with VTune - -
- -
-

-

- - Metal.jl 0.2: Metal Performance Shaders - -
- -
-

-

- - oneAPI.jl 1.0: oneMKL, Intel Arc and Julia 1.9 - -
- -
-

-

- - CUDA.jl 4.0 - -
- -
-

-

- - Technical preview: Programming Apple M1 GPUs in Julia with Metal.jl - -
- -
-

-

- - oneAPI.jl status update - -
- -
-

-

- - CUDA.jl 3.5-3.8 - -
- -
-

-

- - CUDA.jl 3.4 - -
- -
-

-

- - CUDA.jl 3.3 - -
- -
-

-

- - CUDA.jl 3.0 - -
- -
-

-

- - CUDA.jl 2.4 and 2.5 - -
- -
-

-

- - Introducing: oneAPI.jl - -
- -
-

-

- - CUDA.jl 2.1 - -
- -
-

-

- - CUDA.jl 2.0 - -
- -
-

-

- - Paper: Flexible Performant GEMM Kernels on GPUs - -
- -
-

-

- - CUDA.jl 1.3 - Multi-device programming - -
- -
-

-

- - CUDA.jl 1.1 - -
- -
-

-

- - CUDAnative.jl 3.0 and CuArrays.jl 2.0 - -
- -
-

-

- - Julia's Dramatic Rise in HPC and Elsewhere -  ↗ -
- -
-

-

- - Accelerating Tensor Computations in Julia with the GPU -  ↗ -
- -
-

-

- - New website for JuliaGPU - -
- -
-

-

- - Julia Computing Brings Support for NVIDIA GPU Computing on Arm Powered Servers -  ↗ -
- -
-

-

- - DifferentialEquations.jl v6.9.0 released with automatic Multi-GPU support -  ↗ -
- -
-

-

- - An Introduction to GPU Programming in Julia -  ↗ -
- -
-

-

- - Next Generation Climate Models leverage Julia and GPUs -  ↗ -
- -
-

-

- - New Climate Model to be Built from the Ground Up -  ↗ -
- -
-

-

- - Solving Systems of Stochastic PDEs and using GPUs in Julia -  ↗ -
- -
-

-

- - High-Performance GPU Computing in the Julia Programming Language -  ↗ -
- -
-

- - -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/post/index.xml b/previews/PR44/post/index.xml deleted file mode 100644 index 4ebecc7..0000000 --- a/previews/PR44/post/index.xml +++ /dev/null @@ -1,2199 +0,0 @@ - - - - - - <![CDATA[JuliaGPU]]> - https://juliagpu.org - - Franklin.jl -- https://github.com/tlienart/Franklin.jl - - - - - <![CDATA[CUDA.jl 5.4: Memory management mayhem]]> - https://juliagpu.org/post/2024-05-28-cuda_5.4/index.html - https://juliagpu.org/2024-05-28-cuda_5.4/ - - - CUDA.jl 5.4 comes with many memory-management related changes that should improve performance of memory-heavy applications, and make it easier to work with heterogeneous set-ups involving multiple GPUs or using both the CPU and GPU.

-

Before anything else, let's get the breaking changes out of the way. CUDA.jl v5.4 only bumps the minor version, so it should be compatible with existing codebases. However, there are a couple of API changes that, although covered by appropriate deprecation warnings, applications should be updated to account for:

-
    -
  • The CUDA.Mem submodule has been removed. All identifiers have been moved to the parent CUDA module, with a couple being renamed in the process:

    -
      -
    • Mem.Device and Mem.DeviceBuffer have been renamed to CUDA.DeviceMemory (the same applies to Mem.Host and Mem.Unified);

      -
    • -
    • enums from the Mem submodule have gained a MEM prefix, e.g., Mem.ATTACH_GLOBAL has been renamed to CUDA.MEM_ATTACH_GLOBAL;

      -
    • -
    • Mem.set! has been renamed to CUDA.memset;

      -
    • -
    • Mem.info() has been renamed to CUDA.memory_info();

      -
    • -
    -
  • -
  • CUDA.memory_status() has been renamed to CUDA.pool_status();

    -
  • -
  • CUDA.available_memory() has been renamed to CUDA.free_memory().

    -
  • -
-

The meat of this release is in the memory management improvements detailed below. These changes can have a significant impact on the performance of your application, so it's recommended to thoroughly test your application after upgrading!

-

Eager garbage collection

-

Julia is a garbage collected language, which means that (GPU) allocations can fail because garbage has piled up, necessitating a collection cycle. Previous versions of CUDA.jl handled this at the allocation site, detecting out-of-memory errors and triggering the GC. This was not ideal, as it could lead to significant pauses and a bloated memory usage.

-

To improve this, CUDA.jl v5.4 more accurately keeps track of memory usage, and uses that information to trigger the GC early at appropriate times, e.g., when waiting for a kernel to finish. This should lead to more predictable performance, both by distributing the cost of garbage collection over time and by potentially masking it behind other operations.

-

For example, the following toy model implemented with Flux.jl allocates a ton of memory:

-
using CUDA, Flux
-using MLUtils: DataLoader
-
-n_obs = 300_000
-n_feature = 1000
-X = rand(n_feature, n_obs)
-y = rand(1, n_obs)
-train_data = DataLoader((X, y) |> gpu; batchsize = 2048, shuffle=false)
-
-model = Dense(n_feature, 1) |> gpu
-loss(m, _x, _y) = Flux.Losses.mse(m(_x), _y)
-opt_state = Flux.setup(Flux.Adam(), model)
-Flux.train!(loss, model, train_data, opt_state)
-for epoch in 1:100
-  Flux.train!(loss, model, train_data, opt_state)
-end
-

Without eager garbage collection, this leads to expensive pauses while freeing a large amount of memory at every epoch. We can simulate this by artificially limiting the memory available to the GPU, while also disabling the new eager garbage collection feature by setting the JULIA_CUDA_GC_EARLY environment variable to false (this is a temporary knob that will be removed in the future, but may be useful now for evaluating the new feature):

-
❯ JULIA_CUDA_GC_EARLY=false JULIA_CUDA_HARD_MEMORY_LIMIT=4GiB \
-  julia --project train.jl
-...
-[ Info: Epoch 90 train time 0.031s
-retry_reclaim: freed 2.865 GiB
-[ Info: Epoch 91 train time 0.031s
-[ Info: Epoch 92 train time 0.027s
-retry_reclaim: freed 2.865 GiB
-[ Info: Epoch 93 train time 0.03s
-retry_reclaim: freed 2.873 GiB
-[ Info: Epoch 94 train time 0.031s
-retry_reclaim: freed 2.873 GiB
-[ Info: Epoch 95 train time 0.03s
-retry_reclaim: freed 2.873 GiB
-[ Info: Epoch 96 train time 0.031s
-[ Info: Epoch 97 train time 0.027s
-retry_reclaim: freed 2.873 GiB
-[ Info: Epoch 98 train time 0.031s
-retry_reclaim: freed 2.865 GiB
-[ Info: Epoch 99 train time 0.031s
-retry_reclaim: freed 2.865 GiB
-[ Info: Epoch 100 train time 0.031s
-[ Info: Total time 4.307s
-

With eager garbage collection enabled, more frequent but less costly pauses result in significantly improved performance:

-
❯ JULIA_CUDA_GC_EARLY=true JULIA_CUDA_HARD_MEMORY_LIMIT=4GiB \
-  julia --project wip.jl
-...
-[ Info: Epoch 90 train time 0.031s
-maybe_collect: collected 1.8 GiB
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 91 train time 0.033s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 92 train time 0.031s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 93 train time 0.031s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 94 train time 0.03s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 95 train time 0.03s
-maybe_collect: collected 1.8 GiB
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 96 train time 0.033s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 97 train time 0.03s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 98 train time 0.03s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 99 train time 0.03s
-maybe_collect: collected 1.8 GiB
-[ Info: Epoch 100 train time 0.03s
-[ Info: Total time 3.76s
-

Eager garbage collection is driven by a heuristic that considers the current memory pressure, how much memory was freed during previous collections, and how much time that took. It is possible that the current implementation is not optimal, so if you encounter performance issues, please file an issue.

-

Tracked memory allocations

-

When working with multiple GPUs, it is important to differentiate between the device that memory was allocated on, and the device used to execute code. Practically, this meant that users of CUDA.jl had to manually remember that allocating and using CuArray objects (typically) needed to happen with the same device active. The same is true for streams, which are used to order operations executing on a single GPU.

-

To improve this, CUDA.jl now keeps track of the device that owns the memory, and the stream last used to access it, enabling the package to "do the right thing" when using that memory in kernels or with library functionality. This does not mean that CUDA.jl will automatically switch the active device: We want to keep the user in control of that, as it often makes sense to access memory from another device, if your system supports it.

-

Let's break down what the implications are of this change.

-

1. Using multiple GPUs

-

If you have multiple GPUs, direct P2P access between devices may be possible (e.g., using NVLink, or just over PCIe). In this case, CUDA.jl will now automatically configure the system to allow such access, making it possible to seamlessly use memory allocated on one device in kernels executing on a different device:

-
julia> # Allocate memory on device 0
-       device!(0)
-CuDevice(0): Tesla V100-PCIE-16GB
-julia> a = CuArray([1]);
-
-julia> # Use on device 1
-       device!(1)
-CuDevice(1): Tesla V100S-PCIE-32GB
-julia> a .+ 1;
-

If P2P access between devices is not possible, CUDA.jl will now raise an error instead of throwing an illegal memory access error as it did before:

-
julia> # Use on incompatible device 2
-       device!(2)
-CuDevice(2): NVIDIA GeForce GTX 1080 Ti
-julia> a .+ 1
-ERROR: cannot take the GPU address of inaccessible device memory.
-
-You are trying to use memory from GPU 0 on GPU 2.
-P2P access between these devices is not possible;
-either switch to GPU 0 by calling `CUDA.device!(0)`,
-or copy the data to an array allocated on device 2.
-

As the error message suggests, you can always copy memory between devices using the copyto! function. In this case, CUDA.jl will fall back to staging the copy on the host when P2P access is not possible.

-

2. Using multiple streams

-

Streams are used to order operations executing on a single GPU. In CUDA.jl, every Julia task has its own stream, making it very easy to group independent operations together, and making it possible for the GPU to potentially overlap execution of these operations.

-

Before CUDA.jl v5.4, users had to be careful about synchronizing data used in multiple tasks. It was recommended, for example, to end every data-producing task with an explicit call to synchronize(), or alternatively make sure to device_synchronize() at the start of a data-consuming task. Now that CUDA.jl keeps track of the stream used to last access memory, it can automatically synchronize streams when needed:

-
# Allocate some data
-a = CUDA.zeros(4096, 4096)
-b = CUDA.zeros(4096, 4096)
-#synchronize()  # No longer needed
-
-# Perform work on a task
-t = @async begin
-  a * b
-  #synchronize()  # No longer needed
-end
-
-# Fetch the results
-c = fetch(t)
-

3. Using capturing APIs

-

All of the above is implemented by piggybacking on the function that converts memory objects to pointers, in the assumption that this will be the final operation before the memory is used. This is generally true, with one important exception: APIs that capture memory. For example, when recording an operation using the CUDA graph APIs, a memory address may be captured and used later without CUDA.jl being aware of it.

-

CUDA.jl accounts for this by detecting conversions during stream capture; however, some APIs may not be covered yet. If you encounter issues with capturing APIs, let us know, and keep using additional synchronization calls to ensure correctness.

-

Unified memory iteration

-

Unified memory is a feature of CUDA that allows memory to be accessed from both the CPU and the GPU. We have now greatly improved the performance of using unified memory with CPU code that iterates over elements of a CuArray. Although this is typically unwanted, triggering the dreaded "scalar indexing" error when accessing device memory in such a way, it can be useful when incrementally porting code to the GPU.

-

Concretely, accessing elements of a unified CuArray on the CPU is much faster now:

-
julia> # Reference
-       a = [1];
-julia> @btime $a[];
-  1.959 ns (0 allocations: 0 bytes)
-
-julia> b = cu(a; unified=true);
-
-julia> # Before
-       @btime $b[];
-  2.617 μs (0 allocations: 0 bytes)
-
-julia> # After
-       @btime $b[];
-  4.140 ns (0 allocations: 0 bytes)
-

Notice the different unit! This has a massive impact on real-life performance, for example, as demonstrated by calling foldl which does not have a GPU-optimized implementation:

-
julia> a = cu(rand(1024, 1024); unified=true);
-
-julia> # Before
-       @b foldl(+, a)
-4.210 s (9 allocs: 208 bytes, without a warmup)
-
-julia> # After
-       @b foldl(+, a)
-3.107 ms (9 allocs: 208 bytes)
-

For completeness, doing this with regular device memory triggers a scalar indexing error:

-
julia> a = cu(rand(1024, 1024));
-
-julia> foldl(+, a)
-ERROR: Scalar indexing is disallowed.
-

These changes should make it easier to port applications to the GPU by incrementally moving parts of the codebase to the GPU without having to worry about the performance of accessing memory from the CPU. The only requirement is to use unified memory, e.g., by calling cu with unified=true, or setting the CUDA.jl preference default_memory to use unified memory by default. However, as unified memory comes with a slight cost, and results in synchronous allocation behavior, it is still recommended to switch back to regular device memory when your application has been fully ported to the GPU.

-

Other changes

-

To keep this post from becoming even longer, a quick rundown of other changes:

-
    -
  • @wsmoses introduced initial support for automatic differentiation of heterogeneous host/device code using Enzyme.jl. Before, you would have to differentiate through host and device code separately, and manually set up rules for crossing the host/device boundary. Now, you can differentiate through entire applications with ease;

    -
  • -
  • CUDA.@profile now automatically detects external profilers, so it should not be required to specify external=true anymore when running under NSight;

    -
  • -
  • Exception output has been improved, only reporting a single error message instead of generating output on each thread, and better forwarding the exception type;

    -
  • -
  • Cached handles from libraries will now be freed when under memory pressure;

    -
  • -
  • Tegra devices are now supported by our artifacts, obviating the use of a local toolkit;

    -
  • -
  • Support for CUDA 12.5 has been added, as well as initial support for Julia 1.12.

    -
  • -
-]]>
- - Tue, 28 May 2024 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[oneAPI.jl 1.5: Ponte Vecchio support and oneMKL improvements]]> - https://juliagpu.org/post/2024-05-24-oneapi_1.5/index.html - https://juliagpu.org/2024-05-24-oneapi_1.5/ - - - oneAPI.jl v1.5 is a significant release that brings many new features, from extended hardware support to greatly improved wrappers of the oneMKL math library.

-

Intel Ponte Vecchio

-

In oneAPI.jl v1.5 we introduce support for the Intel Ponte Vecchio (PVC) architecture, which powers the Xe HPC GPUs as found in the Aurora supercomputer:

-
julia> oneAPI.versioninfo()
-Binary dependencies:
-- NEO: 24.13.29138+0
-- libigc: 1.0.16510+0
-- gmmlib: 22.3.18+0
-- SPIRV_LLVM_Translator_unified: 0.4.0+0
-- SPIRV_Tools: 2023.2.0+0
-
-Toolchain:
-- Julia: 1.10.3
-- LLVM: 15.0.7
-
-1 driver:
-- 00000000-0000-0000-17d2-6b1e010371d2 (v1.3.29138, API v1.3.0)
-
-16 devices:
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-- Intel(R) Data Center GPU Max 1550
-

Apart from a handful of MKL-related issues, oneAPI.jl is fully functional on PVC, and passes all tests.

-

oneMKL wrappers

-

Thanks to the work of @amontoison, oneAPI.jl now provides greatly improved wrappers of the oneMKL library. This includes support for:

-
    -
  • LAPACK: geqrf(_batched), orgqr(_batched), ormqr, potrf(_batched), potrs(_batched), getrf(_batched), getri(_batched), gebrd, gesvd, syevd, heevd, sygvd, hegvd

    -
  • -
  • Sparse arrays: sparse_gemm, sparse_gemv, sparse_symv, sparse_trmv, sparse_trsv, sparse_optimize_gemv, sparse_optimize_trsv

    -
  • -
-

Where possible, these functions are integrated with standard library interfaces, e.g., making it possible to simply call eigen, or to multiply two oneSparseMatrixCSRs.

-

Minor changes

-

There have of course been many other changes and improvements in oneAPI.jl v1.5. For a full list, please refer to the release notes, but some highlights include:

-
    -
  • a new launch configuration heuristic that should generally improve performance;

    -
  • -
  • broadcast now preserves the buffer type (host, device, or shared);

    -
  • -
  • support for very large arrays that exceed the default device memory limit;

    -
  • -
  • several toolchain bumps, with v1.5 using oneAPI 2024.1.0 with driver 24.13.29138.7;

    -
  • -
  • minimal support for native Windows (next to WSL, which is fully supported).

    -
  • -
-]]>
- - Fri, 24 May 2024 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 5.2 and 5.3: Maintenance releases]]> - https://juliagpu.org/post/2024-04-26-cuda_5.2_5.3/index.html - https://juliagpu.org/2024-04-26-cuda_5.2_5.3/ - - - CUDA.jl 5.2 and 5.3 are two minor releases of CUDA.jl that mostly focus on bug fixes and minor improvements, but also come with a number of interesting new features. This blog post summarizes the changes in these releases.

-

Profiler improvements

-

CUDA.jl 5.1 introduced a new native profiler, which can be used to profile Julia GPU applications without having to use NSight Systems or other external tools. The tool has seen continued development, mostly improving its robustness, but CUDA.jl now also provides a @bprofile equivalent that runs your application multiple times and reports on the time distribution of individual events:

-
julia> CUDA.@bprofile CuArray([1]) .+ 1
-Profiler ran for 1.0 s, capturing 1427349 events.Host-side activity: calling CUDA APIs took 792.95 ms (79.29% of the trace)
-┌──────────┬────────────┬────────┬───────────────────────────────────────┬─────────────────────────┐
-│ Time (%) │ Total time │  Calls │ Time distribution                     │ Name                    │
-├──────────┼────────────┼────────┼───────────────────────────────────────┼─────────────────────────┤
-│   19.27% │  192.67 ms │ 109796 │   1.75 µs ± 10.19  (  0.95 ‥ 1279.83) │ cuMemAllocFromPoolAsync │
-│   17.08% │   170.8 ms │  54898 │   3.11 µs ± 0.27   (  2.15 ‥ 23.84)   │ cuLaunchKernel          │
-│   16.77% │  167.67 ms │  54898 │   3.05 µs ± 0.24   (  0.48 ‥ 16.69)   │ cuCtxSynchronize        │
-│   14.11% │  141.12 ms │  54898 │   2.57 µs ± 0.79   (  1.67 ‥ 70.57)   │ cuMemcpyHtoDAsync       │
-│    1.70% │   17.04 ms │  54898 │ 310.36 ns ± 132.89 (238.42 ‥ 5483.63) │ cuStreamSynchronize     │
-└──────────┴────────────┴────────┴───────────────────────────────────────┴─────────────────────────┘Device-side activity: GPU was busy for 87.38 ms (8.74% of the trace)
-┌──────────┬────────────┬───────┬───────────────────────────────────────┬────────────────────┐
-│ Time (%) │ Total time │ Calls │ Time distribution                     │ Name               │
-├──────────┼────────────┼───────┼───────────────────────────────────────┼────────────────────┤
-│    6.66% │   66.61 ms │ 54898 │   1.21 µs ± 0.16   (  0.95 ‥ 1.67)    │ kernel             │
-│    2.08% │   20.77 ms │ 54898 │ 378.42 ns ± 147.66 (238.42 ‥ 1192.09) │ [copy to device]   │
-└──────────┴────────────┴───────┴───────────────────────────────────────┴────────────────────┘NVTX ranges:
-┌──────────┬────────────┬───────┬────────────────────────────────────────┬─────────────────────┐
-│ Time (%) │ Total time │ Calls │ Time distribution                      │ Name                │
-├──────────┼────────────┼───────┼────────────────────────────────────────┼─────────────────────┤
-│   98.99% │  989.94 ms │ 54898 │  18.03 µs ± 49.88  ( 15.26 ‥ 10731.22) │ @bprofile.iteration │
-└──────────┴────────────┴───────┴────────────────────────────────────────┴─────────────────────┘
-

By default, CUDA.@bprofile runs the application for 1 second, but this can be adjusted using the time keyword argument.

-

Display of the time distribution isn't limited to CUDA.@bprofile, and will also be used by CUDA.@profile when any operation is called more than once. For example, with the broadcasting example from above we allocate both the input CuArray and the broadcast result, which results in two calls to the allocator:

-
julia> CUDA.@profile CuArray([1]) .+ 1Host-side activity:
-┌──────────┬────────────┬───────┬─────────────────────────────────────┬─────────────────────────┐
-│ Time (%) │ Total time │ Calls │ Time distribution                   │ Name                    │
-├──────────┼────────────┼───────┼─────────────────────────────────────┼─────────────────────────┤
-│   99.92% │   99.42 ms │     1 │                                     │ cuMemcpyHtoDAsync       │
-│    0.02% │   21.22 µs │     2 │  10.61 µs ± 6.57   (  5.96 ‥ 15.26) │ cuMemAllocFromPoolAsync │
-│    0.02% │   17.88 µs │     1 │                                     │ cuLaunchKernel          │
-│    0.00% │  953.67 ns │     1 │                                     │ cuStreamSynchronize     │
-└──────────┴────────────┴───────┴─────────────────────────────────────┴─────────────────────────┘
-

It is also not required anymore to specify external=true when using CUDA.@profile in combination with a tool like NSight Systems, as CUDA.jl will automatically detect the presence of an external profiler:

-
shell> nsys launch julia# warm-up
-julia> CuArray([1]).+1
-1-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
- 2julia> CUDA.@profile CuArray([1]).+1
-[ Info: This Julia session is already being profiled; defaulting to the external profiler.
-Capture range started in the application.
-Capture range ended in the application.
-Generating '/tmp/nsys-report-c42f.qdstrm'
-[1/1] [========================100%] report1.nsys-rep
-

In case that detection fails, the external keyword argument remains available (but do file an issue).

-

Kernel launch debugging

-

A common issue with CUDA programming is that kernel launches may fail when exhausting certain resources, such as shared memory or registers. This typically results in a cryptic error message, but CUDA.jl will now try to diagnose launch failures and provide a more helpful error message, as suggested by @simonbyrne:

-

For example, when using more parameter memory than allowed by the architecture:

-
julia> kernel(x) = nothing
-julia> @cuda kernel(ntuple(_->UInt64(1), 2^13))
-ERROR: Kernel invocation uses too much parameter memory.
-64.016 KiB exceeds the 31.996 KiB limit imposed by sm_89 / PTX v8.2.
-

Or when using an invalid launch configuration, violating a device limit:

-
julia> @cuda threads=2000 identity(nothing)
-ERROR: Number of threads in x-dimension exceeds device limit (2000 > 1024).
-caused by: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
-

We also diagnose launch failures that involve kernel-specific limits, such as exceeding the number of threads that are allowed in a block (e.g., because of register use):

-
julia> @cuda threads=1024 heavy_kernel()
-ERROR: Number of threads per block exceeds kernel limit (1024 > 512).
-caused by: CUDA error: invalid argument (code 1, ERROR_INVALID_VALUE)
-

Sorting improvements

-

Thanks to @xaellison, our bitonic sorting implementation now supports sorting specific dimensions, making it possible to implement sortperm for multi-dimensional arrays:

-
julia> A = cu([8 7; 5 6])
-2×2 CuArray{Int64, 2, Mem.DeviceBuffer}:
- 8  7
- 5  6julia> sortperm(A, dims = 1)
-2×2 CuArray{Int64, 2, Mem.DeviceBuffer}:
- 2  4
- 1  3julia> sortperm(A, dims = 2)
-2×2 CuArray{Int64, 2, Mem.DeviceBuffer}:
- 3  1
- 2  4
-

The bitonic kernel is now used for all sorting operations, in favor of the often slower quicksort implementation:

-
# before (quicksort)
-julia> @btime CUDA.@sync sort($(CUDA.rand(1024, 1024)); dims=1)
-  2.760 ms (30 allocations: 1.02 KiB)# after (bitonic sort)
-julia> @btime CUDA.@sync sort($(CUDA.rand(1024, 1024)); dims=1)
-  246.386 μs (567 allocations: 13.66 KiB)# reference CPU time
-julia> @btime sort($(rand(Float32, 1024, 1024)); dims=1)
-  4.795 ms (1030 allocations: 5.07 MiB)
-

Unified memory fixes

-

CUDA.jl 5.1 greatly improved support for unified memory, and this has continued in CUDA.jl 5.2 and 5.3. Most notably, when broadcasting CuArrays we now correctly preserve the memory type of the input arrays. This means that if you broadcast a CuArray that is allocated as unified memory, the result will also be allocated as unified memory. In case of a conflict, e.g. broadcasting a unified CuArray with one backed by device memory, we will prefer unified memory:

-
julia> cu([1]; host=true) .+ 1
-1-element CuArray{Int64, 1, Mem.HostBuffer}:
- 2julia> cu([1]; host=true) .+ cu([2]; device=true)
-1-element CuArray{Int64, 1, Mem.UnifiedBuffer}:
- 3
-

Software updates

-

Finally, we also did routine updates of the software stack, supporting the latest and greatest by NVIDIA. This includes support for CUDA 12.4 (Update 1), cuDNN 9, and cuTENSOR 2.0. This latest release of cuTENSOR is noteworthy as it revamps the API in a backwards-incompatible way, and CUDA.jl has opted to follow this change. For more details, refer to the cuTENSOR 2 migration guide by NVIDIA.

-

Of course, cuTENSOR.jl also provides a high-level Julia API which has been mostly unaffected by these changes:

-
using CUDA
-A = CUDA.rand(7, 8, 3, 2)
-B = CUDA.rand(3, 2, 2, 8)
-C = CUDA.rand(3, 3, 7, 2)using cuTENSOR
-tA = CuTensor(A, ['a', 'f', 'b', 'e'])
-tB = CuTensor(B, ['c', 'e', 'd', 'f'])
-tC = CuTensor(C, ['b', 'c', 'a', 'd'])using LinearAlgebra
-mul!(tC, tA, tB)
-

This API is still quite underdeveloped, so if you are a user of cuTENSOR.jl and have to adapt to the new API, now is a good time to consider improving the high-level interface instead!

-

Future releases

-

The next release of CUDA.jl is gearing up to be a much larger release, with significant changes to both the API and internals of the package. Although the intent is to keep these changes non-breaking, it is always possible that some code will be affected in unexpected ways, so we encourage users to test the upcoming release by simply running ] add CUDA#master and report any issues.

-]]>
- - Fri, 26 Apr 2024 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 5.1: Unified memory and cooperative groups]]> - https://juliagpu.org/post/2023-11-07-cuda_5.1/index.html - https://juliagpu.org/2023-11-07-cuda_5.1/ - - - CUDA.jl 5.1 greatly improves the support of two important parts of the CUDA toolkit: unified memory, for accessing GPU memory on the CPU and vice-versa, and cooperative groups which offer a more modular approach to kernel programming.

-

Unified memory

-

Unified memory is a feature of CUDA that allows the programmer to access memory from both the CPU and GPU, relying on the driver to move data between the two. This can be useful for a variety of reasons: to avoid explicit memory copies, to use more memory than the GPU has available, or to be able to incrementally port code to the GPU and still have parts of the application run on the CPU.

-

CUDA.jl did already support unified memory, but only for the most basic use cases. With CUDA.jl 5.1, it is now easier to allocate unified memory, and more convenient to use that memory from the CPU:

-
julia> gpu = cu([1., 2.]; unified=true)
-2-element CuArray{Float32, 1, CUDA.Mem.UnifiedBuffer}:
- 1.0
- 2.0julia> # accessing GPU memory from the CPU
-       gpu[1] = 3;julia> gpu
-2-element CuArray{Float32, 1, CUDA.Mem.UnifiedBuffer}:
- 3.0
- 2.0
-

Accessing GPU memory like this used to throw an error, but with CUDA.jl 5.1 it is safe and efficient to perform scalar iteration on CuArrays backed by unified memory. This greatly simplifies porting applications to the GPU, as it no longer is a problem when code uses AbstractArray fallbacks from Base that process element by element.

-

In addition, CUDA.jl 5.1 also makes it easier to convert CuArrays to Array objects. This is important when wanting to use high-performance CPU libraries like BLAS or LAPACK which do not support CuArrays:

-
julia> cpu = unsafe_wrap(Array, gpu)
-2-element Vector{Float32}:
- 3.0
- 2.0julia> LinearAlgebra.BLAS.scal!(2f0, cpu);julia> gpu
-2-element CuArray{Float32, 1, CUDA.Mem.UnifiedBuffer}:
- 6.0
- 4.0
-

The reverse is also possible: CPU-based Arrays can now trivially be converted to CuArray objects for use on the GPU, without the need to explicitly allocate unified memory. This further simplifies memory management, as it makes it possible to use the GPU inside of an existing application without having to copy data into a CuArray:

-
julia> gpu = unsafe_wrap(CuArray, cpu)
-2-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 1
- 2julia> CUDA.@sync gpu .+= 1;julia> cpu
-2-element Vector{Int64}:
- 2
- 3
-

Note that the above methods are prefixed unsafe because of how they require careful management of object lifetimes: When creating an Array from a CuArray, the CuArray must be kept alive for as long as the Array is used, and vice-versa when creating a CuArray from an Array. Explicit synchronization (i.e. waiting for the GPU to finish computing) is also required, as CUDA.jl cannot synchronize automatically when accessing GPU memory through a CPU pointer.

-

For now, CUDA.jl still defaults to device memory for unspecified allocations. This can be changed using the default_memory preference of the CUDA.jl module, which can be set to either "device", "unified" or "host". When these changes have been sufficiently tested, and the remaining rough edges have been smoothed out, we may consider switching the default allocator.

-

Cooperative groups

-

Another major improvement in CUDA.jl 5.1 is the greatly expanded set of wrappers for the CUDA cooperative groups API. Cooperative groups are a low-level feature of CUDA that makes it possible to write kernels that are more flexible than the traditional approach of differentiating computations based on thread and block indices. Instead, cooperative groups allow the programmer to use objects representing groups of threads, pass those around, and differentiate computations based on queries on those objects.

-

For example, let's port the example from the introductory NVIDIA blog post, which provides a function to compute the sum of an array in parallel:

-
function reduce_sum(group, temp, val)
-    lane = CG.thread_rank(group)    # Each iteration halves the number of active threads
-    # Each thread adds its partial sum[i] to sum[lane+i]
-    i = CG.num_threads(group) ÷ 2
-    while i > 0
-        temp[lane] = val
-        CG.sync(group)
-        if lane <= i
-            val += temp[lane + i]
-        end
-        CG.sync(group)
-        i ÷= 2
-    end    return val  # note: only thread 1 will return full sum
-end
-

When the threads of a group call this function, they cooperatively compute the sum of the values passed by each thread in the group. For example, let's write a kernel that calls this function using a group representing the current thread block:

-
function sum_kernel_block(sum::AbstractArray{T},
-                          input::AbstractArray{T}) where T
-    # have each thread compute a partial sum
-    my_sum = thread_sum(input)    # perform a cooperative summation
-    temp = CuStaticSharedArray(T, 256)
-    g = CG.this_thread_block()
-    block_sum = reduce_sum(g, temp, my_sum)    # combine the block sums
-    if CG.thread_rank(g) == 1
-        CUDA.@atomic sum[] += block_sum
-    end    return
-endfunction thread_sum(input::AbstractArray{T}) where T
-    sum = zero(T)    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
-    stride = blockDim().x * gridDim().x
-    while i <= length(input)
-        sum += input[i]
-        i += stride
-    end    return sum
-endn = 1<<24
-threads = 256
-blocks = cld(n, threads)data = CUDA.rand(n)
-sum = CUDA.fill(zero(eltype(data)), 1)
-@cuda threads=threads blocks=blocks sum_kernel_block(sum, data)
-

This style of programming makes it possible to write kernels that are safer and more modular than traditional kernels. Some CUDA features also require the use of cooperative groups, for example, asynchronous memory copies between global and shared memory are done using the CG.memcpy_async function.

-

With CUDA.jl 5.1, it is now possible to use a large part of these APIs from Julia. Support has been added for implicit groups (with the exception of cluster groups and the deprecated multi-grid groups), all relevant queries on these groups, as well as the many important collective functions, such as shuffle, vote, and memcpy_async. Support for explicit groups is still missing, as are collectives like reduce and invoke. For more information, refer to the CUDA.jl documentation.

-

Other updates

-

Apart from these two major features, CUDA.jl 5.1 also includes a number of smaller fixes and improvements:

-
    -
  • Support for CUDA 12.3

    -
  • -
  • Performance improvements related to memory copies, which regressed in CUDA.jl 5.0

    -
  • -
  • Improvements to the native profiler (CUDA.@profile), now also showing local memory usage, supporting more NVTX metadata, and with better support for Pluto.jl and Jupyter

    -
  • -
  • Many CUSOLVER and CUSPARSE improvements by @amontoison

    -
  • -
-]]>
- - Tue, 07 Nov 2023 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 5.0: Integrated profiler and task synchronization changes]]> - https://juliagpu.org/post/2023-09-19-cuda_5.0/index.html - https://juliagpu.org/2023-09-19-cuda_5.0/ - - - CUDA.jl 5.0 is a major release that adds an integrated profiler to CUDA.jl, and reworks how tasks are synchronized. The release is slightly breaking, as it changes how local toolkits are handled and raises the minimum Julia and CUDA versions.

-

Integrated profiler

-

The most exciting new feature in CUDA.jl 5.0 is the new integrated profiler, which is similar to the @profile macro from the Julia standard library. The profiler can be used by simply prefixing any code that uses the CUDA libraries with CUDA.@profile:

-
julia> CUDA.@profile CUDA.rand(1).+1
-Profiler ran for 268.46 µs, capturing 21 events.Host-side activity: calling CUDA APIs took 230.79 µs (85.97% of the trace)
-┌──────────┬───────────┬───────┬───────────┬───────────┬───────────┬─────────────────────────┐
-│ Time (%) │      Time │ Calls │  Avg time │  Min time │  Max time │ Name                    │
-├──────────┼───────────┼───────┼───────────┼───────────┼───────────┼─────────────────────────┤
-│   76.47% │ 205.28 µs │     1 │ 205.28 µs │ 205.28 µs │ 205.28 µs │ cudaLaunchKernel        │
-│    5.42% │  14.54 µs │     2 │   7.27 µs │   5.01 µs │   9.54 µs │ cuMemAllocFromPoolAsync │
-│    2.93% │   7.87 µs │     1 │   7.87 µs │   7.87 µs │   7.87 µs │ cuLaunchKernel          │
-│    0.36% │ 953.67 ns │     2 │ 476.84 ns │    0.0 ns │ 953.67 ns │ cudaGetLastError        │
-└──────────┴───────────┴───────┴───────────┴───────────┴───────────┴─────────────────────────┘Device-side activity: GPU was busy for 2.15 µs (0.80% of the trace)
-┌──────────┬───────────┬───────┬───────────┬───────────┬───────────┬──────────────────────────────
-│ Time (%) │      Time │ Calls │  Avg time │  Min time │  Max time │ Name                        ⋯
-├──────────┼───────────┼───────┼───────────┼───────────┼───────────┼──────────────────────────────
-│    0.44% │   1.19 µs │     1 │   1.19 µs │   1.19 µs │   1.19 µs │ _Z13gen_sequencedI17curandS ⋯
-│    0.36% │ 953.67 ns │     1 │ 953.67 ns │ 953.67 ns │ 953.67 ns │ _Z16broadcast_kernel15CuKer ⋯
-└──────────┴───────────┴───────┴───────────┴───────────┴───────────┴──────────────────────────────
-                                                                                  1 column omitted
-1-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
- 1.7242923
-

The output shown above is a summary of what happened during the execution of the code. It is split into two sections: host-side activity, i.e., API calls to the CUDA libraries, and the resulting device-side activity. As part of each section, the output shows the time spent and the ratio to the total execution time. These ratios are important, and a good tool to quickly assess the performance of your code. For example, in the above output, we see that most of the time is spent on the host calling the CUDA libraries, and only very little time is actually spent computing things on the GPU. This indicates that the GPU is severely underutilized, which can be solved by increasing the problem size.

-

Instead of a summary, it is also possible to view a chronological trace by passing the trace=true keyword argument:

-
julia> CUDA.@profile trace=true CUDA.rand(1).+1;
-Profiler ran for 262.98 µs, capturing 21 events.Host-side activity: calling CUDA APIs took 227.21 µs (86.40% of the trace)
-┌────┬───────────┬───────────┬─────────────────────────┬────────────────────────┐
-│ ID │     Start │      Time │                    Name │ Details                │
-├────┼───────────┼───────────┼─────────────────────────┼────────────────────────┤
-│  5 │   6.44 µs │   9.06 µs │ cuMemAllocFromPoolAsync │ 4 bytes, device memory │
-│  7 │  19.31 µs │ 715.26 ns │        cudaGetLastError │ -                      │
-│  8 │  22.41 µs │ 204.09 µs │        cudaLaunchKernel │ -                      │
-│  9 │ 227.21 µs │    0.0 ns │        cudaGetLastError │ -                      │
-│ 14 │  232.7 µs │   3.58 µs │ cuMemAllocFromPoolAsync │ 4 bytes, device memory │
-│ 18 │ 250.34 µs │   7.39 µs │          cuLaunchKernel │ -                      │
-└────┴───────────┴───────────┴─────────────────────────┴────────────────────────┘Device-side activity: GPU was busy for 2.38 µs (0.91% of the trace)
-┌────┬───────────┬─────────┬─────────┬────────┬──────┬────────────────────────────────────────────
-│ ID │     Start │    Time │ Threads │ Blocks │ Regs │ Name                                      ⋯
-├────┼───────────┼─────────┼─────────┼────────┼──────┼────────────────────────────────────────────
-│  8 │ 225.31 µs │ 1.19 µs │      64 │     64 │   38 │ _Z13gen_sequencedI17curandStateXORWOWfiXa ⋯
-│ 18 │ 257.73 µs │ 1.19 µs │       1 │      1 │   18 │ _Z16broadcast_kernel15CuKernelContext13Cu ⋯
-└────┴───────────┴─────────┴─────────┴────────┴──────┴────────────────────────────────────────────
-                                                                                  1 column omitted
-

Here, we can see a list of events that the profiler captured. Each event has a unique ID, which can be used to correlate host-side and device-side events. For example, we can see that event 8 on the host is a call to cudaLaunchKernel, which corresponds to the execution of a CURAND kernel on the device.

-

The integrated profiler is a great tool to quickly assess the performance of your GPU application, identify bottlenecks, and find opportunities for optimization. For complex applications, however, it is still recommended to use NVIDIA's NSight Systems or Compute profilers, which provide a more detailed, graphical view of what is happening on the GPU.

-

Synchronization on worker threads

-

Another noteworthy change affects how tasks are synchronized. To enable concurrent execution, i.e., to make it possible for other Julia tasks to execute while waiting for the GPU to finish, CUDA.jl used to rely on so-called stream callbacks. These callbacks were a significant source of latency, at least 25us per invocation but sometimes much longer, and have also been slated for deprecation and eventual removal from the CUDA toolkit.

-

Instead, on Julia 1.9 and later, CUDA.jl now uses worker threads to wait for GPU operations to finish. This mechanism is significantly faster, taking around 5us per invocation, but more importantly offers a much more reliable and predictable latency. You can observe this mechanism using the integrated profiler:

-
julia> a = CUDA.rand(1024, 1024, 1024)
-julia> CUDA.@profile trace=true CUDA.@sync a .+ a
-Profiler ran for 12.29 ms, capturing 527 events.Host-side activity: calling CUDA APIs took 11.75 ms (95.64% of the trace)
-┌─────┬───────────┬───────────┬────────┬─────────────────────────┐
-│  ID │     Start │      Time │ Thread │                    Name │
-├─────┼───────────┼───────────┼────────┼─────────────────────────┤
-│   5 │   6.91 µs │  13.59 µs │      1 │ cuMemAllocFromPoolAsync │
-│   9 │  36.72 µs │ 199.56 µs │      1 │          cuLaunchKernel │
-│ 525 │ 510.69 µs │  11.75 ms │      2 │     cuStreamSynchronize │
-└─────┴───────────┴───────────┴────────┴─────────────────────────┘
-

For some users, this may still be too slow, so we have added two mechanisms that disable nonblocking synchronization and simply block the calling thread until the GPU operation finishes. The first is a global setting, which can be enabled by setting the nonblocking_synchronization preference to false, which can be done using Preferences.jl. The second is a fine-grained flag to pass to synchronization functions: synchronize(x; blocking=true), CUDA.@sync blocking=true -..., etc. Both these mechanisms should not be used widely, and are only intended for use in latency-critical code, e.g., when benchmarking or profiling.

-

Local toolkit discovery

-

One of the breaking changes involves how local toolkits are discovered, when opting out of the use of artifacts. Previously, this could be enabled by calling CUDA.set_runtime_version!("local"), which generated a version = "local" preference. We are now changing this into two separate preferences, version and local, where the version preference overrides the version of the CUDA toolkit, and the local preference independently indicates whether to use a local CUDA toolkit or not.

-

Concretely, this means that you will now need to call CUDA.set_runtime_version!(local_toolkit=true) to enable the use of a local toolkit. The toolkit version will be auto-detected, but can be overridden by also passing a version: CUDA.set_runtime_version!(version; local_toolkit=true). This may be necessary when CUDA is not available during precompilation, e.g., on the log-in node of a cluster, or when building a container image.

-

Raised minimum requirements

-

Finally, CUDA.jl 5.0 raises the minimum Julia and CUDA versions. The minimum Julia version is now 1.8, which should be enforced by the Julia package manager. The minimum CUDA toolkit version is now 11.4, but this cannot be enforced by the package manager. As a result, if you need to use an older version of the CUDA toolkit, you will need to pin CUDA.jl to v4.4 or below. The README will maintain a table of supported CUDA toolkit versions.

-

Most users will not be affected by this change: If you use the artifact-provided CUDA toolkit, you will automatically get the latest version supported by your CUDA driver.

-

Other changes

- -]]>
- - Tue, 19 Sep 2023 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[Profiling oneAPI.jl applications with VTune]]> - https://juliagpu.org/post/2023-07-19-oneapi_profiling/index.html - https://juliagpu.org/2023-07-19-oneapi_profiling/ - - - Profiling GPU applications is hard, so this post shows how to use Intel's VTune Profiler to profile GPU applications written in Julia with oneAPI.jl.

-

Because of the asynchronous nature of GPU execution, profiling GPU applications with Julia's tried and tested tools like @profile or even @time can be misleading: They will only show the time spent on the CPU, and will likely report that your application is spending most of its time waiting for the GPU.

-

To get a better understanding of what is happening on the GPU, we need specialized tools. In this post, we'll show how to use Intel's VTune Profiler to profile GPU applications written in Julia using oneAPI.jl.

-

Set-up

-

Start by downloading and installing the Intel VTune Profiler. This does not require administrative permissions, and will install in your home folder under the intel directory. On Linux, binaries will appear in ~/intel/oneapi/vtune/latest/bin64. There are three that are particularly important:

-
    -
  • vtune: a command-line tool to profile applications;

    -
  • -
  • vtune-gui: a graphical user interface to profile applications, or to visualize the results of a command-line profiling session;

    -
  • -
  • vtune-backend: a daemon that creates a web interface for VTune, which you can use to profile applications both locally and remotely.

    -
  • -
-

Hello VTune!

-

Let's start with a simple example: A Julia program that computes the sum of two arrays (i.e., the vadd example from the oneAPI repository):

-
using oneAPIfunction kernel(a, b, c)
-    i = get_global_id()
-    @inbounds c[i] = a[i] + b[i]
-    return
-endfunction vadd(a, b)
-    d_a = oneArray(a)
-    d_b = oneArray(b)
-    d_c = similar(d_a)    @oneapi items=size(d_c) kernel(d_a, d_b, d_c)
-    Array(d_c)
-endfunction main(N=256)
-    a = round.(rand(Float32, N) * 100)
-    b = round.(rand(Float32, N) * 100)
-    c = vadd(a, b)
-end
-main()
-

We've tweaked this example to make it more suited for profiling: We've enclosed the main application in a function so that it gets compiled, and we've increased the array sizes to make the GPU work harder.

-

There are several ways to profile this application. We'll start by demonstrating the command-line interface:

-
$ vtune -collect gpu-offload julia vadd.jlvtune: Collection started.
-vtune: Collection stopped.vtune: Using result path `/home/tim/Julia/pkg/oneAPI/r000gh'
-    GPU Time: 0.002s
-EU Array Stalled/Idle: 100.0% of Elapsed time with GPU busy
- | The percentage of time when the EUs were stalled or idle is high, which has a
- | negative impact on compute-bound applications.
-FPU Utilization: 0.0% of Elapsed time with GPU busy
-...
-

This will run the application, and collect a number of GPU-related metrics. A summary is shown in the terminal, and a more detailed report will be written to a directory in the current working directory. You can open that report with the graphical user interface, possibly even on a different machine:

-
$ vtune-gui r000gh
-

Instrumenting the application

-

The trace we just collected includes the time spent compiling our application, making it difficult to analyze what is happening. To refine the trace, we can instrument our application with Intel's Instrumentation and Tracing Technology (ITT) APIs:

-
    -
  • only start the profiler when we're running code of interest;

    -
  • -
  • add markers to the trace to indicate what is happening.

    -
  • -
-

We can interface with the ITT APIs using the IntelITT.jl package. Let's update our example:

-
using oneAPI, IntelITT# same as beforefunction main(N=256)
-    a = round.(rand(Float32, N) * 100)
-    b = round.(rand(Float32, N) * 100)
-    c = IntelITT.@task "vadd" oneAPI.@sync vadd(a, b)
-end# warm-up
-main()# actual profile
-IntelITT.@collect main()
-

Here, the IntelITT.@collect macro will start and stop the collection, so we should launch VTune with the -start-paused option:

-
$ vtune -collect gpu-offload -start-paused julia vadd.jl
-

In the GUI, we can now clearly see a nicely packed stream of API calls, grouped under the vadd task we added. Note that because API calls are asynchronous, i.e. they return immediately before the GPU has executed them, I grouped them under a oneAPI.@sync call so that the task not only captures the time spent on the CPU, but also the time spent on the GPU. This may not be wanted for your application.

-

VTune timeline

-

Kernel details

-

The timeline view is great for getting an application-level overview of what is happening, but once you've isolated a kernel that doesn't perform as expected, you may want to switch from the GPU Offload to the GPU Compute Hotspots analysis. Here, you get a more detailed view of what's happening during execution on the GPU, including the memory bandwidth and execution properties:

-
$ vtune -collect gpu-hotspots -start-paused julia vadd.jl
-

VTune timeline

-

Many of these analyses can be configured to collect more or less data, at the cost of more or less overhead.

-

Working remotely

-

In many cases, your local system will not have a GPU, and you will want to profile an application running on a remote system. As shown above, you can use the vtune CLI to create a trace and open that locally using vtune-gui, however there is an easier way: The vtune-backend daemon.

-

Start by launching the VTune back-end on the remote system:

-
$ vtune-backend --enable-server-profiling --web-port 8443 --log-to-console
-

If your remote system is directly reachable, you want to add --allow-remote-access --base-url "https://remoteServer:8443". However, most people will need to set up an SSH tunnel:

-
$ ssh -L 8443:localhost:8443 remoteServer
-

You can now access the VTune GUI at https://localhost:8443/. Note that the first time you connect, you will need to do so using the one-time URL that is shown in the terminal where you launched the vtune-backend daemon.

-

The web interface that vtune-backend provides is identical to the GUI from vtune-gui: Start by creating a new project, and configuring an analysis: Select the local VTune profile server, enter the path to the Julia executable along with arguments and a working directory, and select the GPU Offload analysis type:

-

VTune WebUI

-

To start the analysis, click the big blue play button. If you use IntelITT.@collect to restrict the trace to the code of interest, use the second button with the pause symbol.

-

Give it a try!

-

Hopefully, this guide has shed some light on how to accurately profile oneAPI.jl applications using Intel's VTune Profiler. It turns out that one package could significantly benefit from some rigorous profiling: oneAPI.jl! Until now, development has focussed on correctness and usability, leaving considerable room for performance enhancements.

-

If you have access to an Intel GPU and want to gain experience profiling GPU applications with VTune, we encourage you to get involved! A good starting point would be analyzing some of oneAPI.jl's array operations like mapreduce or broadcast to identify potential bottlenecks. For more information or any queries, feel free to open an issue on GitHub, or join the discussion on Slack or Discourse. Your help could make a significant difference!

-]]>
- - Wed, 19 Jul 2023 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[Metal.jl 0.2: Metal Performance Shaders]]> - https://juliagpu.org/post/2023-03-03-metal_0.2/index.html - https://juliagpu.org/2023-03-03-metal_0.2/ - - - Metal.jl 0.2 marks a significant milestone in the development of the Metal.jl package. The release comes with initial support for the Metal Performance Shaders (MPS) framework for accelerating common operations like matrix multiplications, as well as various improvements for writing Metal kernels in Julia.

-

Metal Performance Shaders

-

Quoting the Apple documentation, The Metal Performance Shaders (MPS) framework contains a collection of highly optimized compute and graphics shaders for use in Metal applications. With Metal.jl 0.2, we have added initial support for this framework, and used it to accelerate the matrix multiplication operation:

-
julia> using Metal, LinearAlgebra, BenchmarkTools
-julia> n = p = m = 2048
-julia> flops = n*m*(2p-1)
-17175674880julia> a = MtlArray(rand(Float32, n, p));
-julia> b = MtlArray(rand(Float32, p, m));
-julia> c = MtlArray(zeros(Float32, n, m));julia> using LinearAlgebra
-julia> bench = @benchmark Metal.@sync mul!(c, a, b)
-BenchmarkTools.Trial: 518 samples with 1 evaluation.
- Range (min … max):  9.366 ms …  13.354 ms  ┊ GC (min … max): 0.00% … 0.00%
- Time  (median):     9.629 ms               ┊ GC (median):    0.00%
- Time  (mean ± σ):   9.646 ms ± 192.169 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%               ▃▂▅▅▆▆▆▇█▇▇▆▅▄▄▁▁ ▁
-  ▄▁▄▄▄▄▆▆▆▄▄▁▇█████████████████▄█▄▁▆▁▄▁▆▁▇▁▄▄▁▁▄▄▇▁▄▆▄▁▁▁▁▁▄ █
-  9.37 ms      Histogram: log(frequency) by time      10.1 ms < Memory estimate: 352 bytes, allocs estimate: 12.julia> flops / (minimum(bench.times)/1e9)
-1.83e12
-

The benchmark above shows that on an 8-core M1 Pro matrix multiplication now reaches 1.8 TFLOPS (out of the 2.6 TFLOPS of theoretical performance). The accelerated matrix multiplication is available for a variety of input types, including mixed-mode operations, and as shown above is integrated with the LinearAlgebra.jl mul! interface.

-

Of course, the MPS framework offers more than just matrix multiplication, and we expect to support more of it in the future. If you have a specific operation you would like to use from Julia, please let us know by opening an issue on the Metal.jl repository.

-

GPU profiling support

-

To support the development of Metal kernels, Max Hawkins has added support for GPU profiling. Similar to how this works in CUDA.jl, you can run code under the Metal.@profile macro to record its execution. However, this does first require setting the METAL_CAPTURE_ENABLED environment flag before importing Metal.jl:

-
julia> ENV["METAL_CAPTURE_ENABLED"] = 1julia> using Metaljulia> a = mtl(rand(1024, 1024))
-julia> Metal.@profile sum(a)
-[ Info: GPU frame capture saved to jl_metal.gputrace/
-

The resulting capture can be opened with Xcode, presenting a timeline that's similar to other profilers:

-
- XCode viewing a Metal.jl capture trace -

Other improvements

-
    -
  • Julia 1.9 is supported, but requires an up-to-date macOS version (issues have been encountered on macOS 12.4);

    -
  • -
  • An mtl function has been added for converting Julia arrays to Metal arrays, similar to the cu function in CUDA.jl;

    -
  • -
  • Multiple GPUs are supported, and the device! function can be used to select one;

    -
  • -
  • Coverage for SIMD Group functions has been improved, so it is now possible to use simdgroup_load, simdgroup_store, simdgroup_multiply, and simdgroup_multiply_accumulate in kernel functions.

    -
  • -
-

Future work

-

Although Metal.jl is now usable for a variety of applications, there is still work to be done before it can be considered production-ready. In particular:

-
    -
  • there are known performance issues with mapreduce, and other operations that rely on CartesianIndices;

    -
  • -
  • the libcmt wrapper library for interfacing with the Metal APIs is cumbersome to use and improve, and we are looking into native ObjectiveC FFI instead;

    -
  • -
  • the MPS wrappers are incomplete and, similar to the Metal APIs, require a replacement for libcmt to be improved;

    -
  • -
  • support for atomic operations is missing, which is required to implement a full-featured KernelAbstractions.jl back-end.

    -
  • -
-

Once (most of) these issues are addressed, we should be able to release Metal.jl 1.0.

-]]>
- - Fri, 03 Mar 2023 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[oneAPI.jl 1.0: oneMKL, Intel Arc and Julia 1.9]]> - https://juliagpu.org/post/2023-02-08-oneapi_1.0/index.html - https://juliagpu.org/2023-02-08-oneapi_1.0/ - - - The release of oneAPI.jl 1.0 adds integration with the oneAPI Math Kernel Library (oneMKL) to accelerate linear algebra operations on Intel GPUs. It also brings support for Julia 1.9 and Intel Arc GPUs.

-

oneMKL integration

-

oneAPI.jl now uses the Intel oneAPI Math Kernel Library (oneMKL), automatically downloaded as part of oneAPI_Support_jll.jl, to accelerate a great number of BLAS and LAPACK operations on Intel GPUs. Similar to how it is implemented in our other GPU back-ends, these wrappers are available at different levels of abstraction.

-

At the lowest level, we use a C library that wraps the oneMKL C++ APIs. For example, the oneapi::mkl::blas::column_major::gemm function for matrix-matrix multiplication is wrapped by the C functions onemklSgemm, onemklDgemm, etc. These wrappers are used to implement low-level methods like oneMKL.gemm!:

-
julia> using oneAPIjulia> A = oneArray(rand(Float32, 2, 3));
-2×3 oneMatrix{Float32, oneAPI.oneL0.DeviceBuffer}:
- 0.44302   0.125576  0.859145
- 0.674291  0.428346  0.0400119
-julia> B = oneArray(rand(Float32, 3, 4))
-3×4 oneMatrix{Float32, oneAPI.oneL0.DeviceBuffer}:
- 0.592748   0.529413   0.0323396  0.659528
- 0.22489    0.0872259  0.253291   0.376519
- 0.0121506  0.591135   0.706755   0.751686
-julia> C = similar(B, (2, 4));julia> oneMKL.gemm!('N', 'N', true, A, B, true, C)
-2×4 oneMatrix{Float32, oneAPI.oneL0.DeviceBuffer}:
- 0.301279  0.753365  0.65334   0.985274
- 0.496501  0.417994  0.158581  0.63607julia> Array(C) ≈ Array(A) * Array(B)
-true
-

Of course, these low-level functions aren't very user-friendly, so we also integrate with Julia's standard libraries where possible:

-
julia> A = oneArray(rand(Float32, 2, 3));
-julia> B = oneArray(rand(Float32, 3, 4));julia> using LinearAlgebra
-julia> C = A * B;julia> Array(C) ≈ Array(A) * Array(B)
-true
-

The most frequently used oneMKL BLAS functions have been wrapped and integrated with Julia’s standard linear algebra libraries. If you run into a missing function, please file a request to add it, or take a look at the source and contribute to oneAPI.jl! The current state of the wrappers should make it easy to extend their functionality, as well as form a good basis for integrating with other libraries like oneDNN.

-

Intel Arc support

-

The new Arc series of discrete Intel GPUs are now fully supported by oneAPI.jl. These GPUs offer a significant performance improvement over their integrated predecessors:

-
julia> using oneAPI
-julia> oneAPI.versioninfo()
-1 device:
-- Intel(R) Arc(TM) A770 Graphics [0x56a0]julia> T = Float32;
-julia> n = p = m = 2048;
-julia> a = oneArray(rand(T, n, p));
-julia> b = oneArray(rand(T, p, m));
-julia> c = oneArray(zeros(T, n, m));julia> using BenchmarkTools, LinearAlgebra
-julia> bench = @benchmark oneAPI.@sync mul!(c, a, b)
-BenchmarkTools.Trial: 1510 samples with 1 evaluation.
- Range (min … max):  3.233 ms …  3.791 ms  ┊ GC (min … max): 0.00% … 0.00%
- Time  (median):     3.298 ms              ┊ GC (median):    0.00%
- Time  (mean ± σ):   3.308 ms ± 48.426 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%        ▁▃▄▇█▅▄▃▂   ▁▁▁
-  ▁▁▃▃▅▇██████████████████▇▇▇▅▆▄▅▅▄▂▃▂▂▂▂▂▂▁▂▂▂▁▂▁▂▁▂▂▂▂▁▁▂▂ ▃
-  3.23 ms        Histogram: frequency by time        3.47 ms < Memory estimate: 272 bytes, allocs estimate: 11.julia> flops = n*m*(2p-1)
-17175674880julia> flops / (minimum(bench.times)/1e9)
-5.3131281169900205e12
-

For example, here we're getting over 5 TFlops of Float32 performance, which is over 10x faster than the Intel Xe Graphics G7 we had been previously using for oneAPI.jl development. At the same time, the A770 used above should be able to deliver close to 20 TFlops, so there's still room for improvement in our software stack.

-

To use oneAPI.jl with an Arc series GPU, you need to run Linux 6.2. At the time of writing, that kernel is still in beta, so refer to your distribution's documentation for how to install it. For example, on Arch Linux you can use the linux-mainline package from the AUR, Ubuntu has the kernel-ppa archive, Fedora provides the stable-rc repository, etc.

-

Other changes

-
    -
  • Support for Julia 1.9 has been added.

    -
  • -
-]]>
- - Wed, 08 Feb 2023 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 4.0]]> - https://juliagpu.org/post/2023-02-01-cuda_4.0/index.html - https://juliagpu.org/2023-02-01-cuda_4.0/ - - - CUDA.jl 4.0 is a breaking release that introduces the use of JLLs to provide the CUDA toolkit. This makes it possible to compile other binary libraries against the CUDA runtime, and use them together with CUDA.jl. The release also brings CUSPARSE improvements, the ability to limit memory use, and many bug fixes and performance improvements.

-

JLLs for CUDA artifacts

-

While CUDA.jl has been using binary artifacts for a while, it was manually managing installation and selection of them, i.e., not by using standardised JLL packages. This complicated use of the artifacts by other packages, and made it difficult to build other binary packages against the CUDA runtime.

-

With CUDA.jl 4.0, we now use JLLs to load the CUDA driver and runtime. Specifically, there are two JLLs in play: CUDA_Driver_jll and CUDA_Runtime_jll. The former is responsible for loading the CUDA driver library (possibly upgrading it using a forward-compatible version), and determining the CUDA version that your set-up supports:

-
❯ JULIA_DEBUG=CUDA_Driver_jll julia
-julia> using CUDA_Driver_jll
-┌ System CUDA driver found at libcuda.so.1, detected as version 12.0.0
-└ @ CUDA_Driver_jll
-┌ System CUDA driver is recent enough; not using forward-compatible driver
-└ @ CUDA_Driver_jll
-

With the driver identified and loaded, CUDA_Runtime_jll can select a compatible toolkit. By default, it uses the latest supported toolkit that is compatible with the driver:

-
julia> using CUDA_Runtime_jlljulia> CUDA_Runtime_jll.cuda_toolkits
-10-element Vector{VersionNumber}:
- v"10.2.0"
- v"11.0.0"
- v"11.1.0"
- v"11.2.0"
- v"11.3.0"
- v"11.4.0"
- v"11.5.0"
- v"11.6.0"
- v"11.7.0"
- v"11.8.0"julia> CUDA_Runtime_jll.host_platform
-Linux x86_64 {cuda=11.8}
-

As you can see, the selected CUDA runtime is encoded in the host platform. This makes it possible for Julia to automatically select compatible versions of other binary packages. For example, if we install and load SuiteSparse_GPU_jll, which right now provides builds for CUDA 10.2, 11.0 and 12.0, the artifact resolution code knows to load the build for CUDA 11.0 which is compatible with the selected CUDA 11.8 runtime:

-
julia> using SuiteSparse_GPU_jlljulia> SuiteSparse_GPU_jll.best_wrapper
-"~/.julia/packages/SuiteSparse_GPU_jll/.../x86_64-linux-gnu-cuda+11.0.jl"
-

The change to JLLs requires a breaking change: the JULIA_CUDA_VERSION and JULIA_CUDA_USE_BINARYBUILDER environment variables have been removed, and are replaced by preferences that are set in the current environment. For convenience, you can set these preferences by calling CUDA.set_runtime_version!:

-
❯ julia --project
-julia> using CUDA
-julia> CUDA.runtime_version()
-v"11.8.0"julia> CUDA.set_runtime_version!(v"11.7")
-┌ Set CUDA Runtime version preference to 11.7,
-└ please re-start Julia for this to take effect.❯ julia --project
-julia> using CUDA
-julia> CUDA.runtime_version()
-v"11.7.0"julia> using CUDA_Runtime_jll
-julia> CUDA_Runtime_jll.host_platform
-Linux x86_64 {cuda=11.7}
-

The changed preference is reflected in the host platform, which means that you can use this mechanism to load different builds of other binary packages. For example, if you rely on a package or JLL that does not yet have a build for CUDA 12, you could set the preference to v"11.x" to load an available build.

-

For discovering a local runtime, you can set the version to "local", which will replace the use of CUDA_Runtime_jll by CUDA_Runtime_Discovery.jl, an API-compatible package that replaces the JLL with a local runtime discovery mechanism:

-
❯ julia --project
-julia> CUDA.set_runtime_version!("local")
-┌ Set CUDA Runtime version preference to local,
-└ please re-start Julia for this to take effect.❯ JULIA_DEBUG=CUDA_Runtime_Discovery julia --project
-julia> using CUDA
-┌ Looking for CUDA toolkit via environment variables CUDA_PATH
-└ @ CUDA_Runtime_Discovery
-┌ Looking for binary ptxas in /opt/cuda
-│   all_locations =
-│    2-element Vector{String}:
-│     "/opt/cuda"
-│     "/opt/cuda/bin"
-└ @ CUDA_Runtime_Discovery
-┌ Debug: Found ptxas at /opt/cuda/bin/ptxas
-└ @ CUDA_Runtime_Discovery
-...
-

Memory limits

-

By popular demand, support for memory limits has been reinstated. This functionality had been removed after the switch to CUDA memory pools, as the memory pool allocator does not yet support memory limits. Awaiting improvements by NVIDIA, we have added functionality to impose memory limits from the Julia side, in the form of two environment variables:

-
    -
  • JULIA_CUDA_SOFT_MEMORY_LIMIT: This is an advisory limit, used to configure the memory pool, which will result in the pool being shrunk down to the requested limit at every synchronization point. That means that the pool may temporarily grow beyond the limit. This limit is unavailable when disabling memory pools (with JULIA_CUDA_MEMORY_POOL=none).

    -
  • -
  • JULIA_CUDA_HARD_MEMORY_LIMIT: This is a hard limit, checked before every allocation. Doing so is relatively expensive, so it is recommended to use the soft limit instead.

    -
  • -
-

The value of these variables can be formatted as a number of bytes, optionally followed by a unit, or as a percentage of the total device memory. Examples: 100M, 50%, 1.5GiB, 10000.

-

CUSPARSE improvements

-

Thanks to the work of @amontoison, the CUSPARSE interface has undergone many improvements:

-
    -
  • Better support of the CuSparseMatrixCOO format with, in particular, the addition of CuSparseMatrixCOO * CuVector and CuSparseMatrixCOO * CuMatrix products;

    -
  • -
  • Routines specialized for -, +, * operations between sparse matrices (CuSparseMatrixCOO, CuSparseMatrixCSC and CuSparseMatrixCSR) have been interfaced;

    -
  • -
  • New generic routines for backward and forward sweeps with sparse triangular matrices are now used by \;

    -
  • -
  • CuMatrix * CuSparseVector and CuMatrix * CuSparseMatrix products have been added;

    -
  • -
  • Conversions between sparse and dense matrices have been updated for using more recent and optimized routines;

    -
  • -
  • High-level Julia functions for the new set of sparse BLAS 1 routines such as dot products between CuSparseVector;

    -
  • -
  • Add missing dispatches for mul! and ldiv! functions;

    -
  • -
  • Interfacing of almost all new CUSPARSE routines added by the CUDA toolkits v"11.x".

    -
  • -
-

Other changes

-
    -
  • Removal of the CUDNN, CUTENSOR, CUTENSORNET and CUSTATEVEC submodules: These have been moved into their own packages, respectively cuDNN.jl, cuTENSOR.jl, cuTensorNet.jl and cuStateVec.jl (note the change in capitalization, now following NVIDIA's naming scheme);

    -
  • -
  • Removal of the NVTX submodule: NVTX.jl should be used instead, which is a more complete implementation of the NVTX API;

    -
  • -
  • Support for CUDA 11.8 (support for CUDA 12.0 is being worked on);

    -
  • -
  • Support for Julia 1.9.

    -
  • -
-

Backport releases

-

Because CUDA.jl 4.0 is a breaking release, two additional releases have been made that backport bugfixes and select features:

-
    -
  • CUDA.jl 3.12.1 and 3.12.2: backports of bugfixes since 3.12

    -
  • -
  • CUDA.jl 3.13.0: additionally adding the memory limit functionality

    -
  • -
-]]>
- - Wed, 01 Feb 2023 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[Technical preview: Programming Apple M1 GPUs in Julia with Metal.jl]]> - https://juliagpu.org/post/2022-06-24-metal/index.html - https://juliagpu.org/2022-06-24-metal/ - - - Julia has gained a new GPU back-end: Metal.jl, for working with Apple's M1 GPUs. The back-end is built on the same foundations that make up existing GPU packages like CUDA.jl and AMDGPU.jl, so it should be familiar to anybody who's already programmed GPUs in Julia. In the following post I'll demonstrate some of that functionality and explain how it works.

-

But first, note that Metal.jl is under heavy development: The package is considered experimental for now, as we're still working on squashing bugs and adding essential functionality. We also haven't optimized for performance yet. If you're interested in using Metal.jl, please consider contributing to its development! Most of the package is written in Julia, and checking-out the source code is a single Pkg.develop away :-)

-

Quick start

-

Start by getting a hold of the upcoming Julia 1.8, launch it, and enter the package manager by pressing ]:

-
julia> ]pkg> add Metal
-  Installed Metal
-

Installation is as easy as that, and we'll automatically download the necessary binary artifacts (a C wrapper for the Metal APIs, and an LLVM back-end). Then, leave the package manager by pressing backspace, import the Metal package, and e.g. call the versioninfo() method for some details on the toolchain:

-
julia> using Metaljulia> Metal.versioninfo()
-macOS 13.0.0, Darwin 21.3.0Toolchain:
-- Julia: 1.8.0-rc1
-- LLVM: 13.0.11 device:
-- Apple M1 Pro (64.000 KiB allocated)
-

And there we go! You'll note here that I'm using the upcoming macOS 13 (Ventura); this is currently the only supported operating system. We also only support M-series GPUs, even though Metal does support other GPUs. These choices were made to simplify development, and aren't technical limitations. In fact, Metal.jl does work on e.g. macOS Monterey with an Intel GPU, but it's an untested combination that may suffer from bugs.

-

Array programming

-

Just like our other GPU back-ends, Metal.jl offers an array abstraction that greatly simplifies GPU programming. The abstraction centers around the MtlArray type that can be used to manage memory and perform GPU computations:

-
# allocate + initialize
-julia> a = MtlArray(rand(Float32, 2, 2))
-2×2 MtlArray{Float32, 2}:
- 0.158752  0.836366
- 0.535798  0.153554# perform some GPU-accelerated operations
-julia> b = a * a
-2×2 MtlArray{Float32, 2}:
- 0.473325  0.261202
- 0.167333  0.471702# back to the CPU
-julia> Array(b)
-2×2 Matrix{Float32}:
- 0.473325  0.261202
- 0.167333  0.471702
-

Beyond these simple operations, Julia's higher-order array abstractions can be used to express more complex operations without ever having to write a kernel:

-
julia> mapreduce(sin, +, a; dims=1)
-1×2 MtlArray{Float32, 2}:
- 1.15276  0.584146julia> cos.(a .+ 2) .* 3
-2×2 MtlArray{Float32, 2}:
- -2.0472   -1.25332
- -2.96594  -2.60351
-

Much of this functionality comes from the GPUArrays.jl package, which provides vendor-neutral implementations of common array operations. As a result, MtlArray is already pretty capable, and should be usable with realistic array-based applications.

-

Kernel programming

-

Metal.jl's array operations are implemented in Julia, using our native kernel programming capabilities and accompanying JIT-compiler. A small demonstration:

-
# a simple kernel that sets elements of an array to a value
-function memset_kernel(array, value)
-  i = thread_position_in_grid_1d()
-  if i <= length(array)
-    @inbounds array[i] = value
-  end
-  return
-enda = MtlArray{Float32}(undef, 512)
-@metal threads=512 grid=2 memset_kernel(a, 42)# verify
-@assert all(isequal(42), Array(a))
-

As can be seen here, we've opted to deviate slightly from the Metal Shading Language, instead providing a programming experience that's similar to Julia's existing back-ends. Some key differences:

-
    -
  • we use intrinsic functions instead of special kernel function arguments to access properties like the thread position, grid size, ...;

    -
  • -
  • all types of arguments (buffers, indirect buffers, value-typed inputs) are transparently converted to a GPU-compatible structure[1];

    -
  • -
  • global (task-bound) state is used to keep track of the active device and a queue;

    -
  • -
  • compute pipeline set-up and command encoding is hidden behind a single macro.

    -
  • -
-

Behind the scenes, we compile Julia to LLVM IR and use a tiny LLVM back-end (based on @a2flo's libfloor) that (re)writes the bitcode to a Metal-compatible library containing LLVM 5 bitcode. You can inspect the generated IR using @device_code_metal:

-
julia> @device_code_metal @metal threads=512 grid=2 memset_kernel(a, 42)
-
[header]
-program_count: 1
-...[program]
-name: julia_memset_kernel
-type: kernel
-...
-
target datalayout = "..."
-target triple = "air64-apple-macosx13.0.0"; the (rewritten) kernel function:
-;  - %value argument passed by reference
-;  - %thread_position_in_grid argument added
-;  - sitofp rewritten to AIR-specific intrinsic
-define void @julia_memset_kernel(
-    { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %array,
-    i64 addrspace(1)* %value,
-    i32 %thread_position_in_grid) {
-  ...
-  %9 = tail call float @air.convert.f.f32.s.i64(i64 %7)
-  ...
-  ret void
-}; minimal required argument metadata
-!air.kernel = !{!10}
-!10 = !{void ({ i8 addrspace(1)*, [1 x i64] } addrspace(1)*,
-              i64 addrspace(1)*, i32)* @julia_memset_kernel, !11, !12}
-!12 = !{!13, !14, !15}
-!13 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1,
-       !"air.read_write", !"air.address_space", i32 1,
-       !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 8}
-!14 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1,
-       !"air.read_write", !"air.address_space", i32 1,
-       !"air.arg_type_size", i32 8, !"air.arg_type_align_size", i32 8}
-!15 = !{i32 0, !"air.thread_position_in_grid"}; other metadata not shown, for brevity
-

Shout-out to @max-Hawkins for exploring Metal code generation during his internship at Julia Computing!

-

Metal APIs in Julia

-

Lacking an Objective C or C++ FFI, we interface with the Metal libraries using a shim C library. Most users won't have to interface with Metal directly – the array abstraction is sufficient for many – but more experienced developers can make use of the high-level wrappers that we've designed for the Metal APIs:

-
julia> dev = MtlDevice(1)
-MtlDevice:
-  name:             Apple M1 Pro
-  lowpower:         false
-  headless:         false
-  removable:        false
-  unified memory:   truejulia> desc = MtlHeapDescriptor()
-MtlHeapDescriptor:
-  type:             MtHeapTypeAutomatic
-  storageMode:      MtStorageModePrivate
-  size:             0julia> desc.size = 16384
-16384julia> heap = MtlHeap(dev, desc)
-MtlHeap:
-  type:                 MtHeapTypeAutomatic
-  size:                 16384
-  usedSize:             0
-  currentAllocatedSize: 16384# etc
-

These wrappers are based on @PhilipVinc's excellent work on MetalCore.jl, which formed the basis for (and has been folded into) Metal.jl.

-

What's next?

-

The current release of Metal.jl focusses on code generation capabilities, and is meant as a preview for users and developers to try out on their system or with their specific GPU application. It is not production-ready yet, and is lacking some crucial features:

-
    -
  • performance optimization

    -
  • -
  • integration with Metal Performance Shaders

    -
  • -
  • integration / documentation for use with Xcode tools

    -
  • -
  • fleshing out the array abstraction based on user feedback

    -
  • -
-

Please consider helping out with any of these! Since Metal.jl and its dependencies are almost entirely implemented in Julia, any experience with the language is sufficient to contribute. If you're not certain, or have any questions, please drop by the #gpu channel on the JuliaLang Slack, ask questions on our Discourse, or chat to us during the GPU office hours every other Monday.

-

If you encounter any bugs, feel free to let us know on the Metal.jl issue tracker. For information on upcoming releases, subscribe to this website's blog where we post about significant developments in Julia's GPU ecosystem.

-
-

- - - - -
[1]This relies on Metal 3 from macOS 13, which introduced bindless argument
- buffers, as we didn't fully figure out how to reliably encode arbitrarily-nested indirect buffers in argument encoder metadata.

-]]>
- - Fri, 24 Jun 2022 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[oneAPI.jl status update]]> - https://juliagpu.org/post/2022-04-06-oneapi_update/index.html - https://juliagpu.org/2022-04-06-oneapi_update/ - - - It has been over a year since the last update on oneAPI.jl, the Julia package for programming Intel GPUs (and other accelerators) using the oneAPI toolkit. Since then, the package has been under steady development, and several new features have been added to improve the developer experience and usability of the package.

-

@atomic intrinsics

-

oneAPI.jl now supports atomic operations, which are required to implement a variety of parallel algorithms. Low-level atomic functions (atomic_add!, atomic_xchg!, etc) are available as unexported methods in the oneAPI module:

-
a = oneArray(Int32[0])function kernel(a)
-    oneAPI.atomic_add!(pointer(a), Int32(1))
-    return
-end@oneapi items=256 kernel(a)
-@test Array(a)[1] == 256
-

Note that these methods are only available for those types that are supported by the underlying OpenCL intrinsics. For example, the atomic_add! from above can only be used with Int32 and UInt32 inputs.

-

Most users will instead rely on the higher-level @atomic macro, which can be easily put in front of many array operations to make them behave atomically. To avoid clashing with the new @atomic macro in Julia 1.7, this macro is also unexported:

-
a = oneArray(Int32[0])function kernel(a)
-    oneAPI.@atomic a[1] += Int32(1)
-    return
-end@oneapi items=256 kernel(a)
-@test Array(a)[1] == 256
-

When used with operations that are supported by OpenCL, this macro will lower to calls like atomic_add!. For other operations, a compare-and-exchange loop will be used. Note that for now, this is still restricted to 32-bit operations, as we do not support the cl_khr_int64_base_atomics extension for 64-bit atomics.

-

Initial integration with vendor libraries

-

One significant missing feature is the integration with vendor libraries like oneMKL. These integrations are required to ensure good performance for important operations like matrix multiplication, which currently fall back to generic implementations in Julia that may not always perform as well.

-

To improve this situation, we are working on a wrapper library that allows us to integrate with oneMKL and other oneAPI and SYCL libraries. Currently, only matrix multiplication is supported, but once the infrastructural issues are worked out we expect to quickly support many more operations.

-

If you need support for specific libraries, please have a look at this PR. As the API surface is significant, we will need help to extend the wrapper library and integrate it with high-level Julia libraries like LinearAlgebra.jl.

-

Correctness issues

-

In porting existing Julia GPU applications to oneAPI.jl, we fixed several bugs that caused correctness issues when executing code on Intel GPUs:

-
    -
  • when the garbage collector frees GPU memory, it now blocks until all outstanding commands (which may include uses of said memory) have completed

    -
  • -
  • the barrier function to synchronize threads is now marked as convergent to avoid LLVM miscompilations

    -
  • -
-

Note that if you are using Tiger Lake hardware, there is currently a known issue in the back-end Intel compiler that affects oneAPI.jl, causing correctness issues that can be spotted by running the oneAPI.jl test suite.

-

Future work

-

To significantly improve usability of oneAPI.jl, we will add support for it to the KernelAbstractions.jl package. This library is used by many other packages for adding GPU acceleration to algorithms that cannot be easily expressed using only array operations. As such, support for oneAPI.jl will make it possible to use your oneAPI GPUs with all of these packages.

-]]>
- - Wed, 06 Apr 2022 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 3.5-3.8]]> - https://juliagpu.org/post/2022-01-28-cuda_3.5_3.8/index.html - https://juliagpu.org/2022-01-28-cuda_3.5_3.8/ - - - CUDA.jl versions 3.5 to 3.8 have brought several new features to improve performance and productivity. This blog post will highlight a couple: direct copies between devices, better performance by preserving array index types and changing the memory pool, and a much-improved interface to the compute sanitizer utility.

-

Copies between devices

-

Typically, when sending data between devices you need to stage through the CPU. CUDA.jl now does this automatically, making it possible to directly copy between CuArrays on different devices:

-
julia> device!(0);julia> a = CUDA.rand(2,2)
-2×2 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
- 0.440147  0.986939
- 0.622901  0.698119julia> device!(1);julia> b = CUDA.zeros(2,2);julia> copyto!(b, a)
-2×2 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
- 0.440147  0.986939
- 0.622901  0.698119
-

When your hardware supports it, CUDA.jl will automatically enable so-called peer-to-peer mode, making it possible to copy data directly without going through the CPU. This can result in significant bandwidth improvements and latency reductions. You can check if this mode of communication is possible:

-
julia> src = CuDevice(0)
-CuDevice(0): NVIDIA A100-PCIE-40GBjulia> dst = CuDevice(1)
-CuDevice(1): Tesla V100-PCIE-32GBjulia> can_access_peer(src, dst)
-false
-

In this case, peer-to-peer communication is not possible because the devices have a different compute capability major revision number. With a compatible device, the function reports true:

-
julia> src = CuDevice(1)
-CuDevice(1): Tesla V100-PCIE-32GBjulia> dst = CuDevice(2)
-CuDevice(2): Tesla V100-PCIE-16GBjulia> can_access_peer(src, dst)
-true
-

Thanks to @kshyatt for help with this change!

-

Helper function to use compute-sanitizer

-

The CUDA toolkit comes with a powerful tool to check GPU kernels for common issues like memory errors and race conditions: the compute sanitizer. To make it easier to use this tool, CUDA.jl now ships the binary as part of its artifacts, and provides a helper function to restart Julia under the compute-sanitizer. Let's demonstrate, and trigger a memory error to show what the compute sanitizer can detect:

-
julia> using CUDAjulia> CUDA.run_compute_sanitizer()
-Re-starting your active Julia session...========= COMPUTE-SANITIZER
-julia> using CUDAjulia> unsafe_wrap(CuArray, pointer(CuArray([1])), 2) .= 1
-========= Invalid __global__ write of size 8 bytes
-=========     at 0x2a0 in LLVM/src/interop/base.jl:45:julia_broadcast_kernel_1892(CuKernelContext, CuDeviceArray<Int64, (int)1, (int)1>, Broadcasted<void, Tuple<OneTo<Int64>>, _identity, Broadcasted<Int64>>, Int64)
-=========     by thread (1,0,0) in block (0,0,0)
-=========     Address 0xa64000008 is out of bounds
-=========     and is 1 bytes after the nearest allocation at 0xa64000000 of size 8 bytes
-

Other tools are available too, e.g. racecheck for detecting races or synccheck for finding synchronization issues. These tools can be selected using the tool keyword argument to run_compute_sanitizer.

-

Updated binary dependencies

-

As is common with every release, CUDA.jl now supports newer versions of NVIDIA's tools and libraries:

- -

The update to CUDA toolkit 11.6 comes with improved debug info compatibility. If you need to debug Julia GPU code with tools like compute-sanitizer or cuda-gdb, and you need debug info (the equivalent of nvcc -G), ensure CUDA.jl can use the latest version of the CUDA toolkit.

-

To make it easier to use the latest supported toolkit, CUDA.jl now implements CUDA's so-called Forward Compatibility mode: When your driver is outdated, CUDA.jl will attempt to load a newer version of the CUDA driver library, enabling use of a newer CUDA toolkit and libraries. Note that this is only supported on select hardware, refer to the NVIDIA documentation for more details.

-

Preserving array indices

-

Julia's integers are typically 64 bits wide, which can be wasteful when dealing with GPU indexing intrinsics that are typically only 32 bits wide. CUDA.jl's device array type now carefully preserves the type of indices so that 32-bit indices aren't unnecessarily promoted to 64 bits. With some careful kernel programming (note the use of 0x1 instead of 1 below), this makes it possible to significantly reduce the register pressure surrounding indexing operations, which may be useful in register-constrained situations:

-
julia> function memset(arr, val)
-           i = (blockIdx().x-0x1) * blockDim().x + threadIdx().x
-           @inbounds arr[i] = val
-           return
-       endjulia> CUDA.code_ptx(memset, Tuple{CuDeviceArray{Float32,1,AS.Global},Float32})
-.func julia_memset(.param .b64 arr, .param .b32 val) {
-        .reg .f32       %f<2>;
-        .reg .b32       %r<5>;
-        .reg .b64       %rd<5>;        ld.param.u64    %rd1, [arr];
-        ld.param.f32    %f1, [val];
-        mov.u32         %r1, %ctaid.x;
-        mov.u32         %r2, %ntid.x;
-        mov.u32         %r3, %tid.x;
-        mad.lo.s32      %r4, %r2, %r1, %r3;
-        ld.u64          %rd2, [%rd1];
-        mul.wide.s32    %rd3, %r4, 4;
-        add.s64         %rd4, %rd2, %rd3;
-        st.global.f32   [%rd4], %f1;
-        ret;
-}
-

On CUDA.jl 3.4, this simple function used 3 more 64-bit registers:

-
.func julia_memset(.param .b64 arr, .param .b32 val) {
-        .reg .f32       %f<2>;
-        .reg .b32       %r<5>;
-        .reg .b64       %rd<8>;        ld.param.u64    %rd1, [arr];
-        ld.param.f32    %f1, [val];
-        mov.u32         %r1, %ctaid.x;
-        mov.u32         %r2, %ntid.x;
-        mul.wide.u32    %rd2, %r2, %r1;
-        mov.u32         %r3, %tid.x;
-        add.s32         %r4, %r3, 1;
-        cvt.u64.u32     %rd3, %r4;
-        ld.u64          %rd4, [%rd1];
-        add.s64         %rd5, %rd2, %rd3;
-        shl.b64         %rd6, %rd5, 2;
-        add.s64         %rd7, %rd4, %rd6;
-        st.global.f32   [%rd7+-4], %f1;
-        ret;
-}
-

More aggressive memory management

-

Starting with CUDA.jl 3.8, the memory pool used to allocate CuArrays will be configured differently: The pool will now be allowed to use all available GPU memory, whereas previously all cached memory was released at each synchronization point. This can significantly improve performance, and makes synchronization much cheaper.

-

This behavior can be observed by calling the memory_status() function:

-
julia> CUDA.memory_status()
-Effective GPU memory usage: 13.57% (2.001 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (0 bytes reserved)julia> a = CuArray{Float32}(undef, (1024, 1024, 1024));
-julia> Base.format_bytes(sizeof(a))
-"4.000 GiB"julia> a = nothing
-julia> GC.gc()julia> CUDA.memory_status()
-Effective GPU memory usage: 40.59% (5.988 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (4.000 GiB reserved)
-

So far nothing new. On previous versions of CUDA.jl however, any subsequent synchronization of the GPU (e.g., by copying memory to the CPU) would have resulted in a release of this reserved memory. This is not the case anymore:

-
julia> synchronize()julia> CUDA.memory_status()
-Effective GPU memory usage: 40.59% (5.988 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (4.000 GiB reserved)
-

If you still want to release this memory, you can call the reclaim() function:

-
julia> CUDA.reclaim()julia> CUDA.memory_status()
-Effective GPU memory usage: 13.48% (1.988 GiB/14.751 GiB)
-Memory pool usage: 0 bytes (0 bytes reserved)
-

With interactive Julia sessions, this function is called periodically so that the GPU's memory isn't held on to unnecessarily. Otherwise it shouldn't be necessary to call this function, as memory is freed automatically when it is needed.

-

Minor changes and improvements

- -]]>
- - Fri, 28 Jan 2022 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 3.4]]> - https://juliagpu.org/post/2021-08-13-cuda_3.4/index.html - https://juliagpu.org/2021-08-13-cuda_3.4/ - - - The latest version of CUDA.jl brings several new features, from improved atomic operations to initial support for arrays with unified memory. The native random number generator introduced in CUDA.jl 3.0 is now the default fallback, and support for memory pools other than the CUDA stream-ordered one has been removed.

-

Streamlined atomic operations

-

In preparation for integrating with the new standard @atomic macro introduced in Julia 1.7, we have streamlined the capabilities of atomic operations in CUDA.jl. The API is now split into two levels: low-level atomic_ methods for atomic functionality that's directly supported by the hardware, and a high-level @atomic macro that tries to perform operations natively or falls back to a loop with compare-and-swap. This fall-back implementation makes it possible to use more complex operations that do not map onto a single atomic operation:

-
julia> a = CuArray([1]);julia> function kernel(a)
-         CUDA.@atomic a[] <<= 1
-         return
-       endjulia> @cuda threads=16 kernel(a)julia> a
-1-element CuArray{Int64, 1, CUDA.Mem.DeviceBuffer}:
- 65536julia> 1<<16
-65536
-

The only requirement is that the types being used are supported by CUDA.atomic_cas!. This includes common types like 32 and 64-bit integers and floating-point numbers, as well as 16-bit numbers on devices with compute capability 7.0 or higher.

-

Note that on Julia 1.7 and higher, CUDA.jl does not export the @atomic macro anymore to avoid conflicts with the version in Base. That means it is recommended to always fully specify uses of the macro, i.e., use CUDA.@atomic as in the example above.

-

Arrays with unified memory

-

You may have noticed that the CuArray type in the example above included an additional parameter, Mem.DeviceBuffer. This has been introduced to support arrays backed by different kinds of buffers. By default, we will use an ordinary device buffer, but it's now possible to allocate arrays backed by unified buffers that can be used on multiple devices:

-
julia> a = cu([0]; unified=true)
-1-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 0julia> a .+= 1
-1-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 1julia> device!(1)julia> a .+= 1
-1-element CuArray{Int64, 1, CUDA.Mem.UnifiedBuffer}:
- 2
-

Although all operations should work equally well with arrays backed by unified memory, they have not been optimized yet. For example, copying memory to the device could be avoided as the driver can automatically page in unified memory on-demand.

-

New default random number generator

-

CUDA.jl 3.0 introduced a new random number generator, and starting with CUDA.jl 3.2 performance and quality of this generator was improved up to the point it could be used by applications. A couple of features were still missing though, such as generating normally-distributed random numbers, or support for complex numbers. These features have been added in CUDA.jl 3.3, and the generator is now used as the default fallback when CURAND does not support the requested element types.

-

Both the performance and quality of this generator are much better than the previous, GPUArrays.jl-based one:

-
julia> using BenchmarkTools
-julia> cuda_rng = CUDA.RNG();
-julia> gpuarrays_rng = GPUArrays.default_rng(CuArray);
-julia> a = CUDA.zeros(1024,1024);julia> @benchmark CUDA.@sync rand!($cuda_rng, $a)
-BenchmarkTools.Trial: 10000 samples with 1 evaluation.
- Range (min … max):  17.040 μs …  2.430 ms  ┊ GC (min … max): 0.00% … 99.04%
- Time  (median):     18.500 μs              ┊ GC (median):    0.00%
- Time  (mean ± σ):   20.604 μs ± 34.734 μs  ┊ GC (mean ± σ):  1.17% ±  0.99%         ▃▆█▇▇▅▄▂▁
-  ▂▂▂▃▄▆███████████▇▆▆▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂ ▄
-  17 μs           Histogram: frequency by time        24.1 μs <julia> @benchmark CUDA.@sync rand!($gpuarrays_rng, $a)
-BenchmarkTools.Trial: 10000 samples with 1 evaluation.
- Range (min … max):  72.489 μs …  2.790 ms  ┊ GC (min … max): 0.00% … 98.44%
- Time  (median):     74.479 μs              ┊ GC (median):    0.00%
- Time  (mean ± σ):   81.211 μs ± 61.598 μs  ┊ GC (mean ± σ):  0.67% ±  1.40%  █                                                           ▁
-  █▆▃▁▃▃▅▆▅▁▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▄▆▁▁▁▁▁▁▁▁▄▄▃▄▃▁▁▁▁▁▁▁▁▁▃▃▄▆▄▁▄▃▆ █
-  72.5 μs      Histogram: log(frequency) by time       443 μs <
-
julia> using RNGTest
-julia> test_cuda_rng = RNGTest.wrap(cuda_rng, UInt32);
-julia> test_gpuarrays_rng = RNGTest.wrap(gpuarrays_rng, UInt32);julia> RNGTest.smallcrushTestU01(test_cuda_rng)
- All tests were passedjulia> RNGTest.smallcrushTestU01(test_gpuarrays_rng)
- The following tests gave p-values outside [0.001, 0.9990]:       Test                          p-value
- ----------------------------------------------
-  1  BirthdaySpacings                 eps
-  2  Collision                        eps
-  3  Gap                              eps
-  4  SimpPoker                       1.0e-4
-  5  CouponCollector                  eps
-  6  MaxOft                           eps
-  7  WeightDistrib                    eps
- 10  RandomWalk1 M                   6.0e-4
- ----------------------------------------------
- (eps  means a value < 1.0e-300):
-

Removal of old memory pools

-

With the new stream-ordered allocator, caching memory allocations at the CUDA library level, much of the need for memory pools to cache memory allocations has disappeared. To simplify the allocation code, we have removed support for those Julia-managed memory pools (i.e., binned, split and simple). You can now only use the cuda memory pool, or use no pool at all by setting the JULIA_CUDA_MEMORY_POOL environment variable to none.

-

Not using a memory pool degrades performance, so if you are stuck on an NVIDIA driver that does not support CUDA 11.2, it is advised to remain on CUDA.jl 3.3 until you can upgrade.

-

Also note that the new stream-ordered allocator has turned out to be incompatible with legacy cuIpc APIs as used by OpenMPI. If that applies to you, consider disabling the memory pool or reverting to CUDA.jl 3.3 if your application's allocation pattern benefits from a memory pool.

-

Because of this, we will be maintaining CUDA.jl 3.3 longer than usual. All bug fixes in CUDA.jl 3.4 have already been backported to the previous release, which is currently at version 3.3.6.

-

Device capability-dependent kernel code

-

Some of the improvements in this release depend on the ability to write generic code that only uses certain hardware features when they are available. To facilitate writing such code, the compiler now embeds metadata in the generated code that can be used to branch on.

-

Currently, the device capability and PTX ISA version are embedded and made available using respectively the compute_capability and ptx_isa_version functions. A simplified version number type, constructable using the sv"..." string macro, can be used to test against these properties. For example:

-
julia> function kernel(a)
-           a[] = compute_capability() >= sv"6.0" ? 1 : 2
-           return
-       end
-kernel (generic function with 1 method)julia> CUDA.code_llvm(kernel, Tuple{CuDeviceVector{Float32, AS.Global}})
-define void @julia_kernel_1({ i8 addrspace(1)*, i64, [1 x i64] }* %0) {
-top:
-  %1 = bitcast { i8 addrspace(1)*, i64, [1 x i64] }* %0 to float addrspace(1)**
-  %2 = load float addrspace(1)*, float addrspace(1)** %1, align 8
-  store float 1.000000e+00, float addrspace(1)* %2, align 4
-  ret void
-}julia> capability(device!(1))
-v"3.5.0"julia> CUDA.code_llvm(kernel, Tuple{CuDeviceVector{Float32, AS.Global}})
-define void @julia_kernel_2({ i8 addrspace(1)*, i64, [1 x i64] }* %0) {
-top:
-  %1 = bitcast { i8 addrspace(1)*, i64, [1 x i64] }* %0 to float addrspace(1)**
-  %2 = load float addrspace(1)*, float addrspace(1)** %1, align 8
-  store float 2.000000e+00, float addrspace(1)* %2, align 4
-  ret void
-}
-

The branch on the compute capability is completely optimized away. At the same time, this does not require re-inferring the function as the optimization happens at the LLVM level.

-

Other changes

- -]]>
- - Fri, 13 Aug 2021 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 3.3]]> - https://juliagpu.org/post/2021-06-10-cuda_3.3/index.html - https://juliagpu.org/2021-06-10-cuda_3.3/ - - - There have been several releases of CUDA.jl in the past couple of months, with many bugfixes and many exciting new features to improve GPU programming in Julia: CuArray now supports isbits Unions, CUDA.jl can emit debug info for use with NVIDIA tools, and changes to the compiler make it even easier to use the latest version of the CUDA toolkit.

-

CuArray support for isbits Unions

-

Unions are a way to represent values of one type or another, e.g., a value that can be an integer or a floating point. If all possible element types of a Union are so-called bitstypes, which can be stored contiguously in memory, the Union of these types can be stored contiguously too. This kind of optimization is implemented by the Array type, which can store such "isbits Unions" inline, as opposed to storing a pointer to a heap-allocated box. For more details, refer to the Julia documentation.

-

With CUDA.jl 3.3, the CuArray GPU array type now supports this optimization too. That means you can safely allocate CuArrays with isbits union element types and perform GPU-accelerated operations on them:

-
julia> a = CuArray([1, nothing, 3])
-3-element CuArray{Union{Nothing, Int64}, 1}:
- 1
-  nothing
- 3julia> findfirst(isnothing, a)
-2
-

It is also safe to pass these CuArrays to a kernel and use unions there:

-
julia> function kernel(a)
-         i = threadIdx().x
-         if a[i] !== nothing
-           a[i] += 1
-         end
-         return
-       endjulia> @cuda threads=3 kernel(a)julia> a
-3-element CuArray{Union{Nothing, Int64}, 1}:
- 2
-  nothing
- 4
-

This feature is especially valuable to represent missing values, and is an important step towards GPU support for DataFrames.jl.

-

Debug and location information

-

Another noteworthy addition is the support for emitting debug and location information. The debug level, set by passing -g <level> to the julia executable, determines how much info is emitted. The default of level 1 only enables location information instructions which should not impact performance. Passing -g0 disables this, while passing -g2 also enables the output of DWARF debug information and compiles in debug mode.

-

Location information is useful for a variety of reasons. Many tools, like the NVIDIA profilers, use it to correlate instructions to source code:

-
- NVIDIA Visual Profiler with source-code location information -

Debug information can be used to debug compiled code using cuda-gdb:

-
$ cuda-gdb --args julia -g2 examples/vadd.jl
-(cuda-gdb) set cuda break_on_launch all
-(cuda-gdb) run
-[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
-macro expansion () at .julia/packages/LLVM/hHQuD/src/interop/base.jl:74
-74                  Base.llvmcall(($ir,$fn), $rettyp, $argtyp, $(args.args...))(cuda-gdb) bt
-#0  macro expansion () at .julia/packages/LLVM/hHQuD/src/interop/base.jl:74
-#1  macro expansion () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:6
-#2  _index () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:6
-#3  blockIdx_x () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:56
-#4  blockIdx () at .julia/dev/CUDA/src/device/intrinsics/indexing.jl:76
-#5  julia_vadd<<<(1,1,1),(12,1,1)>>> (a=..., b=..., c=...) at .julia/dev/CUDA/examples/vadd.jl:6(cuda-gdb) f 5
-#5  julia_vadd<<<(1,1,1),(12,1,1)>>> (a=..., b=..., c=...) at .julia/dev/CUDA/examples/vadd.jl:6
-6           i = (blockIdx().x-1) * blockDim().x + threadIdx().x(cuda-gdb) l
-1       using Test
-2
-3       using CUDA
-4
-5       function vadd(a, b, c)
-6           i = (blockIdx().x-1) * blockDim().x + threadIdx().x
-7           c[i] = a[i] + b[i]
-8           return
-9       end
-10
-

Improved CUDA compatibility support

-

As always, new CUDA.jl releases come with updated support for the CUDA toolkit. CUDA.jl is now compatible with CUDA 11.3, as well as CUDA 11.3 Update 1. Users don't have to do anything to update to these versions, as CUDA.jl will automatically select and download the latest supported version.

-

Of course, for CUDA.jl to use the latest versions of the CUDA toolkit, a sufficiently recent version of the NVIDIA driver is required. Before CUDA 11.0, the driver's CUDA compatibility was a strict lower bound, and every minor CUDA release required a driver update. CUDA 11.0 comes with an enhanced compatibility option that follows semantic versioning, e.g., CUDA 11.3 can be used on an NVIDIA driver that only supports up to CUDA 11.0. CUDA.jl now follows semantic versioning when selecting a compatible toolkit, making it easier to use the latest version of the CUDA toolkit in Julia.

-

For those interested: Implementing semantic versioning required the CUDA.jl compiler to use ptxas instead of the driver's embedded JIT to generate GPU machine code. At the same time, many parts of CUDA.jl still use the CUDA driver APIs, so it's always recommended to keep your NVIDIA driver up-to-date.

-

High-level graph APIs

-

To overcome the cost of launching kernels, CUDA makes it possible to build computational graphs, and execute those graphs with less overhead than the underlying operations. In CUDA.jl we provide easy access to the APIs to record and execute these graphs:

-
A = CUDA.zeros(Int, 1)# ensure the operation is compiled
-A .+= 1# capture
-graph = capture() do
-    A .+= 1
-end
-@test Array(A) == [1]   # didn't change anything# instantiate and launch
-exec = instantiate(graph)
-CUDA.launch(exec)
-@test Array(A) == [2]# update and instantiate/launch again
-graph′ = capture() do
-    A .+= 2
-end
-update(exec, graph′)
-CUDA.launch(exec)
-@test Array(A) == [4]
-

This sequence of operations is common enough that we provide a high-level @captured macro that automatically records, updates, instantiates and launches the graph:

-
A = CUDA.zeros(Int, 1)for i in 1:2
-    @captured A .+= 1
-end
-@test Array(A) == [2]
-

Minor changes and features

- -]]>
- - Thu, 10 Jun 2021 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 3.0]]> - https://juliagpu.org/post/2021-04-09-cuda_3.0/index.html - https://juliagpu.org/2021-04-09-cuda_3.0/ - - - CUDA.jl 3.0 is a significant, semi-breaking release that features greatly improved multi-tasking and multi-threading, support for CUDA 11.2 and its new memory allocator, compiler tooling for GPU method overrides, device-side random number generation and a completely revamped cuDNN interface.

-

Improved multi-tasking and multi-threading

-

Before this release, CUDA operations were enqueued on a single global stream, and many of these operations (like copying memory, or synchronizing execution) were fully blocking. This posed difficulties when using multiple tasks to perform independent operations: Blocking operations prevent all tasks from making progress, and using the same stream introduces unintended dependencies on otherwise independent operations. CUDA.jl now uses private streams for each Julia task, and avoids blocking operations where possible, enabling task-based concurrent execution. It is also possible to use different devices on each task, and there is experimental support for executing those tasks from different threads.

-

A picture snippet of code is worth a thousand words, so let's demonstrate using a computation that uses both a library function (GEMM from CUBLAS) and a native Julia broadcast kernel:

-
using CUDA, LinearAlgebrafunction compute(a,b,c)
-    mul!(c, a, b)
-    broadcast!(sin, c, c)
-    synchronize()
-    c
-end
-

To execute multiple invocations of this function concurrently, we can simply use Julia's task-based programming interfaces and wrap each call to compute in an @async block. Then, we synchronize execution again by wrapping in a @sync block:

-
function iteration(a,b,c)
-    results = Vector{Any}(undef, 2)
-    NVTX.@range "computation" @sync begin
-        @async begin
-            results[1] = compute(a,b,c)
-        end
-        @async begin
-            results[2] = compute(a,b,c)
-        end
-    end
-    NVTX.@range "comparison" Array(results[1]) == Array(results[2])
-end
-

The calls to the @range macro from NVTX, a submodule of CUDA.jl, will visualize the different phases of execution when we profile our program. We now invoke our function using some random data:

-
function main(N=1024)
-    a = CUDA.rand(N,N)
-    b = CUDA.rand(N,N)
-    c = CUDA.rand(N,N)    # make sure this data can be used by other tasks!
-    synchronize()    # warm-up
-    iteration(a,b,c)
-    GC.gc(true)    NVTX.@range "main" iteration(a,b,c)
-end
-

The snippet above illustrates one breaking aspect of this release: Because each task uses its own stream, you now need to synchronize when re-using data in another task. Although it is unlikely that any user code was relying on the old behavior, it is technically a breaking change, and as such we are bumping the major version of the CUDA.jl package.

-

If we profile our program using Nsight Systems, we can see how the execution of both calls to compute was overlapped:

-
- Overlapping execution on the GPU using task-based concurrency -

The region highlighted in green was spent enqueueing operations from the CPU, which includes the call to synchronize(). This used to be a blocking operation, whereas now it only synchronizes the task-local stream while yielding to the Julia scheduler so that it can continue execution on another task. For synchronizing the entire device, use the new device_synchronize() function.

-

The remainder of computation was then spent executing kernels. Here, execution was overlapped, but that obviously depends on the exact characteristics of the computations and your GPU. Also note that copying to and from the CPU is always going to block for some time, unless the memory was page-locked. CUDA.jl now supports locking memory like that using the pin function; for more details refer to the CUDA.jl documentation on tasks and threads.

-

CUDA 11.2 and stream-ordered allocations

-

CUDA.jl now also fully supports CUDA 11.2, and it will default to using that version of the toolkit if your driver supports it. The release came with several new features, such as the new stream-ordered memory allocator. Without going into details, it is now possible to asynchronously allocate memory, obviating much of the need to cache those allocations in a memory pool. Initial benchmarks have shown nice speed-ups from using this allocator, while lowering memory pressure and thus reducing invocations of the Julia garbage collector.

-

When using CUDA 11.2, CUDA.jl will default to the CUDA-backed memory pool and disable its own caching layer. If you want to compare performance, you can still use the old allocator and caching memory pool by setting the JULIA_CUDA_MEMORY_POOL environment variable to, e.g. binned. On older versions of CUDA, the binned pool is still used by default.

-

GPU method overrides

-

With the new AbstractInterpreter functionality in Julia 1.6, it is now much easier to further customize the Base compiler. This has enabled us to develop a mechanism for overriding methods with GPU-specific counterparts. It used to be required to explicitly pick CUDA-specific versions, e.g. CUDA.sin, because the Base version performed some GPU-incompatible operation. This was problematic as it did not compose with generic code, and the CUDA-specific versions often lacked support for specific combinations of argument types (for example, CUDA.sin(::Complex) was not supported).

-

With CUDA 3.0, it is possible to define GPU-specific methods that override an existing definition, without requiring a new function type. For now, this functionality is private to CUDA.jl, but we expect to make it available to other packages starting with Julia 1.7.

-

This functionality has unblocked many issues, as can be seen in the corresponding pull request. It is now no longer needed to prefix a call with the CUDA module to ensure a GPU-compatible version is used. Furthermore, it also protects users from accidentally calling GPU intrinsics, as doing so will now result in an error instead of a crash:

-
julia> CUDA.saturate(1f0)
-ERROR: This function is not intended for use on the CPU
-Stacktrace:
- [1] error(s::String)
-   @ Base ./error.jl:33
- [2] saturate(x::Float32)
-   @ CUDA ~/Julia/pkg/CUDA/src/device/intrinsics.jl:23
- [3] top-level scope
-   @ REPL[10]:1
-

Device-side random number generation

-

As an illustration of the value of GPU method overrides, CUDA.jl now provides a device-side random number generator that is accessible by simply calling rand() from a kernel:

-
julia> function kernel()
-         @cushow rand()
-         return
-       end
-kernel (generic function with 1 method)julia> @cuda kernel()
-rand() = 0.668274
-

This works by overriding the Random.default_rng() method, and providing a GPU-compatible random number generator: Building on exploratory work by @S-D-R, the current generator is a maximally equidistributed combined Tausworthe RNG that shares 32 bytes of random state across threads in a warp for performance. The generator performs well, but does not pass the Crush battery of tests, so PRs are welcome here to improve the implementation!

-

Note that for host-side operations, e.g. rand!(::CuArray), the generator is not yet used by default. Instead, we use CURAND whenever possible, and fall back to the slower but more full-featured GPUArrays.jl-generator in other cases.

-

Revamped cuDNN interface

-

Finally, the cuDNN wrappers have been completely revamped by @denizyuret. The goal of the redesign is to more faithfully map the cuDNN API to more natural Julia functions, so that packages like Knet.jl or NNlib.jl can more easily use advanced cuDNN features without having to resort to low-level C calls. For more details, refer to the design document. As part of this redesign, the high-level wrappers of CUDNN have been moved to a subpackage of NNlib.jl.

-]]>
- - Fri, 09 Apr 2021 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 2.4 and 2.5]]> - https://juliagpu.org/post/2021-01-08-cuda_2.4_2.5/index.html - https://juliagpu.org/2021-01-08-cuda_2.4_2.5/ - - - CUDA.jl v2.4 and v2.5 are two almost-identical feature releases, respectively for Julia 1.5 and 1.6. These releases feature greatly improved findmin and findmax kernels, an improved interface for kernel introspection, support for CUDA 11.2, and of course many bug fixes.

-

Improved findmin and findmax kernels

-

Thanks to @tkf and @Ellipse0934, CUDA.jl now uses a single-pass kernel for finding the minimum or maximum item in a CuArray. This fixes compatibility with NaN-valued elements, while on average improving performance. Depending on the rank, shape and size of the array these improvements vary from a minor regression to order-of-magnitude improvements.

-

New kernel introspection interface

-

It is now possible to obtain a compiled-but-not-launched kernel by passing the launch=false keyword to @cuda. This is useful when you want to reflect, e.g., query the amount of registers, or other kernel properties:

-
julia> kernel = @cuda launch=false identity(nothing)
-CUDA.HostKernel{identity,Tuple{Nothing}}(...)julia> CUDA.registers(kernel)
-4
-

The old API is still available, and will even be extended in future versions of CUDA.jl for the purpose of compiling device functions (not kernels):

-
julia> kernel = cufunction(identity, Tuple{Nothing})
-CUDA.HostKernel{identity,Tuple{Nothing}}(...)
-

Support for CUDA 11.2

-

CUDA.jl now supports the latest version of CUDA, version 11.2. Because CUDNN and CUTENSOR are not compatible with this release yet, CUDA.jl won't automatically switch to it unless you explicitly request so:

-
julia> ENV["JULIA_CUDA_VERSION"] = "11.2"
-"11.2"julia> using CUDAjulia> CUDA.versioninfo()
-CUDA toolkit 11.2.0, artifact installation
-CUDA driver 11.2.0
-NVIDIA driver 460.27.4
-

Alternatively, if you disable use of artifacts through JULIA_CUDA_USE_BINARYBUILDER=false, CUDA 11.2 can be picked up from your local system.

-

Future developments

-

Due to upstream compiler changes, CUDA.jl 2.4 is expected to be the last release compatible with Julia 1.5. Patch releases are still possible, but are not automatic: If you need a specific bugfix from a future CUDA.jl release, create an issue or PR to backport the change.

-]]>
- - Fri, 08 Jan 2021 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[Introducing: oneAPI.jl]]> - https://juliagpu.org/post/2020-11-05-oneapi_0.1/index.html - https://juliagpu.org/2020-11-05-oneapi_0.1/ - - - We're proud to announce the first version of oneAPI.jl, a Julia package for programming accelerators with the oneAPI programming model. It is currently available for select Intel GPUs, including common integrated ones, and offers a similar experience to CUDA.jl.

-

The initial version of this package, v0.1, consists of three key components:

-
    -
  • wrappers for the oneAPI Level Zero interfaces;

    -
  • -
  • a compiler for Julia source code to SPIR-V IR;

    -
  • -
  • and an array interface for convenient data-parallel programming.

    -
  • -
-

In this post, I'll briefly describe each of these. But first, some essentials.

-

Installation

-

oneAPI.jl is currently only supported on 64-bit Linux, using a sufficiently recent kernel, and requires Julia 1.5. Furthermore, it currently only supports a limited set of Intel GPUs: Gen9 (Skylake, Kaby Lake, Coffee Lake), Gen11 (Ice Lake), and Gen12 (Tiger Lake).

-

If your Intel CPU has an integrated GPU supported by oneAPI, you can just go ahead and install the oneAPI.jl package:

-
pkg> add oneAPI
-

That's right, no additional drivers required! oneAPI.jl ships its own copy of the Intel Compute Runtime, which works out of the box on any (sufficiently recent) Linux kernel. The initial download, powered by Julia's artifact subsystem, might take a while to complete. After that, you can import the package and start using its functionality:

-
julia> using oneAPIjulia> oneAPI.versioninfo()
-Binary dependencies:
-- NEO_jll: 20.42.18209+0
-- libigc_jll: 1.0.5186+0
-- gmmlib_jll: 20.3.2+0
-- SPIRV_LLVM_Translator_jll: 9.0.0+1
-- SPIRV_Tools_jll: 2020.2.0+1Toolchain:
-- Julia: 1.5.2
-- LLVM: 9.0.11 driver:
-- 00007fee-06cb-0a10-1642-ca9f01000000 (v1.0.0, API v1.0.0)1 device:
-- Intel(R) Graphics Gen9
-

The oneArray type

-

Similar to CUDA.jl's CuArray type, oneAPI.jl provides an array abstraction that you can use to easily perform data parallel operations on your GPU:

-
julia> a = oneArray(zeros(2,3))
-2×3 oneArray{Float64,2}:
- 0.0  0.0  0.0
- 0.0  0.0  0.0julia> a .+ 1
-2×3 oneArray{Float64,2}:
- 1.0  1.0  1.0
- 1.0  1.0  1.0julia> sum(ans; dims=2)
-2×1 oneArray{Float64,2}:
- 3.0
- 3.0
-

This functionality builds on the GPUArrays.jl package, which means that a lot of operations are supported out of the box. Some are still missing, of course, and we haven't carefully optimized for performance either.

-

Kernel programming

-

The above array operations are made possible by a compiler that transforms Julia source code into SPIR-V IR for use with oneAPI. Most of this work is part of GPUCompiler.jl. In oneAPI.jl, we use this compiler to provide a kernel programming model:

-
julia> function vadd(a, b, c)
-           i = get_global_id()
-           @inbounds c[i] = a[i] + b[i]
-           return
-       endjulia> a = oneArray(rand(10));julia> b = oneArray(rand(10));julia> c = similar(a);julia> @oneapi items=10 vadd(a, b, c)julia> @test Array(a) .+ Array(b) == Array(c)
-Test Passed
-

Again, the @oneapi macro resembles @cuda from CUDA.jl. One of the differences with the CUDA stack is that we use OpenCL-style built-ins, like get_global_id instead of threadIdx and barrier instead of sync_threads. Other familiar functionality, e.g. to reflect on the compiler, is available as well:

-
julia> @device_code_spirv @oneapi vadd(a, b, c)
-; CompilerJob of kernel vadd(oneDeviceArray{Float64,1,1},
-;                            oneDeviceArray{Float64,1,1},
-;                            oneDeviceArray{Float64,1,1})
-; for GPUCompiler.SPIRVCompilerTarget; SPIR-V
-; Version: 1.0
-; Generator: Khronos LLVM/SPIR-V Translator; 14
-; Bound: 46
-; Schema: 0
-               OpCapability Addresses
-               OpCapability Linkage
-               OpCapability Kernel
-               OpCapability Float64
-               OpCapability Int64
-               OpCapability Int8
-          %1 = OpExtInstImport "OpenCL.std"
-               OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel
-               ...
-               OpReturn
-               OpFunctionEnd
-

Level Zero wrappers

-

To interface with the oneAPI driver, we use the Level Zero API. Wrappers for this API are available under the oneL0 submodule of oneAPI.jl:

-
julia> using oneAPI.oneL0julia> drv = first(drivers())
-ZeDriver(00000000-0000-0000-1642-ca9f01000000, version 1.0.0)julia> dev = first(devices(drv))
-ZeDevice(GPU, vendor 0x8086, device 0x1912): Intel(R) Graphics Gen9
-

This is a low-level interface, and importing this submodule should not be required for the vast majority of users. It is only useful when you want to perform very specific operations, like submitting certain operations to the command queue, working with events, etc. In that case, you should refer to the upstream specification; the wrappers in the oneL0 module closely mimic the C APIs.

-

Status

-

Version 0.1 of oneAPI.jl forms a solid base for future oneAPI developments in Julia. Thanks to the continued effort of generalizing the Julia GPU support in packages like GPUArrays.jl and GPUCompiler.jl, this initial version is already much more usable than early versions of CUDA.jl or AMDGPU.jl ever were.

-

That said, there are crucial parts missing. For one, oneAPI.jl does not integrate with any of the vendor libraries like oneMKL or oneDNN. That means several important operations, e.g. matrix-matrix multiplication, will be slow. Hardware support is also limited, and the package currently only works on Linux.

-

If you want to contribute to oneAPI.jl, or run into problems, check out the GitHub repository at JuliaGPU/oneAPI.jl. For questions, please use the Julia Discourse forum under the GPU domain and/or in the #gpu channel of the Julia Slack.

-]]>
- - Thu, 05 Nov 2020 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 2.1]]> - https://juliagpu.org/post/2020-10-30-cuda_2.1/index.html - https://juliagpu.org/2020-10-30-cuda_2.1/ - - - CUDA.jl v2.1 is a bug-fix release, with one new feature: support for cubic texture interpolations. The release also partly reverts a change from v2.0: reshape, reinterpret and contiguous views now return a CuArray again.

-

Generalized texture interpolations

-

CUDA's texture hardware only supports nearest-neighbour and linear interpolation; for other modes one is required to perform the interpolation by hand. In CUDA.jl v2.1 we are generalizing the texture interpolation API so that it is possible to use both hardware-backed and software-implemented interpolation modes in exactly the same way:

-
# N is the dimensionality (1, 2 or 3)
-# T is the element type (needs to be supported by the texture hardware)# source array
-src = rand(T, fill(10, N)...)# indices we want to interpolate
-idx = [tuple(rand(1:0.1:10, N)...) for _ in 1:10]# upload to the GPU
-gpu_src = CuArray(src)
-gpu_idx = CuArray(idx)# create a texture array for optimized fetching
-# this is required for N=1, optional for N=2 and N=3
-gpu_src = CuTextureArray(gpu_src)# interpolate using a texture
-gpu_dst = CuArray{T}(undef, size(gpu_idx))
-gpu_tex = CuTexture(gpu_src; interpolation=CUDA.NearestNeighbour())
-broadcast!(gpu_dst, gpu_idx, Ref(gpu_tex)) do idx, tex
-    tex[idx...]
-end# back to the CPU
-dst = Array(gpu_dst)
-

Here, we can change the interpolation argument to CuTexture to either NearestNeighbour or LinearInterpolation, both supported by the hardware, or CubicInterpolation which is implemented in software (building on the hardware-supported linear interpolation).

-

Partial revert of array wrapper changes

-

In CUDA.jl v2.0, we changed the behavior of several important array operations to reuse available wrappers in Base: reshape started returning a ReshapedArray, view now returned a SubArray, and reinterpret was reworked to use ReinterpretArray. These changes were made to ensure maximal compatibility with Base's array type, and to simplify the implementation in CUDA.jl and GPUArrays.jl.

-

However, this change turned out to regress the time to precompile and load CUDA.jl. Consequently, the change has been reverted, and these wrappers are now implemented as part of the CuArray type again. Note however that we intend to revisit this change in the future. It is therefore recommended to use the DenseCuArray type alias for methods that need a CuArray backed by contiguous GPU memory. For strided CuArrays, i.e. non-contiguous views, you should use the StridedCuArray alias.

-]]>
- - Fri, 30 Oct 2020 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 2.0]]> - https://juliagpu.org/post/2020-10-02-cuda_2.0/index.html - https://juliagpu.org/2020-10-02-cuda_2.0/ - - - Today we're releasing CUDA.jl 2.0, a breaking release with several new features. Highlights include initial support for Float16, a switch to CUDA's new stream model, a much-needed rework of the sparse array support and support for CUDA 11.1.

-

The release now requires Julia 1.5, and assumes a GPU with compute capability 5.0 or higher (although most of the package will still work with an older GPU).

-

Low- and mixed-precision operations

-

With NVIDIA's latest GPUs featuring more and more low-precision operations, CUDA.jl now starts to support these data types. For example, the CUBLAS wrappers can be used with (B)Float16 inputs (running under JULIA_DEBUG=CUBLAS to illustrate the called methods) thanks to the cublasGemmEx API call:

-
julia> mul!(CUDA.zeros(Float32,2,2),
-            cu(rand(Float16,2,2)),
-            cu(rand(Float16,2,2)))I! cuBLAS (v11.0) function cublasStatus_t cublasGemmEx(...) called:
-i!  Atype: type=cudaDataType_t; val=CUDA_R_16F(2)
-i!  Btype: type=cudaDataType_t; val=CUDA_R_16F(2)
-i!  Ctype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  computeType: type=cublasComputeType_t; val=CUBLAS_COMPUTE_32F(68)2×2 CuArray{Float32,2}:
- 0.481284  0.561241
- 1.12923   1.04541
-
julia> using BFloat16sjulia> mul!(CUDA.zeros(BFloat16,2,2),
-            cu(BFloat16.(rand(2,2))),
-            cu(BFloat16.(rand(2,2))))I! cuBLAS (v11.0) function cublasStatus_t cublasGemmEx(...) called:
-i!  Atype: type=cudaDataType_t; val=CUDA_R_16BF(14)
-i!  Btype: type=cudaDataType_t; val=CUDA_R_16BF(14)
-i!  Ctype: type=cudaDataType_t; val=CUDA_R_16BF(14)
-i!  computeType: type=cublasComputeType_t; val=CUBLAS_COMPUTE_32F(68)2×2 CuArray{BFloat16,2}:
- 0.300781   0.71875
- 0.0163574  0.0241699
-

Alternatively, CUBLAS can be configured to automatically down-cast 32-bit inputs to Float16. This is now exposed through a task-local CUDA.jl math mode:

-
julia> CUDA.math_mode!(CUDA.FAST_MATH; precision=:Float16)julia> mul!(CuArray(zeros(Float32,2,2)),
-            CuArray(rand(Float32,2,2)),
-            CuArray(rand(Float32,2,2)))I! cuBLAS (v11.0) function cublasStatus_t cublasGemmEx(...) called:
-i!  Atype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  Btype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  Ctype: type=cudaDataType_t; val=CUDA_R_32F(0)
-i!  computeType: type=cublasComputeType_t; val=CUBLAS_COMPUTE_32F_FAST_16F(74)2×2 CuArray{Float32,2}:
- 0.175258  0.226159
- 0.511893  0.331351
-

As part of these changes, CUDA.jl now defaults to using tensor cores. This may affect accuracy; use math mode PEDANTIC if you want the old behavior.

-

Work is under way to extend these capabilities to the rest of CUDA.jl, e.g., the CUDNN wrappers, or the native kernel programming capabilities.

-

New default stream semantics

-

In CUDA.jl 2.0 we're switching to CUDA's simplified stream programming model. This simplifies working with multiple streams, and opens up more possibilities for concurrent execution of GPU operations.

-

Multi-stream programming

-

In the old model, the default stream (used by all GPU operations unless specified otherwise) was a special stream whose commands could not be executed concurrently with commands on regular, explicitly-created streams. For example, if we interleave kernels executed on a dedicated stream with ones on the default one, execution was serialized:

-
using CUDAN = 1 << 20function kernel(x, n)
-    tid = threadIdx().x + (blockIdx().x-1) * blockDim().x
-    for i = tid:blockDim().x*gridDim().x:n
-        x[i] = CUDA.sqrt(CUDA.pow(3.14159f0, i))
-    end
-    return
-endnum_streams = 8for i in 1:num_streams
-    stream = CuStream()    data = CuArray{Float32}(undef, N)    @cuda blocks=1 threads=64 stream=stream kernel(data, N)    @cuda kernel(data, 0)
-end
-
- Multi-stream programming (old) -

In the new model, default streams are regular streams and commands issued on them can execute concurrently with those on other streams:

-
- Multi-stream programming (new) -

Multi-threading

-

Another consequence of the new stream model is that each thread gets its own default stream (accessible as CuStreamPerThread()). Together with Julia's threading capabilities, this makes it trivial to group independent work in tasks, benefiting from concurrent execution on the GPU where possible:

-
using CUDAN = 1 << 20function kernel(x, n)
-    tid = threadIdx().x + (blockIdx().x-1) * blockDim().x
-    for i = tid:blockDim().x*gridDim().x:n
-        x[i] = CUDA.sqrt(CUDA.pow(3.14159f0, i))
-    end
-    return
-endThreads.@threads for i in 1:Threads.nthreads()
-    data = CuArray{Float32}(undef, N)
-    @cuda blocks=1 threads=64 kernel(data, N)
-    synchronize(CuDefaultStream())
-end
-
- Multi-threading (new) -

With the old model, execution would have been serialized because the default stream was the same across threads:

-
- Multi-threading (old) -

Future improvements will make this behavior configurable, such that users can use a different default stream per task.

-

Sparse array clean-up

-

As part of CUDA.jl 2.0, the sparse array support has been refactored, bringing them in line with other array types and their expected behavior. For example, the custom switch2 methods have been removed in favor of calls to convert and array constructors:

-
julia> using SparseArrays
-julia> using CUDA, CUDA.CUSPARSEjulia> CuSparseMatrixCSC(CUDA.rand(2,2))
-2×2 CuSparseMatrixCSC{Float32} with 4 stored entries:
-  [1, 1]  =  0.124012
-  [2, 1]  =  0.791714
-  [1, 2]  =  0.487905
-  [2, 2]  =  0.752466julia> CuSparseMatrixCOO(sprand(2,2, 0.5))
-2×2 CuSparseMatrixCOO{Float64} with 3 stored entries:
-  [1, 1]  =  0.183183
-  [2, 1]  =  0.966466
-  [2, 2]  =  0.064101julia> CuSparseMatrixCSR(ans)
-2×2 CuSparseMatrixCSR{Float64} with 3 stored entries:
-  [1, 1]  =  0.183183
-  [2, 1]  =  0.966466
-  [2, 2]  =  0.064101
-

Initial support for the COO sparse matrix type has also been added, along with better support for sparse matrix-vector multiplication.

-

Support for CUDA 11.1

-

This release also features support for the brand-new CUDA 11.1. As there is no compatible release of CUDNN or CUTENSOR yet, CUDA.jl won't automatically select this version, but you can force it to by setting the JULIA_CUDA_VERSION environment variable to 11.1:

-
julia> ENV["JULIA_CUDA_VERSION"] = "11.1"julia> using CUDAjulia> CUDA.versioninfo()
-CUDA toolkit 11.1.0, artifact installationLibraries:
-- CUDNN: missing
-- CUTENSOR: missing
-

Minor changes

-

Many other changes are part of this release:

-
    -
  • Views, reshapes and array reinterpretations are now represented by the Base array wrappers, simplifying the CuArray type definition.

    -
  • -
  • Various optimizations to CUFFT and CUDNN library wrappers.

    -
  • -
  • Support for LinearAlgebra.reflect! and rotate!

    -
  • -
  • Initial support for calling CUDA libraries with strided inputs

    -
  • -
-]]>
- - Fri, 02 Oct 2020 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[Paper: Flexible Performant GEMM Kernels on GPUs]]> - https://juliagpu.org/post/2020-09-28-gemmkernels/index.html - https://juliagpu.org/2020-09-28-gemmkernels/ - - - General Matrix Multiplication or GEMM kernels take center place in high performance computing and machine learning. Recent NVIDIA GPUs include GEMM accelerators, such as NVIDIA's Tensor Cores. In this paper we show how it is possible to program these accelerators from Julia, and present abstractions and interfaces that allow to do so efficiently without sacrificing performance.

-

A pre-print of the paper has been published on arXiv: arXiv:2009.12263.
The source code can be found on GitHub: thomasfaingnaert/GemmKernels.jl.

-

With the APIs from GemmKernels.jl, it is possible to instantiate GEMM kernels that perform in the same ballpark as, and sometimes even outperform, state-of-the-art libraries like CUBLAS and CUTLASS. For example, performing a mixed-precision multiplication of two 16-bit matrices into a 32-bit accumulator (on different combinations of layouts):

-
- Performance of mixed-precision GEMM -

The APIs are also highly flexible and allow customization of each step, e.g., to apply the activation function max(x, 0) for implementing a rectified linear unit (ReLU):

-
a = CuArray(rand(Float16, (M, K)))
-b = CuArray(rand(Float16, (K, N)))
-c = CuArray(rand(Float32, (M, N)))
-d = similar(c)conf = GemmKernels.get_config(
-    gemm_shape = (M = M, N = N, K = K),
-    operator = Operator.WMMAOp{16, 16, 16},
-    global_a_layout = Layout.AlignedColMajor{Float16},
-    global_c_layout = Layout.AlignedColMajor{Float32})GemmKernels.matmul(
-    a, b, c, d, conf;
-    transform_regs_to_shared_d = Transform.Elementwise(x -> max(x, 0)))
-

The GemmKernels.jl framework is written entirely in Julia, demonstrating the high-performance GPU programming capabilities of this language, but at the same time keeping the research accessible and easy to modify or repurpose by other Julia developers.

-]]>
- - Mon, 28 Sep 2020 00:00:00 +0000 - - - - Thomas Faingnaert, Tim Besard, Bjorn De Sutter - - -
- - - <![CDATA[CUDA.jl 1.3 - Multi-device programming]]> - https://juliagpu.org/post/2020-07-18-cuda_1.3/index.html - https://juliagpu.org/2020-07-18-cuda_1.3/ - - - Today we're releasing CUDA.jl 1.3, with several new features. The most prominent change is support for multiple GPUs within a single process.

-

Multi-GPU programming

-

With CUDA.jl 1.3, you can finally use multiple CUDA GPUs within a single process. To switch devices you can call device!, query the current device with device(), or reset it using device_reset!():

-
julia> collect(devices())
-9-element Array{CuDevice,1}:
- CuDevice(0): Tesla V100-PCIE-32GB
- CuDevice(1): Tesla V100-PCIE-32GB
- CuDevice(2): Tesla V100-PCIE-32GB
- CuDevice(3): Tesla V100-PCIE-32GB
- CuDevice(4): Tesla V100-PCIE-16GB
- CuDevice(5): Tesla P100-PCIE-16GB
- CuDevice(6): Tesla P100-PCIE-16GB
- CuDevice(7): GeForce GTX 1080 Ti
- CuDevice(8): GeForce GTX 1080 Tijulia> device!(5)julia> device()
-CuDevice(5): Tesla P100-PCIE-16GB
-

Let's define a kernel to show this really works:

-
julia> function kernel()
-           dev = Ref{Cint}()
-           CUDA.cudaGetDevice(dev)
-           @cuprintln("Running on device $(dev[])")
-           return
-       endjulia> @cuda kernel()
-Running on device 5julia> device!(0)julia> device()
-CuDevice(0): Tesla V100-PCIE-32GBjulia> @cuda kernel()
-Running on device 0
-

Memory allocations, like CuArrays, are implicitly bound to the device they were allocated on. That means you should take care to only use an array when the owning device is active, or you will run into errors:

-
julia> device()
-CuDevice(0): Tesla V100-PCIE-32GBjulia> a = CUDA.rand(1)
-1-element CuArray{Float32,1}:
- 0.6322775julia> device!(1)julia> a
-ERROR: CUDA error: an illegal memory access was encountered
-

Future improvements might make the array type device-aware.

-

Multitasking and multithreading

-

Dovetailing with the support for multiple GPUs is the ability to use these GPUs on separate Julia tasks and threads:

-
julia> device!(0)julia> @sync begin
-         @async begin
-           device!(1)
-           println("Working with $(device()) on $(current_task())")
-           yield()
-           println("Back to device $(device()) on $(current_task())")
-         end
-         @async begin
-           device!(2)
-           println("Working with $(device()) on $(current_task())")
-         end
-       end
-Working with CuDevice(1) on Task @0x00007fc9e6a48010
-Working with CuDevice(2) on Task @0x00007fc9e6a484f0
-Back to device CuDevice(1) on Task @0x00007fc9e6a48010julia> device()
-CuDevice(0): Tesla V100-PCIE-32GB
-

Each task has its own local GPU state, such as the device it was bound to, handles to libraries like CUBLAS or CUDNN (which means that each task can configure libraries independently), etc.

-

Minor features

-

CUDA.jl 1.3 also features some minor changes:

-
    -
  • Reinstated compatibility with Julia 1.3

    -
  • -
  • Support for CUDA 11.0 Update 1

    -
  • -
  • Support for CUDNN 8.0.2

    -
  • -
-

Known issues

-

Several operations on sparse arrays have been broken since CUDA.jl 1.2, due to the deprecations that were part of CUDA 11. The next version of CUDA.jl will drop support for CUDA 10.0 or older, which will make it possible to use new cuSPARSE APIs and add back missing functionality.

-]]>
- - Sat, 18 Jul 2020 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDA.jl 1.1]]> - https://juliagpu.org/post/2020-07-07-cuda_1.1/index.html - https://juliagpu.org/2020-07-07-cuda_1.1/ - - - CUDA.jl 1.1 marks the first feature release after merging several CUDA packages into one. It raises the minimal Julia version to 1.4, and comes with support for the impending 1.5 release.

-

CUDA.jl replacing CuArrays/CUDAnative.jl

-

As announced a while back, CUDA.jl is now the new package for programming CUDA GPUs in Julia, replacing CuArrays.jl, CUDAnative.jl, CUDAdrv.jl and CUDAapi.jl. The merged package should be a drop-in replacement: All existing functionality has been ported, and almost all exported functions are still there. Applications like Flux.jl or the DiffEq.jl stack are being updated to support this change.

-

CUDA 11 support

-

With CUDA.jl 1.1, we support the upcoming release of the CUDA toolkit. This only applies to locally-installed versions of the toolkit, i.e., you need to specify JULIA_CUDA_USE_BINARYBUILDER=false in your environment to pick up the locally-installed release candidate of the CUDA toolkit. New features, like the third-generation tensor cores and its extended type support, or any new APIs, are not yet natively supported by Julia code.

-

NVIDIA Management Library (NVML)

-

CUDA.jl now integrates with the NVIDIA Management Library, or NVML. With this library, it's possible to query information about the system, any GPU devices, their topology, etc.:

-
julia> using CUDAjulia> dev = first(NVML.devices())
-CUDA.NVML.Device(Ptr{Nothing} @0x00007f987c7c6e38)julia> NVML.uuid(dev)
-UUID("b8d5e790-ea4d-f962-e0c3-0448f69f2e23")julia> NVML.name(dev)
-"Quadro RTX 5000"julia> NVML.power_usage(dev)
-37.863julia> NVML.energy_consumption(dev)
-65330.292
-

Experimental: Texture support

-

It is now also possible to use the GPU's hardware texture support from Julia, albeit using a fairly low-level and still experimental API (many thanks to @cdsousa for the initial development). As a demo, let's start with loading a sample image:

-
julia> using Images, TestImages, ColorTypes, FixedPointNumbers
-julia> img = RGBA{N0f8}.(testimage("lighthouse"))
-

We use RGBA since CUDA's texture hardware only supports 1, 2 or 4 channels. This support is also currently limited to "plain" types, so let's reinterpret the image:

-
julia> img′ = reinterpret(NTuple{4,UInt8}, img)
-

Now we can upload this image to the array, using the CuTextureArray type for optimized storage (normal CuArrays are supported too), and bind it to a CuTexture object that we can pass to a kernel:

-
julia> texturearray = CuTextureArray(img′)julia> texture = CuTexture(texturearray; normalized_coordinates=true)
-512×768 4-channel CuTexture(::CuTextureArray) with eltype NTuple{4,UInt8}
-

Let's write a kernel that warps this image. Since we specified normalized_coordinates=true, we index the texture using values in [0,1]:

-
function warp(dst, texture)
-    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    I = CartesianIndices(dst)
-    @inbounds if tid <= length(I)
-        i,j = Tuple(I[tid])
-        u = Float32(i-1) / Float32(size(dst, 1)-1)
-        v = Float32(j-1) / Float32(size(dst, 2)-1)
-        x = u + 0.02f0 * CUDA.sin(30v)
-        y = v + 0.03f0 * CUDA.sin(20u)
-        dst[i,j] = texture[x,y]
-    end
-    return
-end
-

The size of the output image determines how many elements we need to process. This needs to be translated to a number of threads and blocks, keeping in mind device and kernel characteristics. We automate this using the occupancy API:

-
julia> outimg_d = CuArray{eltype(img′)}(undef, 500, 1000);julia> function configurator(kernel)
-           config = launch_configuration(kernel.fun)           threads = Base.min(length(outimg_d), config.threads)
-           blocks = cld(length(outimg_d), threads)           return (threads=threads, blocks=blocks)
-       endjulia> @cuda config=configurator warp(outimg_d, texture)
-

Finally, we fetch and visualize the output:

-
julia> outimg = Array(outimg_d)julia> save("imgwarp.png", reinterpret(eltype(img), outimg))
-
- Warped lighthouse -

Minor features

-

The test-suite is now parallelized, using up to JULIA_NUM_THREADS processes:

-
$ JULIA_NUM_THREADS=4 julia -e 'using Pkg; Pkg.test("CUDA");'                                     |          | ---------------- GPU ---------------- | ---------------- CPU ---------------- |
-Test                        (Worker) | Time (s) | GC (s) | GC % | Alloc (MB) | RSS (MB) | GC (s) | GC % | Alloc (MB) | RSS (MB) |
-initialization                   (2) |     2.52 |   0.00 |  0.0 |       0.00 |   115.00 |   0.05 |  1.8 |     153.13 |   546.27 |
-apiutils                         (4) |     0.55 |   0.00 |  0.0 |       0.00 |   115.00 |   0.02 |  4.0 |      75.86 |   522.36 |
-codegen                          (4) |    14.81 |   0.36 |  2.5 |       0.00 |   157.00 |   0.62 |  4.2 |    1592.28 |   675.15 |
-...
-gpuarrays/mapreduce essentials   (2) |   113.52 |   0.01 |  0.0 |       3.19 |   641.00 |   2.61 |  2.3 |    8232.84 |  2449.35 |
-gpuarrays/mapreduce (old tests)  (5) |   138.35 |   0.01 |  0.0 |     130.20 |   507.00 |   2.94 |  2.1 |    8615.15 |  2353.62 |
-gpuarrays/mapreduce derivatives  (3) |   180.52 |   0.01 |  0.0 |       3.06 |   229.00 |   3.44 |  1.9 |   12262.67 |  1403.39 |Test Summary: |  Pass  Broken  Total
-  Overall     | 11213       3  11216
-    SUCCESS
-    Testing CUDA tests passed
-

A copy of Base.versioninfo() is available to report on the CUDA toolchain and any devices:

-
julia> CUDA.versioninfo()
-CUDA toolkit 10.2.89, artifact installation
-CUDA driver 11.0.0
-NVIDIA driver 450.36.6Libraries:
-- CUBLAS: 10.2.2
-- CURAND: 10.1.2
-- CUFFT: 10.1.2
-- CUSOLVER: 10.3.0
-- CUSPARSE: 10.3.1
-- CUPTI: 12.0.0
-- NVML: 11.0.0+450.36.6
-- CUDNN: 7.6.5 (for CUDA 10.2.0)
-- CUTENSOR: 1.1.0 (for CUDA 10.2.0)Toolchain:
-- Julia: 1.5.0-rc1.0
-- LLVM: 9.0.1
-- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4
-- Device support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_751 device(s):
-- Quadro RTX 5000 (sm_75, 14.479 GiB / 15.744 GiB available)
-

CUTENSOR artifacts have been upgraded to version 1.1.0.

-

Benchmarking infrastructure based on the Codespeed project has been set up at speed.juliagpu.org to keep track of the performance of various operations.

-]]>
- - Tue, 07 Jul 2020 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[CUDAnative.jl 3.0 and CuArrays.jl 2.0]]> - https://juliagpu.org/post/2020-03-25-cudanative_3.0-cuarrays_2.0/index.html - https://juliagpu.org/cudanative_3.0-cuarrays_2.0/ - - - This post is located at /cudanative_3.0-cuarrays_2.0/

-

This release of the Julia CUDA stack contains some exciting new features: automatic installation of CUDA using artifacts, full support for GPU method redefinitions, and experimental support for multitasking and multithreading. The release is technically breaking, but most end-users should not be affected.

-

API changes

-

Changes to certain APIs require these releases to be breaking; however, most users should not be affected and chances are you can just bump your Compat entries without any additional changes. Flux.jl users will have to wait a little longer though, as the package uses non-public APIs that have changed and requires an update.

-

Artifacts

-

CUDA and its dependencies will now be automatically installed using artifacts generated by BinaryBuilder.jl. This greatly improves usability, and only requires a functioning NVIDIA driver:

-
julia> ENV["JULIA_DEBUG"] = "CUDAnative"julia> using CUDAnativejulia> CUDAnative.version()
-┌ Debug: Trying to use artifacts...
-└ @ CUDAnative CUDAnative/src/bindeps.jl:52
-┌ Debug: Using CUDA 10.2.89 from an artifact at /depot/artifacts/...
-└ @ CUDAnative CUDAnative/src/bindeps.jl:108
-v"10.2.89"
-

Use of a local installation is still possible by setting the environment variable JULIA_CUDA_USE_BINARYBUILDER to false. For more details, refer to the documentation.

-

Relevant PRs: CUDAnative.jl#492 and CuArrays.jl#490

-

Method redefinitions

-

CUDAnative 3.0 now fully supports method redefinitions, commonly referred to as Julia issue #265, and makes it possible to use interactive programming tools like Revise.jl:

-
julia> child() = 0
-julia> parent() = (@cuprintln(child()); return)
-julia> @cuda parent()
-0julia> parent() = (@cuprintln(child() + 1); return)
-julia> @cuda parent()
-1
-julia> child() = 1
-julia> @cuda parent()
-2
-

Relevant PRs: CUDAnative.jl#581

-

Experimental: Multitasking and multithreading

-

With CUDAnative 3.0 and CuArrays 2.0 you can now use Julia tasks and threads to organize your code. In combination with CUDA streams, this makes it possible to execute kernels and other GPU operations in parallel:

-
@sync begin
-    function my_expensive_kernel()
-        return
-    end
-    @async @cuda stream=CuStream() my_expensive_kernel()
-    @async @cuda stream=CuStream() my_expensive_kernel()
-end
-

Every task, whether it runs on a separate thread or not, can work with a different device, as well as independently work with CUDA libraries like CUBLAS and CUFFT.

-

Note that this support is experimental, and lacks certain features to be fully effective. For one, the CuArrays memory allocator is not device-aware, and it is currently not possible to configure the CUDA stream for operations like map or broadcast.

-

Relevant PRs: CUDAnative.jl#609 and CuArrays.jl#645

-

Minor changes

-

GPU kernels are now name-mangled like C++, which offers better integration with NVIDIA tools (CUDAnative.jl#559).

-

A better N-dimensional mapreducedim! kernel, properly integrating with all Base interfaces (CuArrays.jl#602 and GPUArrays#246).

-

A CuIterator type for batching arrays to the GPU (by @jrevels, CuArrays.jl#467).

-

Integration with Base's 5-arg mul! (by @haampie, CuArrays.jl#641 and GPUArrays#253).

-

Integration with Cthulhu.jl for interactive inspection of generated code (CUDAnative.jl#597).

-

Known issues

-

With a release as big as this one there are bound to be some bugs, e.g., with the installation of artifacts on exotic systems, or due to the many changes to make the libraries thread-safe. If you need absolute stability, please wait for a point release.

-

There are also some known issues. CUDAnative is currently not compatible with Julia 1.5 due to Base compiler changes (julia#34993), the new mapreducedim! kernel appears to be slower in some cases (CuArrays.jl#611), and there are some remaining thread-safety issues when using the non-default memory pool (CuArrays.jl#647).

-]]>
- - Wed, 25 Mar 2020 00:00:00 +0000 - - - - Tim Besard - - -
- - - <![CDATA[New website for JuliaGPU]]> - https://juliagpu.org/post/2019-12-12-new_site/index.html - https://juliagpu.org/new_site/ - - - This post is located at /new_site/

-

Welcome to the new landing page for the JuliaGPU organization. This website serves as an introduction to the several packages for programming GPUs in Julia, with pointers to relevant resources for new users.

-

The sources for this website are hosted at GitHub and generated using Hugo; feel free to open an issue or pull request if you think it could be improved.

-]]>
- - Thu, 12 Dec 2019 00:00:00 +0000 - - - - Tim Besard - - -
-
\ No newline at end of file diff --git a/previews/PR44/robots.txt b/previews/PR44/robots.txt deleted file mode 100644 index 959563f..0000000 --- a/previews/PR44/robots.txt +++ /dev/null @@ -1,4 +0,0 @@ -Sitemap: https://juliagpu.org/sitemap.xml - -User-agent: * -Disallow: diff --git a/previews/PR44/rocm/index.html b/previews/PR44/rocm/index.html deleted file mode 100644 index 6d61dae..0000000 --- a/previews/PR44/rocm/index.html +++ /dev/null @@ -1,203 +0,0 @@ - - - - - - - - - - - - - - - - - - - AMD ROCm ⋅ JuliaGPU - - - - - - -
-
- - - - -

AMD ROCm

- -

- - - - - - -

- -

The Julia programming support for AMD GPUs based on the ROCm platform aims to provide capabilities similar to the NVIDIA CUDA stack, with support for both low-level kernel programming and an array-oriented interface. AMDGPU.jl offers performance comparable to HIP C++. The toolchain can easily be installed on the latest version of Julia using the integrated package manager.

-

AMDGPU.jl makes it possible to program AMD GPUs at different abstraction levels:

-
    -
  • by using the ROCArray type, providing a user-friendly yet powerful abstraction that does not require any GPU programming experience;

    -
  • -
  • by writing ROC kernels, with performance similar to that of kernels written in HIP C++;

    -
  • -
  • by interfacing with HIP APIs and libraries directly, offering a level of flexibility similar to what you would expect from a C-based programming environment.

    -
  • -
-

The documentation of AMDGPU.jl demonstrates each of these approaches.

-

Performance

-

Julia on the CPU is known for its good performance, approaching that of statically compiled languages like C. The same holds for programming AMD GPUs with kernels written using AMDGPU.jl, where we show preliminary performance to approach that of HIP C++ on a memcopy and 2D diffusion kernel:

- -
- - - -
-

- Preliminary performance of a memcopy and 2D diffusion kernel implemented in - Julia with AMDGPU.jl and executed on a MI250x GPU. -

-
-
- - -
-
- - - - - - - - - - - - - - - - - diff --git a/previews/PR44/sitemap.xml b/previews/PR44/sitemap.xml deleted file mode 100644 index cdd596c..0000000 --- a/previews/PR44/sitemap.xml +++ /dev/null @@ -1,58 +0,0 @@ - - - - - https://juliagpu.org/post/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/rocm/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/learn/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/404/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/cuda/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/oneapi/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/metal/index.html - 2024-05-28 - monthly - 0.5 - - - https://juliagpu.org/other/index.html - 2024-05-28 - monthly - 0.5 - - diff --git a/sitemap.xml b/sitemap.xml index aaacda4..cdd596c 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -3,55 +3,55 @@ https://juliagpu.org/post/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/rocm/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/learn/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/404/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/cuda/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/oneapi/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/metal/index.html - 2024-05-24 + 2024-05-28 monthly 0.5 https://juliagpu.org/other/index.html - 2024-05-24 + 2024-05-28 monthly 0.5