Всякое

2025-09-20 18:48:55 +03:00
parent 95e5aab32d
commit 8f705bb7b8
9 changed files with 634 additions and 35 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -2,29 +2,47 @@
    "version": "0.2.0",
    "configurations": [
        {
+            "name": "<file>: lldb x64",
            "type": "lldb",
            "request": "launch",
-            "name": "<file>: lldb debug x64",
            "program": "${fileDirname}/build/${fileBasenameNoExtension}",
            "cwd": "${fileDirname}/build",
-            "preLaunchTask": "asm64"
+            "preLaunchTask": "asm64",
+            
        },
        {
-            "type": "lldb",
+            "name": "<file>: cppdbg x64",
+            "type": "cppdbg",
            "request": "launch",
-            "name": "<file>: lldb+GCC debug x64",
            "program": "${fileDirname}/build/${fileBasenameNoExtension}",
            "cwd": "${fileDirname}/build",
-            "preLaunchTask": "asm64+gcc"
+            "preLaunchTask": "asm64",
+    
        },
-
        {
+            "name": "<file>: by-gdb x64",
+            "type": "by-gdb",
+            "request": "launch",
+            "program": "${fileDirname}/build/${fileBasenameNoExtension}",
+            "cwd": "${fileDirname}/build",
+            "preLaunchTask": "asm64",
+    
+        },
+        {
+            "name": "<file>: gdb x64",
+            "type": "gdb",
+            "request": "launch",
+            "program": "${fileDirname}/build/${fileBasenameNoExtension}",
+            "preLaunchTask": "asm64",
+    
+        },
+        {
+            "name": "<file>: lldb+GCC x64",
            "type": "lldb",
            "request": "launch",
-            "name": "minimal: lldb debug x64",
-            "program": "${workspaceFolder}/minimal/build/minimal",
-            "cwd": "${workspaceFolder}/minimal/build"
-            // TODO тут требуется ещё указать задачу сборки minimal, но её ещё нет
-        },
+            "program": "${fileDirname}/build/${fileBasenameNoExtension}",
+            "cwd": "${fileDirname}/build",
+            "preLaunchTask": "asm64+gcc",
+        }
    ]
 }
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,5 @@
 {
-    "debug.allowBreakpointsEverywhere": true
+    "debug.allowBreakpointsEverywhere": true,
+    "debug.inlineValues": "on",
+    "cmake.sourceDirectory": "${workspaceFolder}/casm"
 }
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -6,10 +6,10 @@
            "type": "shell",
            "command": [
                "builddir=${fileDirname}/build;",
-                "mkdir -p $builddir;", 
+                "mkdir -p $builddir;",
                "rawfilename=$builddir/${fileBasenameNoExtension};",
-                "nasm -F dwarf -g -f elf64 -i ${fileDirname} -o $rawfilename.o ${file};",
-                "ld -m elf_x86_64 -o $rawfilename $rawfilename.o;"
+                "nasm -gdwarf -f elf64 -o $rawfilename.o ${file};",
+                "ld -g -m elf_x86_64 -o $rawfilename $rawfilename.o;"
            ],
            "problemMatcher": {
                "pattern": {
@@ -27,17 +27,16 @@
                "isDefault": true
            }
        },
-
        {
            "label": "asm64+gcc",
            "type": "shell",
            "command": [
-                        "builddir=${fileDirname}/build;",
-                        "mkdir -p $builddir;",
-                        "rawfilename=$builddir/${fileBasenameNoExtension};",
-                        "nasm -F dwarf -g -f elf64 -i ${fileDirname} -o $rawfilename.o ${file};",
-                        "gcc -m64 -o $rawfilename $rawfilename.o;"
-                    ],
+                "builddir=${fileDirname}/build;",
+                "mkdir -p $builddir;",
+                "rawfilename=$builddir/${fileBasenameNoExtension};",
+                "nasm -gdwarf -f elf64 -o $rawfilename.o ${file};",
+                "gcc -o $rawfilename $rawfilename.o;"
+            ],
            "problemMatcher": {
                "pattern": {
                    "regexp": "error"
@@ -49,9 +48,6 @@
                "reveal": "silent",
                "clear": true
            },
-            "group": {
-                "kind": "build"
-            }
        }
    ]
-}
+}
--- a/casm/CMakeLists.txt
+++ b/casm/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 3.14)
+
+project(casm)  # no LANGUAGES listed, so CMake enables its default languages here
+
+enable_language(ASM_NASM)  # additionally enable NASM so the .asm source can be assembled
+
+set(CMAKE_ASM_NASM_FLAGS "-gdwarf -f elf64")  # NASM flags: DWARF debug info, ELF64 object format
+
+add_executable(casm asm.asm c.c)  # single executable built from the NASM source and the C source
--- a/casm/asm.asm
+++ b/casm/asm.asm
@@ -0,0 +1,7 @@
+global main                 ; export main so the linker can resolve it
+
+section .text
+main:                       ; terminates the process immediately with exit status 0
+    xor edi, edi            ; rdi = 0 (exit status); a 32-bit write zero-extends into rdi
+    mov eax, 60             ; rax = 60, the exit syscall number on Linux x86-64
+    syscall                 ; exit(0) - control does not return here
--- a/casm/c.c
+++ b/casm/c.c
@@ -0,0 +1,3 @@
+int sum(int a, int b) { /* Returns the sum of a and b. */
+    return a+b;
+}
--- a/minimal/minimal.asm
+++ b/minimal/minimal.asm
@@ -1,16 +1,11 @@
 global _start

 section .data
-text_message: db "Welcome to NASM!", 0xA
+nums dq 112, 113, 114, 115, 116  ; array of qword values
+currentAddr dq $                 ; qword that stores its own address ($ = address of this line)

 section .text
 _start:
-    mov     rax, 1         ; write
-    mov     rdi, 1         ; stdout
-    mov     rsi, text_message
-    mov     rdx, 17        ; длина строки в байтах
-    syscall
-
-    mov     rax, 60        ; exit
-    mov     rdi, 1       ; код возврата = 0
+    mov rdi, [currentAddr - 8]  ; rdi = qword 8 bytes before currentAddr, i.e. the last element of nums (116)
+    mov rax, 60                 ; syscall number 60 (exit); rdi carries the exit status
    syscall
--- a/x86-64_Registers.png
+++ b/x86-64_Registers.png
--- a/x86-64_asm_sheet.md
+++ b/x86-64_asm_sheet.md
@@ -0,0 +1,569 @@
+# x86-64 ASM sheet
+
+## Addressing
+
+* No segmentation (except for `fs` and `gs` for special purposes like threading)
+
+* Relative to base register
+  * used for **data on the stack**, **arrays**, **structs** and **class members**
+  * [`base` + `index` * `scale` + `immediate_offset`]
+  * `base` is mandatory, can be any 64-bit register
+  * `index` can be any 64-bit register except `rsp`
+  * `scale` can be 1, 2, 4, or 8
+  * `immediate_offset` (called displacement with Gas) relative to the base register
+  * Gas syntax is `immediate_offset(base, index, scale)`
+
+* RIP-relative (a.k.a. PC-relative)
+  * used for **static data**
+  * contains a 32-bit sign-extended offset relative to the instruction pointer
+  * explicitly specified using `mov eax, [rel label]` or the `default rel` / `default abs` directives with NASM (uses 32-bit absolute addressing otherwise)
+  * explicitly specified using `mov label(%rip), %eax` with Gas
+
+* 32-bit absolute
+  * 32 bits constant address sign-extended to 64 bits
+  * works for addresses below 2^31
+  * don't use for simple memory operands since RIP-relative addressing is shorter, faster (no need for relocations) and works everywhere
+  * used to access **static arrays with an index register** like `mov ebx, [intarray + rsi*4]` though it doesn't work for Windows and Linux DLLs and for MacOSX exes and DLLs because addresses are above 2^32 (it is used by gcc and clang for Linux exes, an image base relative addressing is used on Windows exes by MASM)
+  * an alternative that works everywhere is first loading the static array address into `rbx` using `lea` with a RIP-relative address and then address relatively from this base register (`lea rbx, [array]` then `mov eax, [rbx + rcx*4]`), other static arrays can then be accessed relatively (`mov [(array2-array1) + rbx + rcx*4], eax`)
+
+* 64-bit absolute
+  * `mov eax, dword [qword a]`
+  * can only be used with `mov` and registers `al`, `ax`, `eax` or `rax` (src or dst)
+  * can't contain a segment, base or index register
+
+## Position-Independent Code (PIC)
+
+* Easier and faster than the 32-bit Global Offset Table (GOT) technique since RIP-relative is position independent (note that the technique to access static arrays with an index register described earlier is position independent too)
+
+## General purpose registers
+
+ bit 0 - 63 | bit 0 - 31 | bit 0 - 15 | bit 8 - 15 | bit 0 - 7 
+:----------:|:----------:|:----------:|:----------:|:---------:
+ `rax`      | `eax`      | `ax`       | `ah`       | `al`
+ `rbx`      | `ebx`      | `bx`       | `bh`       | `bl`
+ `rcx`      | `ecx`      | `cx`       | `ch`       | `cl`
+ `rdx`      | `edx`      | `dx`       | `dh`       | `dl`
+ `rsi`      | `esi`      | `si`       |            | `sil`
+ `rdi`      | `edi`      | `di`       |            | `dil`
+ `rbp`      | `ebp`      | `bp`       |            | `bpl`
+ `rsp`      | `esp`      | `sp`       |            | `spl`
+ `r8`       | `r8d`      | `r8w`      |            | `r8b`
+ `r9`       | `r9d`      | `r9w`      |            | `r9b`
+ `r10`      | `r10d`     | `r10w`     |            | `r10b`
+ `r11`      | `r11d`     | `r11w`     |            | `r11b`
+ `r12`      | `r12d`     | `r12w`     |            | `r12b`
+ `r13`      | `r13d`     | `r13w`     |            | `r13b`
+ `r14`      | `r14d`     | `r14w`     |            | `r14b`
+ `r15`      | `r15d`     | `r15w`     |            | `r15b`
+ `rflags`   |            | `flags`    |            |
+ `rip`      |            |            |            |
+
+## `rflags` register
+
+* **CF (Carry Flag, bit 0)** — Set if an arithmetic operation generates a carry or a borrow out of the most-significant bit of the result; cleared otherwise. This flag indicates an overflow condition for unsigned-integer arithmetic. It is also used in multiple-precision arithmetic.
+* **PF (Parity Flag, bit 2)** — Set if the least-significant byte of the result contains an even number of 1 bits; cleared otherwise.
+* **AF (Auxiliary carry Flag, bit 4)** — Set if an arithmetic operation generates a carry or a borrow out of bit 3 of the result; cleared otherwise. This flag is used in binary-coded decimal (BCD) arithmetic.
+* **ZF (Zero Flag, bit 6)** — Set if the result is zero; cleared otherwise.
+* **SF (Sign Flag, bit 7)** — Set equal to the most-significant bit of the result, which is the sign bit of a signed integer. (0 indicates a positive value and 1 indicates a negative value.)
+* **OF (Overflow Flag, bit 11)** — Set if the integer result is too large a positive number or too small a negative number (excluding the sign-bit) to fit in the destination operand; cleared otherwise. This flag indicates an overflow condition for signed-integer (two’s complement) arithmetic.
+
+## Saturation and wraparound modes (of the instruction set)
+
+* **Wraparound arithmetic** — With wraparound arithmetic, a true out-of-range result is truncated (that is, the carry or overflow bit is ignored and only the least significant bits of the result are returned to the destination). Wraparound arithmetic is suitable for applications that control the range of operands to prevent out-of-range results. If the range of operands is not controlled, however, wraparound arithmetic can lead to large errors. For example, adding two large signed numbers can cause positive overflow and produce a negative result.
+* **Signed saturation arithmetic** — With signed saturation arithmetic, out-of-range results are limited to the representable range of signed integers for the integer size being operated on. For example, if positive overflow occurs when operating on signed word integers, the result is saturated to 7FFFH, which is the largest positive integer that can be represented in 16 bits; if negative overflow occurs, the result is saturated to 8000H.
+* **Unsigned saturation arithmetic** — With unsigned saturation arithmetic, out-of-range results are limited to the representable range of unsigned integers for the integer size. So, positive overflow when operating on unsigned byte integers results in FFH being returned and negative overflow results in 00H being returned.
+
+## Stack frames
+
+## Data transfer instructions
+
+* [**MOV**](http://www.felixcloutier.com/x86/MOV.html) — Move data between general-purpose registers; move data between memory and general-purpose or segment registers; move immediates to general-purpose registers.
+* [**CMOVcc**](http://www.felixcloutier.com/x86/CMOVcc.html) — Conditional move.
+* [**XCHG**](http://www.felixcloutier.com/x86/XCHG.html) — Exchange.
+* [**BSWAP**](http://www.felixcloutier.com/x86/BSWAP.html) — Byte swap.
+* [**XADD**](http://www.felixcloutier.com/x86/XADD.html) — Exchange and add.
+* [**CMPXCHG**](http://www.felixcloutier.com/x86/CMPXCHG.html) — Compare and exchange.
+* [**CMPXCHG8B / CMPXCHG16B**](http://www.felixcloutier.com/x86/CMPXCHG8B:CMPXCHG16B.html) — Compare and exchange 8/16 bytes.
+* [**PUSH**](http://www.felixcloutier.com/x86/PUSH.html) — Push onto stack.
+* [**POP**](http://www.felixcloutier.com/x86/POP.html) — Pop off of stack.
+* [**PUSHA / PUSHAD**](http://www.felixcloutier.com/x86/PUSHA:PUSHAD.html) — Push general-purpose registers onto stack.
+* [**POPA / POPAD**](http://www.felixcloutier.com/x86/POPA:POPAD.html) — Pop general-purpose registers from stack.
+* [**CWD / CDQ / CQO**](http://www.felixcloutier.com/x86/CWD:CDQ:CQO.html) — Convert word to doubleword/Convert doubleword to quadword.
+* [**CBW / CWDE / CDQE**](http://www.felixcloutier.com/x86/CBW:CWDE:CDQE.html) — Convert byte to word/Convert word to doubleword in `rax` register.
+* [**MOVSX / MOVSXD**](http://www.felixcloutier.com/x86/MOVSX:MOVSXD.html) — Move and sign extend.
+* [**MOVZX**](http://www.felixcloutier.com/x86/MOVZX.html) — Move and zero extend.
+
+## Binary arithmetic instructions
+
+* [**ADCX**](http://www.felixcloutier.com/x86/ADCX.html) — Unsigned integer add with carry.
+* [**ADOX**](http://www.felixcloutier.com/x86/ADOX.html) — Unsigned integer add with overflow.
+* [**ADD**](http://www.felixcloutier.com/x86/ADD.html) — Integer add.
+* [**ADC**](http://www.felixcloutier.com/x86/ADC.html) — Add with carry.
+* [**SUB**](http://www.felixcloutier.com/x86/SUB.html) — Subtract.
+* [**SBB**](http://www.felixcloutier.com/x86/SBB.html) — Subtract with borrow.
+* [**IMUL**](http://www.felixcloutier.com/x86/IMUL.html) — Signed multiply.
+* [**MUL**](http://www.felixcloutier.com/x86/MUL.html) — Unsigned multiply.
+* [**IDIV**](http://www.felixcloutier.com/x86/IDIV.html) — Signed divide.
+* [**DIV**](http://www.felixcloutier.com/x86/DIV.html) — Unsigned divide.
+* [**INC**](http://www.felixcloutier.com/x86/INC.html) — Increment.
+* [**DEC**](http://www.felixcloutier.com/x86/DEC.html) — Decrement.
+* [**NEG**](http://www.felixcloutier.com/x86/NEG.html) — Negate.
+* [**CMP**](http://www.felixcloutier.com/x86/CMP.html) — Compare.
+
+## Logical instructions
+
+* [**AND**](http://www.felixcloutier.com/x86/AND.html) — Perform bitwise logical AND.
+* [**OR**](http://www.felixcloutier.com/x86/OR.html) — Perform bitwise logical OR.
+* [**XOR**](http://www.felixcloutier.com/x86/XOR.html) — Perform bitwise logical exclusive OR.
+* [**NOT**](http://www.felixcloutier.com/x86/NOT.html) — Perform bitwise logical NOT.
+
+## Shift and rotate instructions
+
+* [**SAL / SAR / SHL / SHR**](http://www.felixcloutier.com/x86/SAL:SAR:SHL:SHR.html) — Shift arithmetic/logical left/right.
+* [**SHLD**](http://www.felixcloutier.com/x86/SHLD.html) — Shift left double.
+* [**SHRD**](http://www.felixcloutier.com/x86/SHRD.html) — Shift right double.
+* [**RCL / RCR / ROL / ROR**](http://www.felixcloutier.com/x86/RCL:RCR:ROL:ROR.html) — Rotate left/right and rotate left/right through carry.
+
+## Bit and byte instructions
+
+* [**BT**](http://www.felixcloutier.com/x86/BT.html) — Bit test.
+* [**BTS**](http://www.felixcloutier.com/x86/BTS.html) — Bit test and set.
+* [**BTR**](http://www.felixcloutier.com/x86/BTR.html) — Bit test and reset.
+* [**BTC**](http://www.felixcloutier.com/x86/BTC.html) — Bit test and complement.
+* [**BSF**](http://www.felixcloutier.com/x86/BSF.html) — Bit scan forward.
+* [**BSR**](http://www.felixcloutier.com/x86/BSR.html) — Bit scan reverse.
+* [**SETcc**](http://www.felixcloutier.com/x86/SETcc.html) — Set byte on condition.
+* [**TEST**](http://www.felixcloutier.com/x86/TEST.html) — Logical compare.
+* [**CRC32**](http://www.felixcloutier.com/x86/CRC32.html) — Provides hardware acceleration to calculate cyclic redundancy checks for fast and efficient implementation of data integrity protocols.
+* [**POPCNT**](http://www.felixcloutier.com/x86/POPCNT.html) — This instruction calculates the number of bits set to 1 in the second operand (source) and returns the count in the first operand (a destination register).
+
+## Control transfer instructions
+
+* [**JMP**](http://www.felixcloutier.com/x86/JMP.html) — Jump.
+* [**Jcc**](http://www.felixcloutier.com/x86/Jcc.html) — Jump if condition is met (RIP-relative operand).
+* [**LOOP / LOOPcc**](http://www.felixcloutier.com/x86/LOOP:LOOPcc.html) — Loop with `rcx` counter.
+* [**CALL**](http://www.felixcloutier.com/x86/CALL.html) — Call procedure.
+* [**RET**](http://www.felixcloutier.com/x86/RET.html) — Return.
+* [**IRET / IRETD / IRETQ**](http://www.felixcloutier.com/x86/IRET:IRETD.html) — Return from interrupt.
+* [**INT n / INTO / INT 3**](http://www.felixcloutier.com/x86/INTn:INTO:INT3.html) — Call to interrupt procedure.
+* [**ENTER**](http://www.felixcloutier.com/x86/ENTER.html) — High-level procedure entry.
+* [**LEAVE**](http://www.felixcloutier.com/x86/LEAVE.html) — High-level procedure exit.
+
+## String instructions
+
+* [**MOVS / MOVSB / MOVSW / MOVSD / MOVSQ**](http://www.felixcloutier.com/x86/MOVS:MOVSB:MOVSW:MOVSD:MOVSQ.html) — Move data from string to string.
+* [**CMPS / CMPSB / CMPSW / CMPSD / CMPSQ**](http://www.felixcloutier.com/x86/CMPS:CMPSB:CMPSW:CMPSD:CMPSQ.html) — Compare string operands.
+* [**SCAS / SCASB / SCASW / SCASD**](http://www.felixcloutier.com/x86/SCAS:SCASB:SCASW:SCASD.html) — Scan string.
+* [**LODS / LODSB / LODSW / LODSD / LODSQ**](http://www.felixcloutier.com/x86/LODS:LODSB:LODSW:LODSD:LODSQ.html) — Load string.
+* [**STOS / STOSB / STOSW / STOSD / STOSQ**](http://www.felixcloutier.com/x86/STOS:STOSB:STOSW:STOSD:STOSQ.html) — Store string.
+* [**REP / REPE / REPZ / REPNE / REPNZ**](http://www.felixcloutier.com/x86/REP:REPE:REPZ:REPNE:REPNZ.html) — Repeat string operation prefix.
+
+## `rflags` control instructions
+
+* [**STC**](http://www.felixcloutier.com/x86/STC.html) — Set carry flag.
+* [**CLC**](http://www.felixcloutier.com/x86/CLC.html) — Clear the carry flag.
+* [**CMC**](http://www.felixcloutier.com/x86/CMC.html) — Complement the carry flag.
+* [**CLD**](http://www.felixcloutier.com/x86/CLD.html) — Clear the direction flag.
+* [**STD**](http://www.felixcloutier.com/x86/STD.html) — Set direction flag.
+* [**LAHF**](http://www.felixcloutier.com/x86/LAHF.html) — Load flags into `ah` register.
+* [**SAHF**](http://www.felixcloutier.com/x86/SAHF.html) — Store `ah` register into flags.
+* [**PUSHF / PUSHFQ**](http://www.felixcloutier.com/x86/PUSHF:PUSHFD:PUSHFQ.html) — Push `rflags` onto stack.
+* [**POPF / POPFQ**](http://www.felixcloutier.com/x86/POPF:POPFD:POPFQ.html) — Pop `rflags` from stack.
+* [**STI**](http://www.felixcloutier.com/x86/STI.html) — Set interrupt flag.
+* [**CLI**](http://www.felixcloutier.com/x86/CLI.html) — Clear the interrupt flag.
+
+## Miscellaneous instructions
+
+* [**LEA**](http://www.felixcloutier.com/x86/LEA.html) — Load effective address.
+* [**NOP**](http://www.felixcloutier.com/x86/NOP.html) — No operation.
+* [**UD**](http://www.felixcloutier.com/x86/UD.html) — Undefined instruction.
+* [**XLAT / XLATB**](http://www.felixcloutier.com/x86/XLAT:XLATB.html) — Table lookup translation.
+* [**CPUID**](http://www.felixcloutier.com/x86/CPUID.html) — Processor identification.
+* [**MOVBE**](http://www.felixcloutier.com/x86/MOVBE.html) — Move data after swapping data bytes.
+* [**PREFETCHW**](http://www.felixcloutier.com/x86/PREFETCHW.html) — Prefetch data into cache in anticipation of write.
+* [**CLFLUSH**](http://www.felixcloutier.com/x86/CLFLUSH.html) — Flushes and invalidates a memory operand and its associated cache line from all levels of the processor’s cache hierarchy.
+* [**CLFLUSHOPT**](http://www.felixcloutier.com/x86/CLFLUSHOPT.html) — Flushes and invalidates a memory operand and its associated cache line from all levels of the processor’s cache hierarchy with optimized memory system throughput.
+* [**RDRAND**](http://www.felixcloutier.com/x86/RDRAND.html) — Retrieves a random number generated from hardware.
+* [**RDSEED**](http://www.felixcloutier.com/x86/RDSEED.html) — Seed the random number generator from hardware.
+
+## User-mode extended states save/restore instructions
+
+* [**XSAVE**](http://www.felixcloutier.com/x86/XSAVE.html) — Save processor extended states to memory.
+* [**XSAVEC**](http://www.felixcloutier.com/x86/XSAVEC.html) — Save processor extended states with compaction to memory.
+* [**XSAVEOPT**](http://www.felixcloutier.com/x86/XSAVEOPT.html) — Save processor extended states to memory, optimized.
+* [**XRSTOR**](http://www.felixcloutier.com/x86/XRSTOR.html) — Restore processor extended states from memory.
+* [**XGETBV**](http://www.felixcloutier.com/x86/XGETBV.html) — Reads the state of an extended control register.
+
+## Bit manipulation instructions (BMI1, BMI2)
+
+* [**ANDN**](http://www.felixcloutier.com/x86/ANDN.html) — Bitwise AND of first source with inverted 2nd source operands.
+* [**BEXTR**](http://www.felixcloutier.com/x86/BEXTR.html) — Contiguous bitwise extract.
+* [**BLSI**](http://www.felixcloutier.com/x86/BLSI.html) — Extract lowest set bit.
+* [**BLSMSK**](http://www.felixcloutier.com/x86/BLSMSK.html) — Set all lower bits below first set bit to 1.
+* [**BLSR**](http://www.felixcloutier.com/x86/BLSR.html) — Reset lowest set bit.
+* [**BZHI**](http://www.felixcloutier.com/x86/BZHI.html) — Zero high bits starting from specified bit position.
+* [**LZCNT**](http://www.felixcloutier.com/x86/LZCNT.html) — Count the number of leading zero bits.
+* [**MULX**](http://www.felixcloutier.com/x86/MULX.html) — Unsigned multiply without affecting arithmetic flags.
+* [**PDEP**](http://www.felixcloutier.com/x86/PDEP.html) — Parallel deposit of bits using a mask.
+* [**PEXT**](http://www.felixcloutier.com/x86/PEXT.html) — Parallel extraction of bits using a mask.
+* [**RORX**](http://www.felixcloutier.com/x86/RORX.html) — Rotate right without affecting arithmetic flags.
+* [**SARX / SHLX / SHRX**](http://www.felixcloutier.com/x86/SARX:SHLX:SHRX.html) — Shift arithmetic/logic left/right without affecting flags.
+* [**TZCNT**](http://www.felixcloutier.com/x86/TZCNT.html) — Count the number of trailing zero bits.
+
+## x87 FPU overview
+
+* x87 FPU state is aliased to the MMX state, care must be taken when making transitions to MMX instructions to prevent incoherent or unexpected results.
+
+## x87 FPU data transfer instructions
+
+* [**FLD**](http://www.felixcloutier.com/x86/FLD.html) — Load floating-point value.
+* [**FST / FSTP**](http://www.felixcloutier.com/x86/FST:FSTP.html) — Store floating-point value without/with pop.
+* [**FILD**](http://www.felixcloutier.com/x86/FILD.html) — Load integer.
+* [**FIST / FISTP**](http://www.felixcloutier.com/x86/FIST:FISTP.html) — Store integer without/with pop.
+* [**FBLD**](http://www.felixcloutier.com/x86/FBLD.html) — Load BCD.
+* [**FBSTP**](http://www.felixcloutier.com/x86/FBSTP.html) — Store BCD and pop.
+* [**FXCH**](http://www.felixcloutier.com/x86/FXCH.html) — Exchange registers.
+* [**FCMOVcc**](http://www.felixcloutier.com/x86/FCMOVcc.html) — Floating-point conditional move.
+
+## x87 FPU basic arithmetic instructions
+
+* [**FADD / FADDP / FIADD**](http://www.felixcloutier.com/x86/FADD:FADDP:FIADD.html) — Add floating-point.
+* [**FSUB / FSUBP / FISUB**](http://www.felixcloutier.com/x86/FSUB:FSUBP:FISUB.html) — Subtract floating-point.
+* [**FSUBR / FSUBRP / FISUBR**](http://www.felixcloutier.com/x86/FSUBR:FSUBRP:FISUBR.html) — Subtract floating-point reverse.
+* [**FMUL / FMULP / FIMUL**](http://www.felixcloutier.com/x86/FMUL:FMULP:FIMUL.html) — Multiply floating-point.
+* [**FDIV / FDIVP / FIDIV**](http://www.felixcloutier.com/x86/FDIV:FDIVP:FIDIV.html) — Divide floating-point.
+* [**FDIVR / FDIVRP / FIDIVR**](http://www.felixcloutier.com/x86/FDIVR:FDIVRP:FIDIVR.html) — Divide floating-point reverse.
+* [**FPREM**](http://www.felixcloutier.com/x86/FPREM.html) — Partial remainder.
+* [**FPREM1**](http://www.felixcloutier.com/x86/FPREM1.html) — IEEE Partial remainder.
+* [**FABS**](http://www.felixcloutier.com/x86/FABS.html) — Absolute value.
+* [**FCHS**](http://www.felixcloutier.com/x86/FCHS.html) — Change sign.
+* [**FRNDINT**](http://www.felixcloutier.com/x86/FRNDINT.html) — Round to integer.
+* [**FSCALE**](http://www.felixcloutier.com/x86/FSCALE.html) — Scale by power of two.
+* [**FSQRT**](http://www.felixcloutier.com/x86/FSQRT.html) — Square root.
+* [**FXTRACT**](http://www.felixcloutier.com/x86/FXTRACT.html) — Extract exponent and significand.
+
+## x87 FPU comparison instructions
+
+* [**FCOM / FCOMP / FCOMPP**](http://www.felixcloutier.com/x86/FCOM:FCOMP:FCOMPP.html) — Compare floating-point.
+* [**FUCOM / FUCOMP / FUCOMPP**](http://www.felixcloutier.com/x86/FUCOM:FUCOMP:FUCOMPP.html) — Unordered compare floating-point.
+* [**FICOM / FICOMP**](http://www.felixcloutier.com/x86/FICOM:FICOMP.html) — Compare integer.
+* [**FCOMI / FCOMIP / FUCOMI / FUCOMIP**](http://www.felixcloutier.com/x86/FCOMI:FCOMIP:FUCOMI:FUCOMIP.html) — Compare floating-point and set `rflags`.
+* [**FTST**](http://www.felixcloutier.com/x86/FTST.html) — Test floating-point (compare with 0.0).
+* [**FXAM**](http://www.felixcloutier.com/x86/FXAM.html) — Examine floating-point.
+
+## x87 FPU transcendental instructions
+
+* [**FSIN**](http://www.felixcloutier.com/x86/FSIN.html) — Sine.
+* [**FCOS**](http://www.felixcloutier.com/x86/FCOS.html) — Cosine.
+* [**FSINCOS**](http://www.felixcloutier.com/x86/FSINCOS.html) — Sine and cosine.
+* [**FPTAN**](http://www.felixcloutier.com/x86/FPTAN.html) — Partial tangent.
+* [**FPATAN**](http://www.felixcloutier.com/x86/FPATAN.html) — Partial arctangent.
+* [**F2XM1**](http://www.felixcloutier.com/x86/F2XM1.html) — 2^x − 1.
+* [**FYL2X**](http://www.felixcloutier.com/x86/FYL2X.html) — y ∗ log2(x).
+* [**FYL2XP1**](http://www.felixcloutier.com/x86/FYL2XP1.html) — y ∗ log2(x + 1).
+
+## x87 FPU load constants instructions
+
+* [**FLD1 / FLDL2T / FLDL2E / FLDPI / FLDLG2 / FLDLN2 / FLDZ**](http://www.felixcloutier.com/x86/FLD1:FLDL2T:FLDL2E:FLDPI:FLDLG2:FLDLN2:FLDZ.html) — Load constants.
+
+## x87 FPU control instructions
+
+* [**FINCSTP**](http://www.felixcloutier.com/x86/FINCSTP.html) — Increment FPU register stack pointer.
+* [**FDECSTP**](http://www.felixcloutier.com/x86/FDECSTP.html) — Decrement FPU register stack pointer.
+* [**FFREE**](http://www.felixcloutier.com/x86/FFREE.html) — Free floating-point register.
+* [**FINIT / FNINIT**](http://www.felixcloutier.com/x86/FINIT:FNINIT.html) — Initialize FPU.
+* [**FCLEX / FNCLEX**](http://www.felixcloutier.com/x86/FCLEX:FNCLEX.html) — Clear floating-point exception flags.
+* [**FSTCW / FNSTCW**](http://www.felixcloutier.com/x86/FSTCW:FNSTCW.html) — Store FPU control word.
+* [**FLDCW**](http://www.felixcloutier.com/x86/FLDCW.html) — Load FPU control word.
+* [**FSTENV / FNSTENV**](http://www.felixcloutier.com/x86/FSTENV:FNSTENV.html) — Store FPU environment.
+* [**FLDENV**](http://www.felixcloutier.com/x86/FLDENV.html) — Load FPU environment.
+* [**FSAVE / FNSAVE**](http://www.felixcloutier.com/x86/FSAVE:FNSAVE.html) — Save FPU state.
+* [**FRSTOR**](http://www.felixcloutier.com/x86/FRSTOR.html) — Restore FPU state.
+* [**FSTSW / FNSTSW**](http://www.felixcloutier.com/x86/FSTSW:FNSTSW.html) — Store FPU status word.
+* [**WAIT / FWAIT**](http://www.felixcloutier.com/x86/WAIT:FWAIT.html) — Wait for FPU.
+* [**FNOP**](http://www.felixcloutier.com/x86/FNOP.html) — FPU no operation.
+
+## x87 FPU and SIMD state management instructions
+
+* [**FXSAVE**](http://www.felixcloutier.com/x86/FXSAVE.html) — Save x87 FPU and SIMD state.
+* [**FXRSTOR**](http://www.felixcloutier.com/x86/FXRSTOR.html) — Restore x87 FPU and SIMD state.
+
+## MMX overview
+
+* SIMD execution model to handle 64-bit packed integer data.
+* Eight new 64-bit data registers, called MMX registers.
+* Three new packed data types:
+  * 64-bit packed byte integers (signed and unsigned)
+  * 64-bit packed word integers (signed and unsigned)
+  * 64-bit packed doubleword integers (signed and unsigned)
+* MMX state is aliased to the x87 FPU state, care must be taken when making transitions to x87 FPU instructions to prevent incoherent or unexpected results.
+
+## MMX data transfer instructions
+
+* [**MOVD / MOVQ**](http://www.felixcloutier.com/x86/MOVD:MOVQ.html) — Move doubleword/quadword from/to MMX registers.
+
+## MMX conversion instructions
+
+* [**PACKSSWB / PACKSSDW**](http://www.felixcloutier.com/x86/PACKSSWB:PACKSSDW.html) — Pack words/doublewords into bytes with signed saturation.
+* [**PACKUSWB**](http://www.felixcloutier.com/x86/PACKUSWB.html) — Pack words into bytes with unsigned saturation.
+* [**PUNPCKHBW / PUNPCKHWD / PUNPCKHDQ**](http://www.felixcloutier.com/x86/PUNPCKHBW:PUNPCKHWD:PUNPCKHDQ:PUNPCKHQDQ.html) — Unpack high-order bytes/words/doublewords.
+* [**PUNPCKLBW / PUNPCKLWD / PUNPCKLDQ**](http://www.felixcloutier.com/x86/PUNPCKLBW:PUNPCKLWD:PUNPCKLDQ:PUNPCKLQDQ.html) — Unpack low-order bytes/words/doublewords.
+
+## MMX packed arithmetic instructions
+
+* [**PADDB / PADDW / PADDD**](http://www.felixcloutier.com/x86/PADDB:PADDW:PADDD:PADDQ.html) — Add packed byte/word/doubleword integers.
+* [**PADDSB / PADDSW**](http://www.felixcloutier.com/x86/PADDSB:PADDSW.html) — Add packed signed byte/word integers with signed saturation.
+* [**PADDUSB / PADDUSW**](http://www.felixcloutier.com/x86/PADDUSB:PADDUSW.html) — Add packed unsigned byte/word integers with unsigned saturation.
+* [**PSUBB / PSUBW / PSUBD**](http://www.felixcloutier.com/x86/PSUBB:PSUBW:PSUBD.html) — Subtract packed byte/word/doubleword integers.
+* [**PSUBSB / PSUBSW**](http://www.felixcloutier.com/x86/PSUBSB:PSUBSW.html) — Subtract packed signed byte/word integers with signed saturation.
+* [**PSUBUSB / PSUBUSW**](http://www.felixcloutier.com/x86/PSUBUSB:PSUBUSW.html) — Subtract packed unsigned byte/word integers with unsigned saturation.
+* [**PMULHW**](http://www.felixcloutier.com/x86/PMULHW.html) — Multiply packed signed word integers and store high result.
+* [**PMULLW**](http://www.felixcloutier.com/x86/PMULLW.html) — Multiply packed signed word integers and store low result.
+* [**PMADDWD**](http://www.felixcloutier.com/x86/PMADDWD.html) — Multiply and add packed word integers.
+
+## MMX comparison instructions
+
+* [**PCMPEQB / PCMPEQW / PCMPEQD**](http://www.felixcloutier.com/x86/PCMPEQB:PCMPEQW:PCMPEQD.html) — Compare packed bytes/words/doublewords for equal.
+* [**PCMPGTB / PCMPGTW / PCMPGTD**](http://www.felixcloutier.com/x86/PCMPGTB:PCMPGTW:PCMPGTD.html) — Compare packed signed byte/word/doubleword integers for greater than.
+
+## MMX logical instructions
+
+* [**PAND**](http://www.felixcloutier.com/x86/PAND.html) — Bitwise logical AND.
+* [**PANDN**](http://www.felixcloutier.com/x86/PANDN.html) — Bitwise logical AND NOT.
+* [**POR**](http://www.felixcloutier.com/x86/POR.html) — Bitwise logical OR.
+* [**PXOR**](http://www.felixcloutier.com/x86/PXOR.html) — Bitwise logical exclusive OR.
+
+## MMX shift and rotate instructions
+
+* [**PSLLW / PSLLD / PSLLQ**](http://www.felixcloutier.com/x86/PSLLW:PSLLD:PSLLQ.html) — Shift packed words/doublewords/quadwords left logical.
+* [**PSRLW / PSRLD / PSRLQ**](http://www.felixcloutier.com/x86/PSRLW:PSRLD:PSRLQ.html) — Shift packed words/doublewords/quadwords right logical.
+* [**PSRAW / PSRAD**](http://www.felixcloutier.com/x86/PSRAW:PSRAD:PSRAQ.html) — Shift packed words/doublewords right arithmetic.
+
+## MMX state management instructions
+
+* [**EMMS**](http://www.felixcloutier.com/x86/EMMS.html) — Empty MMX state.
+
+## SSE overview
+
+* Expand the SIMD execution model by adding facilities for handling packed and scalar single-precision floating-point values contained in 128-bit registers.
+* Sixteen (eight for 32-bit mode) new 128-bit packed single-precision floating-point XMM registers available.
+* 128-bit packed and scalar single-precision floating-point instructions.
+* Enhancements to MMX instruction set with new operations on packed integer operands located in MMX registers.
+* Explicit prefetching of data, control of the cacheability of data, control of the
+ordering of store operations.
+
+## SSE data transfer instructions
+
+* [**MOVAPS**](http://www.felixcloutier.com/x86/MOVAPS.html) — Move four aligned packed single-precision floating-point values between XMM registers or between XMM register and memory.
+* [**MOVUPS**](http://www.felixcloutier.com/x86/MOVUPS.html) — Move four unaligned packed single-precision floating-point values between XMM registers or between XMM register and memory.
+* [**MOVHPS**](http://www.felixcloutier.com/x86/MOVHPS.html) — Move two packed single-precision floating-point values to and from the high quadword of an XMM register and memory.
+* [**MOVHLPS**](http://www.felixcloutier.com/x86/MOVHLPS.html) — Move two packed single-precision floating-point values from the high quadword of an XMM register to the low quadword of another XMM register.
+* [**MOVLPS**](http://www.felixcloutier.com/x86/MOVLPS.html) — Move two packed single-precision floating-point values to and from the low quadword of an XMM register and memory.
+* [**MOVLHPS**](http://www.felixcloutier.com/x86/MOVLHPS.html) — Move two packed single-precision floating-point values from the low quadword of an XMM register to the high quadword of another XMM register.
+* [**MOVMSKPS**](http://www.felixcloutier.com/x86/MOVMSKPS.html) — Extract sign mask from four packed single-precision floating-point values.
+* [**MOVSS**](http://www.felixcloutier.com/x86/MOVSS.html) — Move scalar single-precision floating-point value between XMM registers or between an XMM register and memory.
+
+## SSE packed arithmetic instructions
+
+* [**ADDPS**](http://www.felixcloutier.com/x86/ADDPS.html) — Add packed single-precision floating-point values.
+* [**ADDSS**](http://www.felixcloutier.com/x86/ADDSS.html) — Add scalar single-precision floating-point values.
+* [**SUBPS**](http://www.felixcloutier.com/x86/SUBPS.html) — Subtract packed single-precision floating-point values.
+* [**SUBSS**](http://www.felixcloutier.com/x86/SUBSS.html) — Subtract scalar single-precision floating-point values.
+* [**MULPS**](http://www.felixcloutier.com/x86/MULPS.html) — Multiply packed single-precision floating-point values.
+* [**MULSS**](http://www.felixcloutier.com/x86/MULSS.html) — Multiply scalar single-precision floating-point values.
+* [**DIVPS**](http://www.felixcloutier.com/x86/DIVPS.html) — Divide packed single-precision floating-point values.
+* [**DIVSS**](http://www.felixcloutier.com/x86/DIVSS.html) — Divide scalar single-precision floating-point values.
+* [**RCPPS**](http://www.felixcloutier.com/x86/RCPPS.html) — Compute reciprocals of packed single-precision floating-point values.
+* [**RCPSS**](http://www.felixcloutier.com/x86/RCPSS.html) — Compute reciprocal of scalar single-precision floating-point values.
+* [**SQRTPS**](http://www.felixcloutier.com/x86/SQRTPS.html) — Compute square roots of packed single-precision floating-point values.
+* [**SQRTSS**](http://www.felixcloutier.com/x86/SQRTSS.html) — Compute square root of scalar single-precision floating-point values.
+* [**RSQRTPS**](http://www.felixcloutier.com/x86/RSQRTPS.html) — Compute reciprocals of square roots of packed single-precision floating-point values.
+* [**RSQRTSS**](http://www.felixcloutier.com/x86/RSQRTSS.html) — Compute reciprocal of square root of scalar single-precision floating-point values.
+* [**MAXPS**](http://www.felixcloutier.com/x86/MAXPS.html) — Return maximum packed single-precision floating-point values.
+* [**MAXSS**](http://www.felixcloutier.com/x86/MAXSS.html) — Return maximum scalar single-precision floating-point values.
+* [**MINPS**](http://www.felixcloutier.com/x86/MINPS.html) — Return minimum packed single-precision floating-point values.
+* [**MINSS**](http://www.felixcloutier.com/x86/MINSS.html) — Return minimum scalar single-precision floating-point values.
+
+## SSE comparison instructions
+
+* [**CMPPS**](http://www.felixcloutier.com/x86/CMPPS.html) — Compare packed single-precision floating-point values.
+* [**CMPSS**](http://www.felixcloutier.com/x86/CMPSS.html) — Compare scalar single-precision floating-point values.
+* [**COMISS**](http://www.felixcloutier.com/x86/COMISS.html) — Perform ordered comparison of scalar single-precision floating-point values and set flags in `rflags` register.
+* [**UCOMISS**](http://www.felixcloutier.com/x86/UCOMISS.html) — Perform unordered comparison of scalar single-precision floating-point values and set flags in `rflags` register.
+
+## SSE logical instructions
+
+* [**ANDPS**](http://www.felixcloutier.com/x86/ANDPS.html) — Perform bitwise logical AND of packed single-precision floating-point values.
+* [**ANDNPS**](http://www.felixcloutier.com/x86/ANDNPS.html) — Perform bitwise logical AND NOT of packed single-precision floating-point values.
+* [**ORPS**](http://www.felixcloutier.com/x86/ORPS.html) — Perform bitwise logical OR of packed single-precision floating-point values.
+* [**XORPS**](http://www.felixcloutier.com/x86/XORPS.html) — Perform bitwise logical XOR of packed single-precision floating-point values.
+
+## SSE shuffle and unpack instructions
+
+* [**SHUFPS**](http://www.felixcloutier.com/x86/SHUFPS.html) — Shuffles values in packed single-precision floating-point operands.
+* [**UNPCKHPS**](http://www.felixcloutier.com/x86/UNPCKHPS.html) — Unpacks and interleaves the two high-order values from two single-precision floating-point operands.
+* [**UNPCKLPS**](http://www.felixcloutier.com/x86/UNPCKLPS.html) — Unpacks and interleaves the two low-order values from two single-precision floating-point operands.
+
+## SSE conversion instructions
+
+* [**CVTPI2PS**](http://www.felixcloutier.com/x86/CVTPI2PS.html) — Convert packed doubleword integers to packed single-precision floating-point values.
+* [**CVTSI2SS**](http://www.felixcloutier.com/x86/CVTSI2SS.html) — Convert doubleword integer to scalar single-precision floating-point value.
+* [**CVTPS2PI**](http://www.felixcloutier.com/x86/CVTPS2PI.html) — Convert packed single-precision floating-point values to packed doubleword integers.
+* [**CVTTPS2PI**](http://www.felixcloutier.com/x86/CVTTPS2PI.html) — Convert with truncation packed single-precision floating-point values to packed doubleword integers.
+* [**CVTSS2SI**](http://www.felixcloutier.com/x86/CVTSS2SI.html) — Convert a scalar single-precision floating-point value to a doubleword integer.
+* [**CVTTSS2SI**](http://www.felixcloutier.com/x86/CVTTSS2SI.html) — Convert with truncation a scalar single-precision floating-point value to a scalar doubleword integer.
+
+## SSE MXCSR management instructions
+
+* [**LDMXCSR**](http://www.felixcloutier.com/x86/LDMXCSR.html) — Load MXCSR register.
+* [**STMXCSR**](http://www.felixcloutier.com/x86/STMXCSR.html) — Save MXCSR register state.
+
+## SSE 64-bit integer instructions (MMX enhancements)
+
+* [**PAVGB / PAVGW**](http://www.felixcloutier.com/x86/PAVGB:PAVGW.html) — Compute average of packed unsigned byte/word integers.
+* [**PEXTRW**](http://www.felixcloutier.com/x86/PEXTRW.html) — Extract word.
+* [**PINSRW**](http://www.felixcloutier.com/x86/PINSRW.html) — Insert word.
+* [**PMAXUB**](http://www.felixcloutier.com/x86/PMAXUB:PMAXUW.html) — Maximum of packed unsigned byte integers.
+* [**PMAXSW**](http://www.felixcloutier.com/x86/PMAXSB:PMAXSW:PMAXSD:PMAXSQ.html) — Maximum of packed signed word integers.
+* [**PMINUB**](http://www.felixcloutier.com/x86/PMINUB:PMINUW.html) — Minimum of packed unsigned byte integers.
+* [**PMINSW**](http://www.felixcloutier.com/x86/PMINSB:PMINSW.html) — Minimum of packed signed word integers.
+* [**PMOVMSKB**](http://www.felixcloutier.com/x86/PMOVMSKB.html) — Move byte mask.
+* [**PMULHUW**](http://www.felixcloutier.com/x86/PMULHUW.html) — Multiply packed unsigned integers and store high result.
+* [**PSADBW**](http://www.felixcloutier.com/x86/PSADBW.html) — Compute sum of absolute differences.
+* [**PSHUFW**](http://www.felixcloutier.com/x86/PSHUFW.html) — Shuffle packed integer word in MMX register.
+
+## SSE cacheability control, prefetch and ordering instructions
+
+* [**MASKMOVQ**](http://www.felixcloutier.com/x86/MASKMOVQ.html) — Non-temporal store of selected bytes from an MMX register into memory.
+* [**MOVNTQ**](http://www.felixcloutier.com/x86/MOVNTQ.html) — Non-temporal store of quadword from an MMX register into memory.
+* [**MOVNTPS**](http://www.felixcloutier.com/x86/MOVNTPS.html) — Non-temporal store of four packed single-precision floating-point values from an XMM register into memory.
+* [**PREFETCHh**](http://www.felixcloutier.com/x86/PREFETCHh.html) — Load 32 or more bytes from memory to a selected level of the processor’s cache hierarchy.
+* [**SFENCE**](http://www.felixcloutier.com/x86/SFENCE.html) — Serializes store operations.
+
+## SSE2 overview
+
+* Packed and scalar 128-bit double-precision floating-point instructions.
+* Additional 64-bit and 128-bit packed byte/word/doubleword/quadword integer instructions.
+* 128-bit versions of integer instructions introduced with MMX and SSE.
+* Additional cacheability-control and instruction-ordering instructions.
+
+## SSE2 FP64 data movement instructions
+
+* [**MOVAPD**](http://www.felixcloutier.com/x86/MOVAPD.html) — Move two aligned packed double-precision floating-point values between XMM registers or between an XMM register and memory.
+* [**MOVUPD**](http://www.felixcloutier.com/x86/MOVUPD.html) — Move two unaligned packed double-precision floating-point values between XMM registers or between an XMM register and memory.
+* [**MOVHPD**](http://www.felixcloutier.com/x86/MOVHPD.html) — Move high packed double-precision floating-point value to and from the high quadword of an XMM register and memory.
+* [**MOVLPD**](http://www.felixcloutier.com/x86/MOVLPD.html) — Move low packed double-precision floating-point value to and from the low quadword of an XMM register and memory.
+* [**MOVMSKPD**](http://www.felixcloutier.com/x86/MOVMSKPD.html) — Extract sign mask from two packed double-precision floating-point values.
+* [**MOVSD**](http://www.felixcloutier.com/x86/MOVSD.html) — Move scalar double-precision floating-point value between XMM registers or between an XMM register and memory.
+
+## SSE2 FP64 packed arithmetic instructions
+
+* [**ADDPD**](http://www.felixcloutier.com/x86/ADDPD.html) — Add packed double-precision floating-point values.
+* [**ADDSD**](http://www.felixcloutier.com/x86/ADDSD.html) — Add scalar double-precision floating-point values.
+* [**SUBPD**](http://www.felixcloutier.com/x86/SUBPD.html) — Subtract packed double-precision floating-point values.
+* [**SUBSD**](http://www.felixcloutier.com/x86/SUBSD.html) — Subtract scalar double-precision floating-point values.
+* [**MULPD**](http://www.felixcloutier.com/x86/MULPD.html) — Multiply packed double-precision floating-point values.
+* [**MULSD**](http://www.felixcloutier.com/x86/MULSD.html) — Multiply scalar double-precision floating-point values.
+* [**DIVPD**](http://www.felixcloutier.com/x86/DIVPD.html) — Divide packed double-precision floating-point values.
+* [**DIVSD**](http://www.felixcloutier.com/x86/DIVSD.html) — Divide scalar double-precision floating-point values.
+* [**SQRTPD**](http://www.felixcloutier.com/x86/SQRTPD.html) — Compute packed square roots of packed double-precision floating-point values.
+* [**SQRTSD**](http://www.felixcloutier.com/x86/SQRTSD.html) — Compute scalar square root of scalar double-precision floating-point values.
+* [**MAXPD**](http://www.felixcloutier.com/x86/MAXPD.html) — Return maximum packed double-precision floating-point values.
+* [**MAXSD**](http://www.felixcloutier.com/x86/MAXSD.html) — Return maximum scalar double-precision floating-point values.
+* [**MINPD**](http://www.felixcloutier.com/x86/MINPD.html) — Return minimum packed double-precision floating-point values.
+* [**MINSD**](http://www.felixcloutier.com/x86/MINSD.html) — Return minimum scalar double-precision floating-point values.
+
+## SSE2 FP64 logical instructions
+
+* [**ANDPD**](http://www.felixcloutier.com/x86/ANDPD.html) — Perform bitwise logical AND of packed double-precision floating-point values.
+* [**ANDNPD**](http://www.felixcloutier.com/x86/ANDNPD.html) — Perform bitwise logical AND NOT of packed double-precision floating-point values.
+* [**ORPD**](http://www.felixcloutier.com/x86/ORPD.html) — Perform bitwise logical OR of packed double-precision floating-point values.
+* [**XORPD**](http://www.felixcloutier.com/x86/XORPD.html) — Perform bitwise logical XOR of packed double-precision floating-point values.
+
+## SSE2 FP64 compare instructions
+
+* [**CMPPD**](http://www.felixcloutier.com/x86/CMPPD.html) — Compare packed double-precision floating-point values.
+* [**CMPSD**](http://www.felixcloutier.com/x86/CMPSD.html) — Compare scalar double-precision floating-point values.
+* [**COMISD**](http://www.felixcloutier.com/x86/COMISD.html) — Perform ordered comparison of scalar double-precision floating-point values and set flags in `rflags` register.
+* [**UCOMISD**](http://www.felixcloutier.com/x86/UCOMISD.html) — Perform unordered comparison of scalar double-precision floating-point values and set flags in `rflags` register.
+
+## SSE2 FP64 shuffle and unpack instructions
+
+* [**SHUFPD**](http://www.felixcloutier.com/x86/SHUFPD.html) — Shuffles values in packed double-precision floating-point operands.
+* [**UNPCKHPD**](http://www.felixcloutier.com/x86/UNPCKHPD.html) — Unpacks and interleaves the high values from two packed double-precision floating-point operands.
+* [**UNPCKLPD**](http://www.felixcloutier.com/x86/UNPCKLPD.html) — Unpacks and interleaves the low values from two packed double-precision floating-point operands.
+
+## SSE2 FP64 conversion instructions
+
+* [**CVTPD2PI**](http://www.felixcloutier.com/x86/CVTPD2PI.html) — Convert packed double-precision floating-point values to packed doubleword integers.
+* [**CVTTPD2PI**](http://www.felixcloutier.com/x86/CVTTPD2PI.html) — Convert with truncation packed double-precision floating-point values to packed doubleword integers.
+* [**CVTPI2PD**](http://www.felixcloutier.com/x86/CVTPI2PD.html) — Convert packed doubleword integers to packed double-precision floating-point values.
+* [**CVTPD2DQ**](http://www.felixcloutier.com/x86/CVTPD2DQ.html) — Convert packed double-precision floating-point values to packed doubleword integers.
+* [**CVTTPD2DQ**](http://www.felixcloutier.com/x86/CVTTPD2DQ.html) — Convert with truncation packed double-precision floating-point values to packed doubleword integers.
+* [**CVTDQ2PD**](http://www.felixcloutier.com/x86/CVTDQ2PD.html) — Convert packed doubleword integers to packed double-precision floating-point values.
+* [**CVTPS2PD**](http://www.felixcloutier.com/x86/CVTPS2PD.html) — Convert packed single-precision floating-point values to packed double-precision floating-point values.
+* [**CVTPD2PS**](http://www.felixcloutier.com/x86/CVTPD2PS.html) — Convert packed double-precision floating-point values to packed single-precision floating-point values.
+* [**CVTSS2SD**](http://www.felixcloutier.com/x86/CVTSS2SD.html) — Convert scalar single-precision floating-point values to scalar double-precision floating-point values.
+* [**CVTSD2SS**](http://www.felixcloutier.com/x86/CVTSD2SS.html) — Convert scalar double-precision floating-point values to scalar single-precision floating-point values.
+* [**CVTSD2SI**](http://www.felixcloutier.com/x86/CVTSD2SI.html) — Convert scalar double-precision floating-point values to a doubleword integer.
+* [**CVTTSD2SI**](http://www.felixcloutier.com/x86/CVTTSD2SI.html) — Convert with truncation scalar double-precision floating-point values to scalar doubleword integers.
+* [**CVTSI2SD**](http://www.felixcloutier.com/x86/CVTSI2SD.html) — Convert doubleword integer to scalar double-precision floating-point value.
+
+## SSE2 FP32 instructions (SSE enhancements)
+
+* [**CVTDQ2PS**](http://www.felixcloutier.com/x86/CVTDQ2PS.html) — Convert packed doubleword integers to packed single-precision floating-point values.
+* [**CVTPS2DQ**](http://www.felixcloutier.com/x86/CVTPS2DQ.html) — Convert packed single-precision floating-point values to packed doubleword integers.
+* [**CVTTPS2DQ**](http://www.felixcloutier.com/x86/CVTTPS2DQ.html) — Convert with truncation packed single-precision floating-point values to packed doubleword integers.
+
+## SSE2 integer instructions
+
+* [**MOVDQA**](http://www.felixcloutier.com/x86/MOVDQA:VMOVDQA32:VMOVDQA64.html) — Move aligned double quadword.
+* [**MOVDQU**](http://www.felixcloutier.com/x86/MOVDQU:VMOVDQU8:VMOVDQU16:VMOVDQU32:VMOVDQU64.html) — Move unaligned double quadword.
+* [**MOVQ2DQ**](http://www.felixcloutier.com/x86/MOVQ2DQ.html) — Move quadword integer from MMX to XMM registers.
+* [**MOVDQ2Q**](http://www.felixcloutier.com/x86/MOVDQ2Q.html) — Move quadword integer from XMM to MMX registers.
+* [**PMULUDQ**](http://www.felixcloutier.com/x86/PMULUDQ.html) — Multiply packed unsigned doubleword integers.
+* [**PADDQ**](http://www.felixcloutier.com/x86/PADDB:PADDW:PADDD:PADDQ.html) — Add packed quadword integers.
+* [**PSUBQ**](http://www.felixcloutier.com/x86/PSUBQ.html) — Subtract packed quadword integers.
+* [**PSHUFLW**](http://www.felixcloutier.com/x86/PSHUFLW.html) — Shuffle packed low words.
+* [**PSHUFHW**](http://www.felixcloutier.com/x86/PSHUFHW.html) — Shuffle packed high words.
+* [**PSHUFD**](http://www.felixcloutier.com/x86/PSHUFD.html) — Shuffle packed doublewords.
+* [**PSLLDQ**](http://www.felixcloutier.com/x86/PSLLDQ.html) — Shift double quadword left logical.
+* [**PSRLDQ**](http://www.felixcloutier.com/x86/PSRLDQ.html) — Shift double quadword right logical.
+* [**PUNPCKHQDQ**](http://www.felixcloutier.com/x86/PUNPCKHBW:PUNPCKHWD:PUNPCKHDQ:PUNPCKHQDQ.html) — Unpack high quadwords.
+* [**PUNPCKLQDQ**](http://www.felixcloutier.com/x86/PUNPCKLBW:PUNPCKLWD:PUNPCKLDQ:PUNPCKLQDQ.html) — Unpack low quadwords.
+
+## SSE2 cacheability control and ordering instructions
+
+* [**CLFLUSH**](http://www.felixcloutier.com/x86/CLFLUSH.html) — Flush cacheline.
+* [**LFENCE**](http://www.felixcloutier.com/x86/LFENCE.html) — Serializes load operations.
+* [**MFENCE**](http://www.felixcloutier.com/x86/MFENCE.html) — Serializes load and store operations.
+* [**PAUSE**](http://www.felixcloutier.com/x86/PAUSE.html) — Improves the performance of “spin-wait loops”.
+* [**MASKMOVDQU**](http://www.felixcloutier.com/x86/MASKMOVDQU.html) — Non-temporal store of selected bytes from an XMM register into memory.
+* [**MOVNTPD**](http://www.felixcloutier.com/x86/MOVNTPD.html) — Non-temporal store of two packed double-precision floating-point values from an XMM register into memory.
+* [**MOVNTDQ**](http://www.felixcloutier.com/x86/MOVNTDQ.html) — Non-temporal store of double quadword from an XMM register into memory.
+* [**MOVNTI**](http://www.felixcloutier.com/x86/MOVNTI.html) — Non-temporal store of a doubleword from a general-purpose register into memory.
+
+## References
+
+* https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf
+* http://www.agner.org/optimize/optimizing_assembly.pdf
+* https://www.nasm.us/xdoc/2.13.03/nasmdoc.pdf
+* https://godbolt.org/
+* https://www.lri.fr/~filliatr/ens/compil/x86-64.pdf
+* https://0xax.github.io/categories/assembler/
+
+## Instruction tables
+
+* http://www.agner.org/optimize/instruction_tables.pdf
+
+## Examples
+
+* https://github.com/torvalds/linux/tree/master/arch/x86
+* https://gist.github.com/rygorous/bf1659bf6cd1752ed114367d4b87b302
+* https://www.csee.umbc.edu/portal/help/nasm/sample_64.shtml
+
+## Utils
+
+* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+* https://git.ffmpeg.org/gitweb/ffmpeg.git/blob_plain/HEAD:/libavutil/x86/x86inc.asm
+* https://gist.github.com/rygorous/f729919ff64526a46e591d8f8b52058e