Project

General

Profile

nbnxn_sum.s

Berk Hess, 05/16/2019 01:52 PM

 
# Listing header emitted by GCC (-fverbose-asm): the translation unit name,
# the compiler version, the options passed on the command line and the
# optimisation/target flags that were enabled for this compilation.
# (Viewer line numbers that were interleaved with the listing have been
# removed so that the file is valid assembler input again.)
        .file   "nbnxn_sum.cpp"
# GNU C++11 (Ubuntu 7.3.0-27ubuntu1~18.04) version 7.3.0 (x86_64-linux-gnu)
#       compiled by GNU C version 7.3.0, GMP version 6.1.2, MPFR version 4.0.1, MPC version 1.1.0, isl version isl-0.19-GMP

# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  -I /nethome/hess/gmx/obj/g_avx512/src
# -I /nethome/hess/gmx/src -I /opt/tcbsys/cuda/10.1/include
# -I /nethome/hess/gmx/src/external/lmfit -imultiarch x86_64-linux-gnu
# -D_GNU_SOURCE -D_REENTRANT -D GMX_DOUBLE=0 -D HAVE_CONFIG_H
# -D USE_STD_INTTYPES_H
# -isystem /nethome/hess/gmx/src/external/thread_mpi/include
# -isystem /nethome/hess/gmx/src/external/tng_io/include
# -isystem /nethome/hess/gmx/obj/g_avx512/tng/include
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp -mavx512f -mfma
# -mtune=generic -march=x86-64
# -auxbase-strip CMakeFiles/libgromacs.dir/mdlib/nbnxn_sum.cpp.o -O3
# -Wundef -Wextra -Wno-missing-field-initializers -Wpointer-arith
# -Wmissing-declarations -Wall -std=c++11 -fverbose-asm -funroll-all-loops
# -fexcess-precision=fast -fopenmp -fstack-protector-strong
# -Wformat-security
# options enabled:  -fPIC -fPIE -faggressive-loop-optimizations
# -falign-labels -fasynchronous-unwind-tables -fauto-inc-dec
# -fbranch-count-reg -fcaller-saves -fchkp-check-incomplete-type
# -fchkp-check-read -fchkp-check-write -fchkp-instrument-calls
# -fchkp-narrow-bounds -fchkp-optimize -fchkp-store-bounds
# -fchkp-use-static-bounds -fchkp-use-static-const-bounds
# -fchkp-use-wrappers -fcode-hoisting -fcombine-stack-adjustments -fcommon
# -fcompare-elim -fcprop-registers -fcrossjumping -fcse-follow-jumps
# -fdefer-pop -fdelete-null-pointer-checks -fdevirtualize
# -fdevirtualize-speculatively -fdwarf2-cfi-asm -fearly-inlining
# -feliminate-unused-debug-types -fexceptions -fexpensive-optimizations
# -fforward-propagate -ffp-int-builtin-inexact -ffunction-cse -fgcse
# -fgcse-after-reload -fgcse-lm -fgnu-runtime -fgnu-unique
# -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion
# -fif-conversion2 -findirect-inlining -finline -finline-atomics
# -finline-functions -finline-functions-called-once
# -finline-small-functions -fipa-bit-cp -fipa-cp -fipa-cp-clone -fipa-icf
# -fipa-icf-functions -fipa-icf-variables -fipa-profile -fipa-pure-const
# -fipa-ra -fipa-reference -fipa-sra -fipa-vrp -fira-hoist-pressure
# -fira-share-save-slots -fira-share-spill-slots
# -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts
# -fleading-underscore -flifetime-dse -flra-remat -flto-odr-type-merging
# -fmath-errno -fmerge-constants -fmerge-debug-strings
# -fmove-loop-invariants -fomit-frame-pointer -foptimize-sibling-calls
# -foptimize-strlen -fpartial-inlining -fpeel-loops -fpeephole -fpeephole2
# -fplt -fpredictive-commoning -fprefetch-loop-arrays -free
# -freg-struct-return -frename-registers -freorder-blocks
# -freorder-functions -frerun-cse-after-loop
# -fsched-critical-path-heuristic -fsched-dep-count-heuristic
# -fsched-group-heuristic -fsched-interblock -fsched-last-insn-heuristic
# -fsched-rank-heuristic -fsched-spec -fsched-spec-insn-heuristic
# -fsched-stalled-insns-dep -fschedule-fusion -fschedule-insns2
# -fsemantic-interposition -fshow-column -fshrink-wrap
# -fshrink-wrap-separate -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-loops -fsplit-paths -fsplit-wide-types -fssa-backprop
# -fssa-phiopt -fstack-protector-strong -fstdarg-opt -fstore-merging
# -fstrict-aliasing -fstrict-overflow -fstrict-volatile-bitfields
# -fsync-libcalls -fthread-jumps -ftoplevel-reorder -ftrapping-math
# -ftree-bit-ccp -ftree-builtin-call-dce -ftree-ccp -ftree-ch
# -ftree-coalesce-vars -ftree-copy-prop -ftree-cselim -ftree-dce
# -ftree-dominator-opts -ftree-dse -ftree-forwprop -ftree-fre
# -ftree-loop-distribute-patterns -ftree-loop-if-convert -ftree-loop-im
# -ftree-loop-ivcanon -ftree-loop-optimize -ftree-loop-vectorize
# -ftree-parallelize-loops= -ftree-partial-pre -ftree-phiprop -ftree-pre
# -ftree-pta -ftree-reassoc -ftree-scev-cprop -ftree-sink
# -ftree-slp-vectorize -ftree-slsr -ftree-sra -ftree-switch-conversion
# -ftree-tail-merge -ftree-ter -ftree-vrp -funit-at-a-time
# -funroll-all-loops -funroll-loops -funswitch-loops -funwind-tables
# -fverbose-asm -fweb -fzero-initialized-in-bss -m128bit-long-double -m64
# -m80387 -malign-stringops -mavx -mavx2 -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mavx512f -mfancy-math-387 -mfma
# -mfp-ret-in-387 -mfxsr -mglibc -mieee-fp -mlong-double-80 -mmmx -mpopcnt
# -mpush-args -mred-zone -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2
# -mssse3 -mstv -mtls-direct-seg-refs -mvzeroupper -mxsave

    
# -----------------------------------------------------------------------
# sumSimpleLists(int numLists, nbnxn_pairlist_t * const *srcSet)
#   exported as _Z14sumSimpleListsiPKP16nbnxn_pairlist_t
#
# Compiler output (GCC 7.3.0, AT&T syntax) for nbnxn_sum.cpp lines 56-64:
#
#     int ncjTotal = 0;
#     for (int s = 0; s < numLists; s++)
#         ncjTotal += srcSet[s]->ncjInUse;
#
# ABI:    System V AMD64 (x86_64-linux-gnu)
# In:     %edi = numLists, %rsi = srcSet (array of list pointers)
# Out:    %eax = ncjTotal
# Field:  ncjInUse is read as a 32-bit value at byte offset 172 of each list.
# Clobb:  rax, rcx, rdx, rsi, r8-r11, zmm0-zmm15, k1-k7, flags.
#         rbx and rbp are saved and restored.
# Stack:  rsp is realigned to 64 bytes and restored through the value
#         saved from r10; no stack locals are used in this listing.
#
# Phases:
#   (a) numLists <= 0: return 0 (.L10).
#   (b) numLists - 1 < max(prolog + 15, 19): scalar loop only (.L11 -> .L3).
#   (c) scalar prologue peels prolog = (-(srcSet / 8)) & 7 elements so the
#       vector loads below can use aligned vmovdqa64.
#   (d) vector loop (.L4, .L7): 16 pointers per iteration, two 8-wide
#       vpgatherqd merged by vshufi32x4 and accumulated with vpaddd;
#       unrolled 8x with a remainder dispatch (.L72 ... .L67).
#   (e) horizontal sum of the 16 lanes (.L6) using tables .LC0-.LC2.
#   (f) scalar epilogue for the remaining niters % 16 elements (.L3, .L9),
#       unrolled 8x with a remainder dispatch (.L66 ... .L61).
#
# Fixes relative to the scraped listing: interleaved viewer line numbers
# removed; unwind annotation in the epilogue corrected (see .L90).
# -----------------------------------------------------------------------
        .text
        .p2align 4,,15
        .globl  _Z14sumSimpleListsiPKP16nbnxn_pairlist_t
        .type   _Z14sumSimpleListsiPKP16nbnxn_pairlist_t, @function
_Z14sumSimpleListsiPKP16nbnxn_pairlist_t:
.LFB2163:
        .cfi_startproc
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        testl   %edi, %edi      # numLists <= 0 ?
        jle     .L10    # yes: return 0 before any frame is built
        movq    %rsi, %rdx      # rdx = srcSet, input of the peel-count computation
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:56: {
        leaq    8(%rsp), %r10   # r10 = entry rsp + 8 (the CFA), needed to undo the realignment
        .cfi_def_cfa 10, 0
        leal    -1(%rdi), %ecx  # ecx = numLists - 1
        shrq    $3, %rdx        # rdx = srcSet / 8
        andq    $-64, %rsp      # realign rsp to a multiple of 64
        movl    $19, %r8d       # lower bound for the threshold computed below
        negq    %rdx    # rdx = -(srcSet / 8)
        pushq   -8(%r10)        # copy the return address onto the realigned stack
        pushq   %rbp    # save callee-saved rbp
        andl    $7, %edx        # edx = prolog_loop_niters = (-(srcSet / 8)) & 7
        leal    15(%rdx), %eax  # eax = prolog_loop_niters + 15
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movq    %rsp, %rbp      # frame pointer for the realigned frame
        pushq   %r10    # keep the CFA value for the epilogue
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        pushq   %rbx    # save callee-saved rbx (used as scratch below)
        .cfi_escape 0x10,0x3,0x2,0x76,0x70
        cmpl    $19, %eax       # unsigned compare with the lower bound
        cmovb   %r8d, %eax      # eax = max(prolog_loop_niters + 15, 19)
        cmpl    %eax, %ecx      # numLists - 1 versus the threshold
        jb      .L11    # below (unsigned): use the scalar loop only
        testl   %edx, %edx      # anything to peel?
        je      .L12    # no: s = 0, ncjTotal = 0
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:60:         ncjTotal += srcSet[s]->ncjInUse;
# Scalar prologue: peel edx (1..7) elements; r9d = ncjTotal, r11d = s on exit.
        movq    (%rsi), %rbx    # srcSet[0]
        cmpl    $1, %edx        # peel count == 1 ?
        movl    172(%rbx), %r9d # ncjTotal = srcSet[0]->ncjInUse
        je      .L13    #
        movq    8(%rsi), %r11   # srcSet[1]
        addl    172(%r11), %r9d # ncjTotal += srcSet[1]->ncjInUse
        cmpl    $2, %edx        # peel count == 2 ?
        je      .L14    #
        movq    16(%rsi), %r10  # srcSet[2]
        addl    172(%r10), %r9d # ncjTotal += srcSet[2]->ncjInUse
        cmpl    $3, %edx        # peel count == 3 ?
        je      .L15    #
        movq    24(%rsi), %rcx  # srcSet[3]
        addl    172(%rcx), %r9d # ncjTotal += srcSet[3]->ncjInUse
        cmpl    $4, %edx        # peel count == 4 ?
        je      .L16    #
        movq    32(%rsi), %rax  # srcSet[4]
        addl    172(%rax), %r9d # ncjTotal += srcSet[4]->ncjInUse
        cmpl    $5, %edx        # peel count == 5 ?
        je      .L17    #
        movq    40(%rsi), %r8   # srcSet[5]
        addl    172(%r8), %r9d  # ncjTotal += srcSet[5]->ncjInUse
        cmpl    $6, %edx        # peel count == 6 ?
        je      .L18    #
        movq    48(%rsi), %rbx  # srcSet[6]
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        movl    $7, %r11d       # s = 7
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:60:         ncjTotal += srcSet[s]->ncjInUse;
        addl    172(%rbx), %r9d # ncjTotal += srcSet[6]->ncjInUse
.L4:
# Vector loop setup and first vector iteration.
# On entry: edx = elements peeled, r11d = s, r9d = scalar partial sum.
        movl    %edi, %r10d     # r10d = numLists
        leaq    (%rsi,%rdx,8), %rbx     # rbx = &srcSet[prolog_loop_niters]
        movl    $1, %ecx        # ecx = vector iterations issued (this one)
        subl    %edx, %r10d     # r10d = niters = numLists - prolog_loop_niters
        movl    $-1, %edx       # all-ones pattern for the gather masks
        movl    %r10d, %eax     # eax = niters
        vmovdqa64       (%rbx), %zmm0   # 8 list pointers (aligned 64-byte load)
        kmovw   %edx, %k2       # mask for the first gather
        shrl    $4, %eax        # eax = bnd = niters / 16 vector iterations
        vmovdqa64       64(%rbx), %zmm1 # next 8 list pointers
        kmovw   %edx, %k3       # mask for the second gather
        leal    -1(%rax), %r8d  # r8d = bnd - 1 iterations still to do
        vpgatherqd      172(,%zmm0,1), %ymm2{%k2}       # 8 x ncjInUse, address = pointer + 172
        kmovw   %edx, %k1       # k1 stays all-ones; every gather gets a fresh copy (a gather zeroes its mask)
        vpgatherqd      172(,%zmm1,1), %ymm3{%k3}       # 8 x ncjInUse for the second group
        leaq    128(%rbx), %rdx # rdx = address of the next 16 pointers
        andl    $7, %r8d        # r8d = (bnd - 1) % 8: iterations ahead of the 8x-unrolled loop
        cmpl    $1, %eax        # only one vector iteration in total?
        vshufi32x4      $68, %zmm3, %zmm2, %zmm2        # zmm2 = low halves of zmm2:zmm3 = 16-lane accumulator
        jbe     .L6     # yes: go to the reduction
# Remainder dispatch: run (bnd - 1) % 8 single iterations, entering the
# chain below at the matching label, then continue in the 8x loop (.L7).
        testl   %r8d, %r8d      #
        je      .L7     #
        cmpl    $1, %r8d        #
        je      .L67    #
        cmpl    $2, %r8d        #
        je      .L68    #
        cmpl    $3, %r8d        #
        je      .L69    #
        cmpl    $4, %r8d        #
        je      .L70    #
        cmpl    $5, %r8d        #
        je      .L71    #
        cmpl    $6, %r8d        #
        je      .L72    #
# Remainder of 7: fall through the whole chain.
        vmovdqa64       (%rdx), %zmm4   # pointers 0-7 of this group
        kmovw   %k1, %k4        # fresh all-ones masks
        kmovw   %k1, %k5        #
        vmovdqa64       64(%rdx), %zmm6 # pointers 8-15 of this group
        leaq    256(%rbx), %rdx # advance to the following group
        movl    $2, %ecx        # two vector iterations issued so far
        vpgatherqd      172(,%zmm4,1), %ymm5{%k4}       # gather ncjInUse
        vpgatherqd      172(,%zmm6,1), %ymm7{%k5}       #
        vshufi32x4      $68, %zmm7, %zmm5, %zmm8        # merge both gathers into 16 lanes
        vpaddd  %zmm8, %zmm2, %zmm2     # accumulate
.L72:
        vmovdqa64       (%rdx), %zmm9   #
        kmovw   %k1, %k6        #
        kmovw   %k1, %k7        #
        vmovdqa64       64(%rdx), %zmm11        #
        addl    $1, %ecx        # one more vector iteration
        subq    $-128, %rdx     # rdx += 128
        vpgatherqd      172(,%zmm9,1), %ymm10{%k6}      #
        vpgatherqd      172(,%zmm11,1), %ymm12{%k7}     #
        vshufi32x4      $68, %zmm12, %zmm10, %zmm13     #
        vpaddd  %zmm13, %zmm2, %zmm2    #
.L71:
        vmovdqa64       (%rdx), %zmm14  #
        kmovw   %k1, %k2        #
        kmovw   %k1, %k3        #
        vmovdqa64       64(%rdx), %zmm0 #
        addl    $1, %ecx        #
        subq    $-128, %rdx     #
        vpgatherqd      172(,%zmm14,1), %ymm15{%k2}     #
        vpgatherqd      172(,%zmm0,1), %ymm1{%k3}       #
        vshufi32x4      $68, %zmm1, %zmm15, %zmm3       #
        vpaddd  %zmm3, %zmm2, %zmm2     #
.L70:
        vmovdqa64       (%rdx), %zmm4   #
        kmovw   %k1, %k4        #
        kmovw   %k1, %k5        #
        vmovdqa64       64(%rdx), %zmm6 #
        addl    $1, %ecx        #
        subq    $-128, %rdx     #
        vpgatherqd      172(,%zmm4,1), %ymm5{%k4}       #
        vpgatherqd      172(,%zmm6,1), %ymm7{%k5}       #
        vshufi32x4      $68, %zmm7, %zmm5, %zmm8        #
        vpaddd  %zmm8, %zmm2, %zmm2     #
.L69:
        vmovdqa64       (%rdx), %zmm9   #
        kmovw   %k1, %k6        #
        kmovw   %k1, %k7        #
        vmovdqa64       64(%rdx), %zmm11        #
        addl    $1, %ecx        #
        subq    $-128, %rdx     #
        vpgatherqd      172(,%zmm9,1), %ymm10{%k6}      #
        vpgatherqd      172(,%zmm11,1), %ymm12{%k7}     #
        vshufi32x4      $68, %zmm12, %zmm10, %zmm13     #
        vpaddd  %zmm13, %zmm2, %zmm2    #
.L68:
        vmovdqa64       (%rdx), %zmm14  #
        kmovw   %k1, %k2        #
        kmovw   %k1, %k3        #
        vmovdqa64       64(%rdx), %zmm0 #
        addl    $1, %ecx        #
        subq    $-128, %rdx     #
        vpgatherqd      172(,%zmm14,1), %ymm15{%k2}     #
        vpgatherqd      172(,%zmm0,1), %ymm1{%k3}       #
        vshufi32x4      $68, %zmm1, %zmm15, %zmm3       #
        vpaddd  %zmm3, %zmm2, %zmm2     #
.L67:
        vmovdqa64       (%rdx), %zmm4   #
        kmovw   %k1, %k4        #
        kmovw   %k1, %k5        #
        vmovdqa64       64(%rdx), %zmm6 #
        addl    $1, %ecx        #
        subq    $-128, %rdx     #
        vpgatherqd      172(,%zmm4,1), %ymm5{%k4}       #
        cmpl    %ecx, %eax      # bnd versus iterations issued (flags consumed by jbe below)
        vpgatherqd      172(,%zmm6,1), %ymm7{%k5}       #
        vshufi32x4      $68, %zmm7, %zmm5, %zmm8        #
        vpaddd  %zmm8, %zmm2, %zmm2     #
        jbe     .L6     # all vector iterations done
.L7:
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:60:         ncjTotal += srcSet[s]->ncjInUse;
# Main vector loop, 8 vector iterations (128 pointers, 1024 bytes) per trip.
# The running sum moves through zmm2 -> zmm14 -> zmm5 -> zmm10 -> zmm4 ->
# zmm7 -> zmm12 -> zmm0 and is back in zmm2 at the bottom of the loop.
        vmovdqa64       (%rdx), %zmm9   # pointers at +0
        kmovw   %k1, %k6        # reload masks from the all-ones copy in k1
        kmovw   %k1, %k7        #
        vmovdqa64       64(%rdx), %zmm11        # pointers at +64
        kmovw   %k1, %k2        #
        kmovw   %k1, %k3        #
        vpgatherqd      172(,%zmm9,1), %ymm10{%k6}      # values for +0
        vmovdqa64       128(%rdx), %zmm15       # pointers at +128
        kmovw   %k1, %k4        #
        vpgatherqd      172(,%zmm11,1), %ymm12{%k7}     # values for +64
        vmovdqa64       192(%rdx), %zmm3        # pointers at +192
        kmovw   %k1, %k5        #
        vpgatherqd      172(,%zmm15,1), %ymm0{%k2}      # values for +128
        vmovdqa64       256(%rdx), %zmm6        # pointers at +256
        kmovw   %k1, %k6        #
        vshufi32x4      $68, %zmm12, %zmm10, %zmm13     # merge +0 / +64
        vpgatherqd      172(,%zmm3,1), %ymm1{%k3}       # values for +192
        vmovdqa64       320(%rdx), %zmm8        # pointers at +320
        vpgatherqd      172(,%zmm6,1), %ymm7{%k4}       # values for +256
        vmovdqa64       384(%rdx), %zmm11       # pointers at +384
        kmovw   %k1, %k7        #
        vshufi32x4      $68, %zmm1, %zmm0, %zmm4        # merge +128 / +192
        vmovdqa64       512(%rdx), %zmm3        # pointers at +512
        kmovw   %k1, %k2        #
        vpaddd  %zmm13, %zmm2, %zmm14   # sum += group 1
        vpgatherqd      172(,%zmm8,1), %ymm2{%k5}       # values for +320
        vmovdqa64       448(%rdx), %zmm13       # pointers at +448
        vpgatherqd      172(,%zmm11,1), %ymm12{%k6}     # values for +384
        vmovdqa64       640(%rdx), %zmm8        # pointers at +640
        vpgatherqd      172(,%zmm3,1), %ymm0{%k2}       # values for +512
        vpaddd  %zmm4, %zmm14, %zmm5    # sum += group 2
        vshufi32x4      $68, %zmm2, %zmm7, %zmm9        # merge +256 / +320
        vpgatherqd      172(,%zmm13,1), %ymm14{%k7}     # values for +448
        kmovw   %k1, %k3        #
        kmovw   %k1, %k4        #
        kmovw   %k1, %k5        #
        vshufi32x4      $68, %zmm14, %zmm12, %zmm15     # merge +384 / +448
        vpgatherqd      172(,%zmm8,1), %ymm2{%k4}       # values for +640
        kmovw   %k1, %k6        #
        vpaddd  %zmm9, %zmm5, %zmm10    # sum += group 3
        vmovdqa64       576(%rdx), %zmm5        # pointers at +576
        kmovw   %k1, %k7        #
        vmovdqa64       704(%rdx), %zmm9        # pointers at +704
        kmovw   %k1, %k2        #
        addl    $8, %ecx        # eight more vector iterations issued
        vpaddd  %zmm15, %zmm10, %zmm4   # sum += group 4
        vpgatherqd      172(,%zmm5,1), %ymm1{%k3}       # values for +576
        vmovdqa64       768(%rdx), %zmm13       # pointers at +768
        vpgatherqd      172(,%zmm9,1), %ymm10{%k5}      # values for +704
        vmovdqa64       832(%rdx), %zmm15       # pointers at +832
        kmovw   %k1, %k3        #
        vshufi32x4      $68, %zmm1, %zmm0, %zmm6        # merge +512 / +576
        vmovdqa64       896(%rdx), %zmm5        # pointers at +896
        vpgatherqd      172(,%zmm13,1), %ymm14{%k6}     # values for +768
        vshufi32x4      $68, %zmm10, %zmm2, %zmm11      # merge +640 / +704
        addq    $1024, %rdx     # advance to the next 128 pointers
        vpaddd  %zmm6, %zmm4, %zmm7     # sum += group 5
        vpgatherqd      172(,%zmm15,1), %ymm4{%k7}      # values for +832
        vpgatherqd      172(,%zmm5,1), %ymm6{%k2}       # values for +896
        vpaddd  %zmm11, %zmm7, %zmm12   # sum += group 6
        vmovdqa64       -64(%rdx), %zmm7        # pointers at +960 (rdx already advanced)
        cmpl    %ecx, %eax      # bnd versus iterations issued (flags consumed by ja below)
        vshufi32x4      $68, %zmm4, %zmm14, %zmm3       # merge +768 / +832
        vpgatherqd      172(,%zmm7,1), %ymm1{%k3}       # values for +960
        vshufi32x4      $68, %zmm1, %zmm6, %zmm8        # merge +896 / +960
        vpaddd  %zmm3, %zmm12, %zmm0    # sum += group 7
        vpaddd  %zmm8, %zmm0, %zmm2     # sum += group 8, back in zmm2
        ja      .L7     # more vector iterations remain
.L6:
# Horizontal reduction of the 16 lanes in zmm2 into lane 0. zmm9 is zero and
# is the second table of each vpermi2d, so index values of 16 and above in
# .LC0-.LC2 select zeros.
        vpxord  %zmm9, %zmm9, %zmm9     # zmm9 = 0
        vmovdqa64       .LC0(%rip), %zmm11      # indices 4..19
        movl    %r10d, %ebx     # ebx = niters
        vmovdqa64       .LC1(%rip), %zmm13      # indices 2..17
        andl    $-16, %ebx      # ebx = niters rounded down to a multiple of 16
        vmovdqa64       .LC2(%rip), %zmm15      # indices 1..16
        leal    (%rbx,%r11), %r8d       # r8d = first index not covered by prologue + vector loop
        vshufi32x4      $78, %zmm9, %zmm2, %zmm10       # zmm10 = upper 256 bits of zmm2, zero-extended
        vpaddd  %zmm10, %zmm2, %zmm2    # fold the upper half onto the lower half
        vpermi2d        %zmm9, %zmm2, %zmm11    # lanes moved down by 4
        vpaddd  %zmm11, %zmm2, %zmm12   #
        vpermi2d        %zmm9, %zmm12, %zmm13   # lanes moved down by 2
        vpaddd  %zmm13, %zmm12, %zmm14  #
        vpermi2d        %zmm9, %zmm14, %zmm15   # lanes moved down by 1
        vpaddd  %zmm15, %zmm14, %zmm4   # lane 0 now holds the vector total
        vmovd   %xmm4, %eax     # eax = vector total
        addl    %r9d, %eax      # add the scalar-prologue partial sum
        cmpl    %ebx, %r10d     # was niters a multiple of 16?
        je      .L96    # yes: nothing left for the scalar epilogue
        vzeroupper
.L3:
# Scalar epilogue. On entry: r8d = next index s, eax = sum so far.
        movslq  %r8d, %r9       # r9 = s
        movq    %r9, %rdx       #
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:60:         ncjTotal += srcSet[s]->ncjInUse;
        movq    (%rsi,%r9,8), %rcx      # srcSet[s]
        leaq    1(%r9), %r11    # r11 = s + 1
        notq    %rdx    # rdx = -s - 1
        addl    %edi, %edx      # edx = numLists - s - 1 elements after this one
        andl    $7, %edx        # remainder count ahead of the 8x-unrolled loop
        addl    172(%rcx), %eax # sum += srcSet[s]->ncjInUse
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        cmpl    %r11d, %edi     #
        jle     .L90    # that was the last element
# Remainder dispatch into the chain .L66 ... .L61 (7 falls through).
        testl   %edx, %edx      #
        je      .L9     #
        cmpl    $1, %edx        #
        je      .L61    #
        cmpl    $2, %edx        #
        je      .L62    #
        cmpl    $3, %edx        #
        je      .L63    #
        cmpl    $4, %edx        #
        je      .L64    #
        cmpl    $5, %edx        #
        je      .L65    #
        cmpl    $6, %edx        #
        je      .L66    #
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:60:         ncjTotal += srcSet[s]->ncjInUse;
        movq    (%rsi,%r11,8), %r10     # next list pointer
        leaq    2(%r9), %r11    # r11 = s + 2
        addl    172(%r10), %eax # sum += ncjInUse
.L66:
        movq    (%rsi,%r11,8), %rbx     #
        addq    $1, %r11        #
        addl    172(%rbx), %eax #
.L65:
        movq    (%rsi,%r11,8), %r8      #
        addq    $1, %r11        #
        addl    172(%r8), %eax  #
.L64:
        movq    (%rsi,%r11,8), %r9      #
        addq    $1, %r11        #
        addl    172(%r9), %eax  #
.L63:
        movq    (%rsi,%r11,8), %rdx     #
        addq    $1, %r11        #
        addl    172(%rdx), %eax #
.L62:
        movq    (%rsi,%r11,8), %rcx     #
        addq    $1, %r11        #
        addl    172(%rcx), %eax #
.L61:
        movq    (%rsi,%r11,8), %r10     #
        addq    $1, %r11        #
        addl    172(%r10), %eax #
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        cmpl    %r11d, %edi     #
        jle     .L90    # done
.L9:
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:60:         ncjTotal += srcSet[s]->ncjInUse;
# Scalar loop unrolled 8x: eight pointers loaded, eight fields added.
        movq    (%rsi,%r11,8), %rbx     #
        movq    8(%rsi,%r11,8), %r8     #
        movq    16(%rsi,%r11,8), %r9    #
        movq    24(%rsi,%r11,8), %rdx   #
        movq    32(%rsi,%r11,8), %rcx   #
        movq    40(%rsi,%r11,8), %r10   #
        addl    172(%rbx), %eax #
        movq    48(%rsi,%r11,8), %rbx   #
        addl    172(%r8), %eax  #
        movq    56(%rsi,%r11,8), %r8    #
        addq    $8, %r11        # s += 8
        addl    172(%r9), %eax  #
        addl    172(%rdx), %eax #
        addl    172(%rcx), %eax #
        addl    172(%r10), %eax #
        addl    172(%rbx), %eax #
        addl    172(%r8), %eax  #
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        cmpl    %r11d, %edi     #
        jg      .L9     # s < numLists: next trip
.L90:
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:64: }
        popq    %rbx    # restore callee-saved rbx
        popq    %rsi    # rsi = value saved from r10 in the prologue (entry rsp + 8)
        .cfi_remember_state
# The CFA value was popped into rsi (DWARF register 4). r10 (DWARF register
# 10) has been reused as scratch in the body, so the CFA must be described
# relative to rsi here, not r10.
        .cfi_def_cfa 4, 0
        popq    %rbp    # restore callee-saved rbp
        leaq    -8(%rsi), %rsp  # undo the 64-byte realignment: rsp -> return address
        .cfi_def_cfa 7, 8
        ret
        .p2align 4,,10
        .p2align 3
# Out-of-line tails of the scalar prologue: each sets s (r11d) to the number
# of peeled elements and enters the vector loop setup at .L4.
.L15:
        .cfi_restore_state
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        movl    $3, %r11d       # s = 3
        jmp     .L4     #
        .p2align 4,,10
        .p2align 3
.L10:
        .cfi_def_cfa 7, 8
        .cfi_restore 3
        .cfi_restore 6
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:57:     int ncjTotal = 0;
        xorl    %eax, %eax      # numLists <= 0: return 0
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:64: }
        ret
        .p2align 4,,10
        .p2align 3
.L13:
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        .cfi_escape 0x10,0x3,0x2,0x76,0x70
        .cfi_escape 0x10,0x6,0x2,0x76,0
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        movl    $1, %r11d       # s = 1
        jmp     .L4     #
        .p2align 4,,10
        .p2align 3
.L14:
        movl    $2, %r11d       # s = 2
        jmp     .L4     #
        .p2align 4,,10
        .p2align 3
.L11:
# Short input: skip prologue and vector loop, sum everything in the scalar loop.
        xorl    %r8d, %r8d      # s = 0
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:57:     int ncjTotal = 0;
        xorl    %eax, %eax      # sum = 0
        jmp     .L3     #
        .p2align 4,,10
        .p2align 3
.L16:
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        movl    $4, %r11d       # s = 4
        jmp     .L4     #
        .p2align 4,,10
        .p2align 3
.L17:
        movl    $5, %r11d       # s = 5
        jmp     .L4     #
        .p2align 4,,10
        .p2align 3
.L12:
# Nothing to peel: start the vector loop at element 0.
        xorl    %r11d, %r11d    # s = 0
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:57:     int ncjTotal = 0;
        xorl    %r9d, %r9d      # scalar partial sum = 0
        jmp     .L4     #
        .p2align 4,,10
        .p2align 3
.L18:
# /nethome/hess/gmx/src/gromacs/mdlib/nbnxn_sum.cpp:58:     for (int s = 0; s < numLists; s++)
        movl    $6, %r11d       # s = 6
        jmp     .L4     #
        .p2align 4,,10
        .p2align 3
.L96:
# niters was a multiple of 16: clear upper vector state and return.
        vzeroupper
        jmp     .L90    #
        .cfi_endproc
.LFE2163:
        .size   _Z14sumSimpleListsiPKP16nbnxn_pairlist_t, .-_Z14sumSimpleListsiPKP16nbnxn_pairlist_t
# Read-only dword index tables (64-byte aligned, 16 entries each) loaded by
# the horizontal reduction at .L6 as the index operand of vpermi2d. In that
# code the first table operand is the running sum and the second is an
# all-zero register, so an index i below 16 picks lane i of the sum and an
# index of 16 or more picks a zero. The net effect is a move of all lanes
# towards lane 0 by 4 (.LC0), 2 (.LC1) and 1 (.LC2) positions with zero fill.
# (Viewer line numbers that were interleaved with the listing have been
# removed so that the file is valid assembler input again.)
        .section        .rodata
        .align 64
.LC0:
        .long   4
        .long   5
        .long   6
        .long   7
        .long   8
        .long   9
        .long   10
        .long   11
        .long   12
        .long   13
        .long   14
        .long   15
        .long   16
        .long   17
        .long   18
        .long   19
        .align 64
.LC1:
        .long   2
        .long   3
        .long   4
        .long   5
        .long   6
        .long   7
        .long   8
        .long   9
        .long   10
        .long   11
        .long   12
        .long   13
        .long   14
        .long   15
        .long   16
        .long   17
        .align 64
.LC2:
        .long   1
        .long   2
        .long   3
        .long   4
        .long   5
        .long   6
        .long   7
        .long   8
        .long   9
        .long   10
        .long   11
        .long   12
        .long   13
        .long   14
        .long   15
        .long   16
# Compiler identification string recorded in the object file.
        .ident  "GCC: (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0"
# Empty GNU-stack note section (no flags set).
        .section        .note.GNU-stack,"",@progbits