Forum Home
    • Register
    • Login
    • Search
    • Recent
    • Tags
    • Popular

    [Dev] NeoScrypt GPU Miner - Public Beta Test

    Technical Development
    52
    802
    574910
    Loading More Posts
    • Oldest to Newest
    • Newest to Oldest
    • Most Votes
    Reply
    • Reply as topic
    Log in to reply
    This topic has been deleted. Only users with topic management privileges can see it.
    • S
      slowhash Regular Member last edited by

      There are people doing minor mods to the latest wolf kernel, and bumping the speed up just a tad. One decreased my speed by about 1.5%, the other increased by about 1.5%, but combined they gave me about 9 kh/s on my 290’s, roughly 2.5%.

      // NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20

      // Explicit unroll factors: the AMD compiler ignores the unroll pragma
      // for these two.
      #define SALSA_SMALL_UNROLL 3
      #define CHACHA_SMALL_UNROLL 3

      // Optional compact Blake2S build. When SMALL_BLAKE2S is defined,
      // BLAKE2S_UNROLL is taken as the unroll factor of the ten-round loops,
      // so it has to divide ten evenly. Usually a bad idea, hence disabled.
      //#define SMALL_BLAKE2S
      //#define BLAKE2S_UNROLL 5

      #define BLOCK_SIZE 64U
      #define FASTKDF_BUFFER_SIZE 256U

      // PASSWORD_LEN may be predefined by the build; otherwise default to 80.
      #if !defined(PASSWORD_LEN)
      #define PASSWORD_LEN 80U
      #endif

      #if !defined(cl_khr_byte_addressable_store)
      #error “Device does not support unaligned stores”
      #endif

      // Exchanges the contents of A and B in 128-byte (ulong16) chunks using
      // the XOR-swap trick, so no temporary vector is needed.
      //   A, B - buffers to exchange; declared restrict, so they must not overlap
      //   len  - size in bytes; only whole 128-byte chunks (len >> 7) are swapped
      void SwapBytes128(void *restrict A, void *restrict B, uint len)
      {
          ulong16 *const first = (ulong16 *)A;
          ulong16 *const second = (ulong16 *)B;
          const uint chunks = len >> 7;

          #pragma unroll 2
          for(uint n = 0; n < chunks; ++n)
          {
              first[n] ^= second[n];
              second[n] ^= first[n];
              first[n] ^= second[n];
          }
      }

      // Copies len 128-byte chunks (ulong16 elements) from src to dst.
      //   dst, src - destination / source; declared restrict, so they must
      //              not overlap
      //   len      - number of 128-byte chunks to copy. NOTE: this is a chunk
      //              count, not a byte count (SwapBytes128 takes bytes).
      // The loop index is unsigned to match len, avoiding a signed/unsigned
      // comparison, and the source keeps its const qualifier.
      void CopyBytes128(void *restrict dst, const void *restrict src, uint len)
      {
          ulong16 *const target = (ulong16 *)dst;
          const ulong16 *const source = (const ulong16 *)src;

          #pragma unroll 2
          for(uint i = 0; i < len; ++i)
          {
              target[i] = source[i];
          }
      }

      // Copies len bytes from src to dst, one byte at a time, so neither
      // pointer needs any particular alignment.
      //   dst, src - destination / source; declared restrict, so they must
      //              not overlap
      //   len      - number of bytes to copy
      // The loop index is unsigned to match len, avoiding a signed/unsigned
      // comparison, and the source keeps its const qualifier.
      void CopyBytes(void *restrict dst, const void *restrict src, uint len)
      {
          uchar *const target = (uchar *)dst;
          const uchar *const source = (const uchar *)src;

          for(uint i = 0; i < len; ++i)
          {
              target[i] = source[i];
          }
      }

      //
      // XORs 32 bytes of src into dst, choosing the widest access width the
      // alignment allows - a bit of byte alignment checking goes a long way.
      //
      //   dst, src - 32-byte regions; declared restrict, so they must not overlap
      //   mod      - byte offset used to judge the alignment of the accesses
      //              NOTE(review): presumably the offset of dst/src from an
      //              8-byte aligned base - confirm against the callers
      //
      // OpenCL requires a vector type to be aligned to its own size, so the
      // 8-byte uint2 path is taken only when mod is a multiple of 8. An offset
      // that is a multiple of 4 but not of 8 uses scalar uint accesses instead.
      // Every path XORs exactly the same 32 bytes.
      //
      void XORBytesInPlace(void *restrict dst, const void *restrict src, uint mod)
      {
          switch(mod & 7U)
          {
          case 0:
              // 8-byte aligned: 4 x uint2
              #pragma unroll 2
              for(uint i = 0; i < 4; i += 2)
              {
                  ((uint2 *)dst)[i] ^= ((const uint2 *)src)[i];
                  ((uint2 *)dst)[i + 1] ^= ((const uint2 *)src)[i + 1];
              }
              break;

          case 4:
              // 4-byte aligned only: 8 x uint
              #pragma unroll 4
              for(uint i = 0; i < 8; i += 2)
              {
                  ((uint *)dst)[i] ^= ((const uint *)src)[i];
                  ((uint *)dst)[i + 1] ^= ((const uint *)src)[i + 1];
              }
              break;

          case 2:
          case 6:
              // 2-byte aligned only: 16 x uchar2
              #pragma unroll 8
              for(uint i = 0; i < 16; i += 2)
              {
                  ((uchar2 *)dst)[i] ^= ((const uchar2 *)src)[i];
                  ((uchar2 *)dst)[i + 1] ^= ((const uchar2 *)src)[i + 1];
              }
              break;

          default:
              // Odd offset: 32 single bytes
              #pragma unroll 8
              for(uint i = 0; i < 32; i += 4)
              {
                  ((uchar *)dst)[i] ^= ((const uchar *)src)[i];
                  ((uchar *)dst)[i + 1] ^= ((const uchar *)src)[i + 1];
                  ((uchar *)dst)[i + 2] ^= ((const uchar *)src)[i + 2];
                  ((uchar *)dst)[i + 3] ^= ((const uchar *)src)[i + 3];
              }
              break;
          }
      }

      // Writes the byte-wise XOR of src1 and src2 into dst.
      //   dst        - receives len bytes; declared restrict, so it must not
      //                overlap either source
      //   src1, src2 - the two inputs, len bytes each
      //   len        - number of bytes to process
      // The loop index is unsigned to match len, avoiding a signed/unsigned
      // comparison, and both sources keep their const qualifier.
      void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len)
      {
          uchar *const target = (uchar *)dst;
          const uchar *const left = (const uchar *)src1;
          const uchar *const right = (const uchar *)src2;

          #pragma unroll 1
          for(uint i = 0; i < len; ++i)
          {
              target[i] = left[i] ^ right[i];
          }
      }

      // Blake2S - size constants and lookup tables

      // Sizes in bytes, per their names. Blake2S() uses BLAKE2S_BLOCK_SIZE as
      // the per-block increment of its message length counter.
      #define BLAKE2S_BLOCK_SIZE 64U
      #define BLAKE2S_OUT_SIZE 32U
      #define BLAKE2S_KEY_SIZE 32U

      // Initialization vector: eight 32-bit words that Blake2S() loads into
      // both halves of its 16-word working state.
      static const __constant uint BLAKE2S_IV[8] =
      {
      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
      };

      // Message word schedule: one row per round (ten rounds), sixteen word
      // indices per row. BLAKE_G reads it as BLAKE2S_SIGMA[round][slot] to pick
      // which of the sixteen message words is mixed in at each step.
      static const __constant uchar BLAKE2S_SIGMA[10][16] =
      {
      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
      };

      // One BLAKE2s G mixing step on the four state words a, b, c, d.
      //   idx0       - round number; selects the row of BLAKE2S_SIGMA
      //   idx1       - even slot within that row; slots idx1 and idx1 + 1 pick
      //                the two message words that are added in
      //   a, b, c, d - modifiable state words, updated in place
      //   key        - indexable array holding the sixteen message words
      // rotate() is OpenCL's left rotate, so the amounts 16, 20, 24 and 25 are
      // right rotations by 16, 12, 8 and 7 bits.
      // Every argument is parenthesized so expressions can be passed safely.
      #define BLAKE_G(idx0, idx1, a, b, c, d, key) do { \
          (a) += (b) + (key)[BLAKE2S_SIGMA[(idx0)][(idx1)]]; \
          (d) = rotate((d) ^ (a), 16U); \
          (c) += (d); \
          (b) = rotate((b) ^ (c), 20U); \
          (a) += (b) + (key)[BLAKE2S_SIGMA[(idx0)][(idx1) + 1]]; \
          (d) = rotate((d) ^ (a), 24U); \
          (c) += (d); \
          (b) = rotate((b) ^ (c), 25U); \
      } while(0)

      // Two-block Blake2S specialised for this kernel.
      //
      // The state is compressed twice, ten rounds each: first with the sixteen
      // words of inkey as the message block, then with the sixteen words of
      // inout as the message block (flagged as the last block).
      //
      //   inout - in:  sixteen uints of input data
      //           out: the first eight uints are overwritten with the result
      //   inkey - sixteen uints of key material, read only
      void Blake2S(uint *restrict inout, const uint *restrict inkey)
      {
      // V is the 16-word working state: V.lo is the chaining value,
      // V.hi the IV-derived half.
      uint16 V;
      // Chaining value saved before each compression for the feed-forward XOR.
      uint8 tmpblock;

      // Initialise both halves of the state from the IV
      V.lo = V.hi = vload8(0U, BLAKE2S_IV);

      // XOR the initial constant into the first state word.
      // NOTE(review): presumably the BLAKE2s parameter word (digest length
      // 0x20, key length 0x20, fanout 1, depth 1) - confirm against RFC 7693.
      V.s0 ^= 0x01012020;

      // Save the chaining value for the feed-forward after the rounds
      tmpblock = V.lo;

      // XOR length of message so far (including this block)
      // There are two uints for this field, but high uint is zero
      V.sc ^= BLAKE2S_BLOCK_SIZE;

      // Compress state, using the key as the message block
      #ifdef SMALL_BLAKE2S
      #pragma unroll BLAKE2S_UNROLL
      #else
      #pragma unroll
      #endif
      for(int x = 0; x < 10; ++x)
      {
      // First four calls mix the columns of the 4x4 state ...
      BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey);
      BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey);
      BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey);
      BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey);
      // ... last four mix the diagonals
      BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey);
      BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey);
      BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey);
      BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);
      }

      // Feed-forward: XOR low part of state with the high part,
      // then with the chaining value saved before the rounds.
      V.lo ^= V.hi ^ tmpblock;

      // Reload constants (IV into V.hi) for the second compression
      V.hi = vload8(0U, BLAKE2S_IV);

      // Save the new chaining value for the second feed-forward
      tmpblock = V.lo;

      // XOR the total message length so far (two blocks) into the state
      V.sc ^= BLAKE2S_BLOCK_SIZE << 1;

      // Last block compression - XOR final constant into state
      V.se ^= 0xFFFFFFFFU;

      // Compress block, using the input data as the message block
      #ifdef SMALL_BLAKE2S
      #pragma unroll BLAKE2S_UNROLL
      #else
      #pragma unroll
      #endif
      for(int x = 0; x < 10; ++x)
      {
      // Columns first, then diagonals, as in the first compression
      BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout);
      BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout);
      BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout);
      BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout);
      BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout);
      BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout);
      BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout);
      BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);
      }

      // Feed-forward: XOR low part of state with high part, then with the
      // chaining value saved before the rounds
      V.lo ^= V.hi ^ tmpblock;

      // Store the eight result words at the start of the input/output buffer
      vstore8(V.lo, 0, inout);
      }

      /* FastKDF, a fast buffered key derivation function:
      * FASTKDF_BUFFER_SIZE must be a power of 2;
      * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
      * prf_output_size must be

      1 Reply Last reply Reply Quote 0
      • A
        Alpha Wolf last edited by

        There are people doing minor mods to the latest wolf kernel, and bumping the speed up just a tad. One decreased my speed by about 1.5%, the other increased by about 1.5%, but combined they gave me about 9 kh/s on my 290’s, roughly 2.5%.

        With my already tweaked sgminer.conf and your file/tweaks I increase from 134.5 Kh/s per R9 270 non X to 149.2 Kh/s per card.

        I have dual 270 non X cards in the system I tested with. Well done with your tweaking, and thanks for sharing.

        sgminer-5.1-dev-2014-11-10-win32

        ,
        "xintensity" : "4,4",
        "vectors" : "1,1",
        "worksize" : "64,64",
        "thread-concurrency" : "8192,8192",
        "gpu-engine" : "1100,1100,",
        "gpu-memclock" : "1450,1450",
        
        1 Reply Last reply Reply Quote 0
        • S
          slowhash Regular Member last edited by

          Nice to know that the interest in improving the kernel didn’t go away when Wolf said he was keeping his improvements to himself…

          I’m not in any way faulting him for that decision, but that doesn’t mean that I have to like it either… ;)

          BTW, I got into X11 mining, and guess who showed up as the top kernel writer… lol

          1 Reply Last reply Reply Quote 0
          • W
            Wolf0 Regular Member last edited by

            Nice to know that the interest in improving the kernel didn’t go away when Wolf said he was keeping his improvements to himself…

            I’m not in any way faulting him for that decision, but that doesn’t mean that I have to like it either… ;)

            BTW, I got into X11 mining, and guess who showed up as the top kernel writer… lol

            Haha, yeah, my bins leaked.

            Also, this:

            switch(mod % 4)

            is just bad.

            1 Reply Last reply Reply Quote 0
            • S
              slowhash Regular Member last edited by

              ^^ and like you said, the compiler probably fixed that.

              But people are trying, in order to make mistakes like that. :)

              1 Reply Last reply Reply Quote 0
              • A
                Alpha Wolf last edited by

                Removed this line = 151.1Kh/s per 270 non X…

                 switch(mod % 4)
                  {
                
                1 Reply Last reply Reply Quote 0
                • S
                  slowhash Regular Member last edited by

                  Anyone following the Nvidia neoscrypt development?

                  https://bitcointalk.org/index.php?topic=916336.0

                  1 Reply Last reply Reply Quote 0
                  • kris_davison
                    kris_davison last edited by

                    Yeah I ended up donating 0.02 BTC and he sent me the files. (Windows first but then Linux as Microsoft crap is no good to me.)

                    I’m getting a very healthy ~410khs on my 3x750ti rig up from ~115khs. He said if he gets a total of ~2BTC he will open source his work which is fair enough.

                    1 Reply Last reply Reply Quote 0
                    • SpartanC001
                      SpartanC001 Regular Member last edited by

                      hm, anyone willing to send him btc on my behalf if i send paypal? id really like to try this out

                      got a 660ti painfully munching away on opencl code at 25kh/s, it needs better :D

                      1 Reply Last reply Reply Quote 0
                      • A
                        Alpha Wolf last edited by

                        Anyone following the Nvidia neoscrypt development?

                        https://bitcointalk.org/index.php?topic=916336.0

                        Am now, lol thanks

                        1 Reply Last reply Reply Quote 0
                        • A
                          Alpha Wolf last edited by

                          hm, anyone willing to send him btc on my behalf if i send paypal? id really like to try this out

                          got a 660ti painfully munching away on opencl code at 25kh/s, it needs better :D

                          25kh/s seems a little low for that card, I’m getting 37kh/s from mine.

                          1 Reply Last reply Reply Quote 0
                          • kris_davison
                            kris_davison last edited by

                            I would happily pay the 0.02btc for you but that was the last dregs of my btc. Post a btc address and I’ll see what I can do. :)

                            1 Reply Last reply Reply Quote 0
                            • SpartanC001
                              SpartanC001 Regular Member last edited by

                              25kh/s seems a little low for that card, I’m getting 37kh/s from mine.

                              I can get 34kh/s if i overclock the pants off it, but its not stable and even slight overclocks cause it to crash rather quickly (i think ive killed it :/)

                              I would happily pay the 0.02btc for you but that was the last dregs of my btc. Post a btc address and I’ll see what I can do. :)

                              Would be awesome, ill set up a blockchain.info wallet as i dont have a bitcoin wallet downloaded, and nor do i have enough space for one on my SSD

                              1 Reply Last reply Reply Quote 0
                              • A
                                Alpha Wolf last edited by

                                Anyone here gotten a copy of this new Nvidia ccminer? I’m wondering how it works on cards like the 760 660 Ti?

                                I have a little BTC built up and wondering if its worth it and if you can solo mine with it?

                                I tried to log in over there but don’t think I have an account set up there yet. lol 45 mins until I can try again. lol

                                1 Reply Last reply Reply Quote 0
                                • kris_davison
                                  kris_davison last edited by

                                  Yes me …

                                  I’m getting a very healthy ~410khs on my 3x750ti rig up from ~115khs. He said if he gets a total of ~2BTC he will open source his work which is fair enough.

                                  Also SPARTAN Ive got the BTC so post your address and it will be on its way to you :)

                                  1 Reply Last reply Reply Quote 0
                                  • SpartanC001
                                    SpartanC001 Regular Member last edited by

                                    Anyone here gotten a copy of this new Nvidia ccminer? I’m wondering how it works on cards like the 760 660 Ti?

                                    I have a little BTC built up and wondering if its worth it and if you can solo mine with it?

                                    I tried to log in over there but don’t think I have an account set up there yet. lol 45 mins until I can try again. lol

                                    I have a 660Ti, i will report how it runs once i get my hands on the miner :D

                                    ill probably do some comparisons between stock and overclocked aswell (if i can overclock at all depends on how my card feels - its been pushed pretty damn hard almost its entire lifespan poor thing)

                                    i would expect the 660Ti to get about the same ish maybe higher hashrates than a 750Ti, as the Maxwell cards really do punch above their weight compared to Keplers

                                    1 Reply Last reply Reply Quote 0
                                    • A
                                      Alpha Wolf last edited by

                                      I have a 660Ti, i will report how it runs once i get my hands on the miner :D

                                      ill probably do some comparisons between stock and overclocked aswell (if i can overclock at all depends on how my card feels - its been pushed pretty damn hard almost its entire lifespan poor thing)

                                      i would expect the 660Ti to get about the same ish maybe higher hashrates than a 750Ti, as the Maxwell cards really do punch above their weight compared to Keplers

                                      Thanks, looking forward to hearing about your results.

                                      1 Reply Last reply Reply Quote 0
                                      • SpartanC001
                                        SpartanC001 Regular Member last edited by

                                        Thanks, looking forward to hearing about your results.

                                        Well, i got the miner files, it requires compute version 3.5 or above, this means it will only run on the following cards atm:

                                        GTX 750
                                        GTX 750Ti
                                        GTX 780
                                        GTX 780Ti
                                        GTX Titan
                                        GTX Titan Black
                                        GTX Titan Z
                                        GTX 960
                                        GTX 970
                                        GTX 980

                                        Other Kepler cards are 3.0
                                        Fermi cards are 2.1

                                        I have contacted djm, and he is looking into making the miner work with compute version 2.1/3.0 cards

                                        Sydney

                                        1 Reply Last reply Reply Quote 0
                                        • A
                                          Alpha Wolf last edited by

                                          Cool, thanks for the update.

                                          1 Reply Last reply Reply Quote 0
                                          • A
                                            Alpha Wolf last edited by

                                            Does anyone wish to share a link to the compiled copy for W7 and Linux? I see DJM34 has released the code now.

                                            So I’m guessing it is now okay to share his work with others?

                                            1 Reply Last reply Reply Quote 0
                                            • First post
                                              Last post