diff --git a/contrib/loaders/flash/stm32lx.S b/contrib/loaders/flash/stm32lx.S
index 88deed32e9373cd6e33c6a20dd3338e0703fa6f1..8f9fd0b2b21d778ed98e098aa0beb6a6a9f696e8 100644
--- a/contrib/loaders/flash/stm32lx.S
+++ b/contrib/loaders/flash/stm32lx.S
@@ -8,6 +8,9 @@
  *   Copyright (C) 2011 Clement Burin des Roziers                          *
  *   clement.burin-des-roziers@hikob.com                                   *
  *                                                                         *
+ *   Copyright (C) 2017 Armin van der Togt                                 *
+ *   armin@otheruse.nl                                                     *
+ *                                                                         *
  *   This program is free software; you can redistribute it and/or modify  *
  *   it under the terms of the GNU General Public License as published by  *
  *   the Free Software Foundation; either version 2 of the License, or     *
@@ -28,7 +31,7 @@
 // Build : arm-eabi-gcc -c stm32lx.S
 	.text
 	.syntax unified
-	.cpu cortex-m3
+	.cpu cortex-m0
 	.thumb
 	.thumb_func
 	.global write
@@ -39,24 +42,21 @@
 	r2 - count
 */
 
-	// Set 0 to r3
-	movs	r3, #0
+	// r2 = source + count * 4
+	lsls	r2, r2, #2
+	adds	r2, r1, r2
 	// Go to compare
-	b.n test_done
-
+	b	test_done
 write_word:
-	// Load one word from address in r0, increment by 4
-	ldr.w	ip, [r1], #4
-	// Store the word to address in r1, increment by 4
-	str.w	ip, [r0], #4
-	// Increment r3
-	adds	r3, #1
-
+	// load word from address in r1 and increase r1 by 4
+	ldmia r1!, {r3}
+	// store word to address in r0 and increase r0 by 4
+	stmia r0!, {r3}
 test_done:
-	// Compare r3 and r2
-	cmp 	r3, r2
-	// Loop if not zero
-	bcc.n	write_word
+	// compare r1 and r2
+	cmp	r1, r2
+	// loop if not equal
+	bne	write_word
 
 	// Set breakpoint to exit
 	bkpt	#0x00
diff --git a/src/flash/nor/stm32lx.c b/src/flash/nor/stm32lx.c
index 0b392334fb2149c16407393c8aae0340d3d867ae..63dc9617f6ce76e91299810673e8d01125a91304 100644
--- a/src/flash/nor/stm32lx.c
+++ b/src/flash/nor/stm32lx.c
@@ -450,19 +450,7 @@ static int stm32lx_write_half_pages(struct flash_bank *bank, const uint8_t *buff
 	/* see contib/loaders/flash/stm32lx.S for src */
 
 	static const uint8_t stm32lx_flash_write_code[] = {
-		/* write_word: */
-		0x00, 0x23,             /* movs r3, #0 */
-		0x04, 0xe0,             /* b test_done */
-
-		/* write_word: */
-		0x51, 0xf8, 0x04, 0xcb, /* ldr ip, [r1], #4 */
-		0x40, 0xf8, 0x04, 0xcb, /* str ip, [r0], #4 */
-		0x01, 0x33,             /* adds r3, #1 */
-
-		/* test_done: */
-		0x93, 0x42,             /* cmp r3, r2 */
-		0xf8, 0xd3,             /* bcc write_word */
-		0x00, 0xbe,             /* bkpt 0 */
+			0x92, 0x00, 0x8A, 0x18, 0x01, 0xE0, 0x08, 0xC9, 0x08, 0xC0, 0x91, 0x42, 0xFB, 0xD1, 0x00, 0xBE /* little-endian 16-bit Thumb opcodes, in order: lsls r2, r2, #2; adds r2, r1, r2; b test_done; write_word: ldmia r1!, {r3}; stmia r0!, {r3}; test_done: cmp r1, r2; bne write_word; bkpt #0 */
 	};
 
 	/* Make sure we're performing a half-page aligned write. */
@@ -588,7 +576,7 @@ static int stm32lx_write_half_pages(struct flash_bank *bank, const uint8_t *buff
 		 * is reduced by 50% using this slower method.
 		 */
 
-		LOG_WARNING("couldn't use loader, falling back to page memory writes");
+		LOG_WARNING("Couldn't use loader, falling back to page memory writes");
 
 		while (count > 0) {
 			uint32_t this_count;