Merge remote-tracking branch 'origin/releases/gcc-9' into devel/omp/gcc-9

Update all GCC 9 commits between tag 'releases/gcc-9.2.0' (r9-7613) and
today's GCC 9 branch (r9-8340-g7beafc829c5b122298093ba517023015611aeca8).
diff --git a/ChangeLog.omp b/ChangeLog.omp
new file mode 100644
index 0000000..156a9b9
--- /dev/null
+++ b/ChangeLog.omp
@@ -0,0 +1,11 @@
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+
+	* configure.ac (amdgcn*-*-*): Add target-libffi to noconfigdirs for AMD
+	GCN.
+	* configure: Regenerated.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* Makefile.def (lang_env_dependencies): Disable `cxx' dependency
+	for `libffi'.
+	* Makefile.in: Regenerate.
diff --git a/Makefile.def b/Makefile.def
index 1aab271..c5cc312 100644
--- a/Makefile.def
+++ b/Makefile.def
@@ -162,7 +162,7 @@
                    missing=maintainer-clean; };
 target_modules = { module= winsup; };
 target_modules = { module= libgloss; no_check=true; };
-target_modules = { module= libffi; no_install=true; };
+target_modules = { module= libffi; bootstrap=true; no_install=true; };
 target_modules = { module= zlib; };
 target_modules = { module= rda; };
 target_modules = { module= libada; };
@@ -533,7 +533,10 @@
 // environment (e.g. on libstdc++).  By default target modules depend
 // on libgcc and newlib/libgloss.
 lang_env_dependencies = { module=libitm; cxx=true; };
-lang_env_dependencies = { module=libffi; cxx=true; };
+// Disable libstdc++ dependency for libffi as it's only needed for
+// the library's test suite and it can cause a dependency loop when
+// libgomp, needed by libstdc++, uses libffi.
+// lang_env_dependencies = { module=libffi; cxx=true; };
 lang_env_dependencies = { module=liboffloadmic; cxx=true; };
 lang_env_dependencies = { module=newlib; no_c=true; };
 lang_env_dependencies = { module=libgloss; no_c=true; };
@@ -550,6 +553,8 @@
 dependencies = { module=all-target-libgo; on=all-target-libbacktrace; };
 dependencies = { module=all-target-libgo; on=all-target-libffi; };
 dependencies = { module=all-target-libgo; on=all-target-libatomic; };
+dependencies = { module=all-target-libgomp; on=all-target-libffi; };
+dependencies = { module=configure-target-libgomp; on=configure-target-libffi; };
 dependencies = { module=configure-target-libphobos; on=configure-target-libbacktrace; };
 dependencies = { module=configure-target-libphobos; on=configure-target-zlib; };
 dependencies = { module=all-target-libphobos; on=all-target-libbacktrace; };
@@ -570,6 +575,7 @@
 dependencies = { module=install-target-libgfortran; on=install-target-libgcc; };
 dependencies = { module=install-target-libphobos; on=install-target-libatomic; };
 dependencies = { module=install-target-libsanitizer; on=install-target-libstdc++-v3; };
+dependencies = { module=install-target-libgomp; on=install-target-libffi; };
 dependencies = { module=install-target-libsanitizer; on=install-target-libgcc; };
 dependencies = { module=install-target-libvtv; on=install-target-libstdc++-v3; };
 dependencies = { module=install-target-libvtv; on=install-target-libgcc; };
diff --git a/Makefile.in b/Makefile.in
index 64e091b..e1e2bd7 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1219,7 +1219,9 @@
 all-target: maybe-all-target-libtermcap
 all-target: maybe-all-target-winsup
 all-target: maybe-all-target-libgloss
+@if target-libffi-no-bootstrap
 all-target: maybe-all-target-libffi
+@endif target-libffi-no-bootstrap
 all-target: maybe-all-target-zlib
 all-target: maybe-all-target-rda
 all-target: maybe-all-target-libada
@@ -48974,7 +48976,6 @@
 @if target-libffi
 maybe-configure-target-libffi: configure-target-libffi
 configure-target-libffi: 
-	@: $(MAKE); $(unstage)
 	@r=`${PWD_COMMAND}`; export r; \
 	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
 	echo "Checking multilib configuration for libffi..."; \
@@ -49012,6 +49013,412 @@
 
 
 
+.PHONY: configure-stage1-target-libffi maybe-configure-stage1-target-libffi
+maybe-configure-stage1-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stage1-target-libffi: configure-stage1-target-libffi
+configure-stage1-target-libffi:
+	@[ $(current_stage) = stage1 ] || $(MAKE) stage1-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE1_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage 1 in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	   \
+	  $(STAGE1_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stage2-target-libffi maybe-configure-stage2-target-libffi
+maybe-configure-stage2-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stage2-target-libffi: configure-stage2-target-libffi
+configure-stage2-target-libffi:
+	@[ $(current_stage) = stage2 ] || $(MAKE) stage2-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE2_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage 2 in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGE2_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stage3-target-libffi maybe-configure-stage3-target-libffi
+maybe-configure-stage3-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stage3-target-libffi: configure-stage3-target-libffi
+configure-stage3-target-libffi:
+	@[ $(current_stage) = stage3 ] || $(MAKE) stage3-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE3_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage 3 in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGE3_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stage4-target-libffi maybe-configure-stage4-target-libffi
+maybe-configure-stage4-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stage4-target-libffi: configure-stage4-target-libffi
+configure-stage4-target-libffi:
+	@[ $(current_stage) = stage4 ] || $(MAKE) stage4-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE4_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage 4 in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGE4_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stageprofile-target-libffi maybe-configure-stageprofile-target-libffi
+maybe-configure-stageprofile-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stageprofile-target-libffi: configure-stageprofile-target-libffi
+configure-stageprofile-target-libffi:
+	@[ $(current_stage) = stageprofile ] || $(MAKE) stageprofile-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEprofile_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage profile in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGEprofile_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stagetrain-target-libffi maybe-configure-stagetrain-target-libffi
+maybe-configure-stagetrain-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stagetrain-target-libffi: configure-stagetrain-target-libffi
+configure-stagetrain-target-libffi:
+	@[ $(current_stage) = stagetrain ] || $(MAKE) stagetrain-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEtrain_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage train in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGEtrain_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stagefeedback-target-libffi maybe-configure-stagefeedback-target-libffi
+maybe-configure-stagefeedback-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stagefeedback-target-libffi: configure-stagefeedback-target-libffi
+configure-stagefeedback-target-libffi:
+	@[ $(current_stage) = stagefeedback ] || $(MAKE) stagefeedback-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEfeedback_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage feedback in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGEfeedback_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stageautoprofile-target-libffi maybe-configure-stageautoprofile-target-libffi
+maybe-configure-stageautoprofile-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stageautoprofile-target-libffi: configure-stageautoprofile-target-libffi
+configure-stageautoprofile-target-libffi:
+	@[ $(current_stage) = stageautoprofile ] || $(MAKE) stageautoprofile-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEautoprofile_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage autoprofile in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGEautoprofile_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+.PHONY: configure-stageautofeedback-target-libffi maybe-configure-stageautofeedback-target-libffi
+maybe-configure-stageautofeedback-target-libffi:
+@if target-libffi-bootstrap
+maybe-configure-stageautofeedback-target-libffi: configure-stageautofeedback-target-libffi
+configure-stageautofeedback-target-libffi:
+	@[ $(current_stage) = stageautofeedback ] || $(MAKE) stageautofeedback-start
+	@$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEautofeedback_TFLAGS)"; \
+	echo "Checking multilib configuration for libffi..."; \
+	$(CC_FOR_TARGET) --print-multi-lib > $(TARGET_SUBDIR)/libffi/multilib.tmp 2> /dev/null; \
+	if test -r $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	  if cmp -s $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; then \
+	    rm -f $(TARGET_SUBDIR)/libffi/multilib.tmp; \
+	  else \
+	    rm -f $(TARGET_SUBDIR)/libffi/Makefile; \
+	    mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	  fi; \
+	else \
+	  mv $(TARGET_SUBDIR)/libffi/multilib.tmp $(TARGET_SUBDIR)/libffi/multilib.out; \
+	fi; \
+	test ! -f $(TARGET_SUBDIR)/libffi/Makefile || exit 0; \
+	$(NORMAL_TARGET_EXPORTS) \
+	 \
+	CFLAGS="$(CFLAGS_FOR_TARGET)"; export CFLAGS; \
+	CXXFLAGS="$(CXXFLAGS_FOR_TARGET)"; export CXXFLAGS; \
+	LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)"; export LIBCFLAGS;  \
+	echo Configuring stage autofeedback in $(TARGET_SUBDIR)/libffi; \
+	$(SHELL) $(srcdir)/mkinstalldirs $(TARGET_SUBDIR)/libffi; \
+	cd $(TARGET_SUBDIR)/libffi || exit 1; \
+	case $(srcdir) in \
+	  /* | [A-Za-z]:[\\/]*) topdir=$(srcdir) ;; \
+	  *) topdir=`echo $(TARGET_SUBDIR)/libffi/ | \
+		sed -e 's,\./,,g' -e 's,[^/]*/,../,g' `$(srcdir) ;; \
+	esac; \
+	module_srcdir=libffi; \
+	$(SHELL) $$s/$$module_srcdir/configure \
+	  --srcdir=$${topdir}/$$module_srcdir \
+	  $(TARGET_CONFIGARGS) --build=${build_alias} --host=${target_alias} \
+	  --target=${target_alias} \
+	  --with-build-libsubdir=$(HOST_SUBDIR) \
+	  $(STAGEautofeedback_CONFIGURE_FLAGS)
+@endif target-libffi-bootstrap
+
+
+
 
 
 .PHONY: all-target-libffi maybe-all-target-libffi
@@ -49023,7 +49430,6 @@
 TARGET-target-libffi=all
 maybe-all-target-libffi: all-target-libffi
 all-target-libffi: configure-target-libffi
-	@: $(MAKE); $(unstage)
 	@r=`${PWD_COMMAND}`; export r; \
 	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
 	$(NORMAL_TARGET_EXPORTS)  \
@@ -49034,6 +49440,387 @@
 
 
 
+.PHONY: all-stage1-target-libffi maybe-all-stage1-target-libffi
+.PHONY: clean-stage1-target-libffi maybe-clean-stage1-target-libffi
+maybe-all-stage1-target-libffi:
+maybe-clean-stage1-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stage1-target-libffi: all-stage1-target-libffi
+all-stage1: all-stage1-target-libffi
+TARGET-stage1-target-libffi = $(TARGET-target-libffi)
+all-stage1-target-libffi: configure-stage1-target-libffi
+	@[ $(current_stage) = stage1 ] || $(MAKE) stage1-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE1_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS)  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)  \
+		  \
+		TFLAGS="$(STAGE1_TFLAGS)"  \
+		$(TARGET-stage1-target-libffi)
+
+maybe-clean-stage1-target-libffi: clean-stage1-target-libffi
+clean-stage1: clean-stage1-target-libffi
+clean-stage1-target-libffi:
+	@if [ $(current_stage) = stage1 ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stage1-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stage1-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)  \
+	  clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stage2-target-libffi maybe-all-stage2-target-libffi
+.PHONY: clean-stage2-target-libffi maybe-clean-stage2-target-libffi
+maybe-all-stage2-target-libffi:
+maybe-clean-stage2-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stage2-target-libffi: all-stage2-target-libffi
+all-stage2: all-stage2-target-libffi
+TARGET-stage2-target-libffi = $(TARGET-target-libffi)
+all-stage2-target-libffi: configure-stage2-target-libffi
+	@[ $(current_stage) = stage2 ] || $(MAKE) stage2-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE2_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGE2_TFLAGS)"  \
+		$(TARGET-stage2-target-libffi)
+
+maybe-clean-stage2-target-libffi: clean-stage2-target-libffi
+clean-stage2: clean-stage2-target-libffi
+clean-stage2-target-libffi:
+	@if [ $(current_stage) = stage2 ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stage2-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stage2-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stage3-target-libffi maybe-all-stage3-target-libffi
+.PHONY: clean-stage3-target-libffi maybe-clean-stage3-target-libffi
+maybe-all-stage3-target-libffi:
+maybe-clean-stage3-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stage3-target-libffi: all-stage3-target-libffi
+all-stage3: all-stage3-target-libffi
+TARGET-stage3-target-libffi = $(TARGET-target-libffi)
+all-stage3-target-libffi: configure-stage3-target-libffi
+	@[ $(current_stage) = stage3 ] || $(MAKE) stage3-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE3_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGE3_TFLAGS)"  \
+		$(TARGET-stage3-target-libffi)
+
+maybe-clean-stage3-target-libffi: clean-stage3-target-libffi
+clean-stage3: clean-stage3-target-libffi
+clean-stage3-target-libffi:
+	@if [ $(current_stage) = stage3 ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stage3-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stage3-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stage4-target-libffi maybe-all-stage4-target-libffi
+.PHONY: clean-stage4-target-libffi maybe-clean-stage4-target-libffi
+maybe-all-stage4-target-libffi:
+maybe-clean-stage4-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stage4-target-libffi: all-stage4-target-libffi
+all-stage4: all-stage4-target-libffi
+TARGET-stage4-target-libffi = $(TARGET-target-libffi)
+all-stage4-target-libffi: configure-stage4-target-libffi
+	@[ $(current_stage) = stage4 ] || $(MAKE) stage4-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGE4_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGE4_TFLAGS)"  \
+		$(TARGET-stage4-target-libffi)
+
+maybe-clean-stage4-target-libffi: clean-stage4-target-libffi
+clean-stage4: clean-stage4-target-libffi
+clean-stage4-target-libffi:
+	@if [ $(current_stage) = stage4 ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stage4-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stage4-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stageprofile-target-libffi maybe-all-stageprofile-target-libffi
+.PHONY: clean-stageprofile-target-libffi maybe-clean-stageprofile-target-libffi
+maybe-all-stageprofile-target-libffi:
+maybe-clean-stageprofile-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stageprofile-target-libffi: all-stageprofile-target-libffi
+all-stageprofile: all-stageprofile-target-libffi
+TARGET-stageprofile-target-libffi = $(TARGET-target-libffi)
+all-stageprofile-target-libffi: configure-stageprofile-target-libffi
+	@[ $(current_stage) = stageprofile ] || $(MAKE) stageprofile-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEprofile_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGEprofile_TFLAGS)"  \
+		$(TARGET-stageprofile-target-libffi)
+
+maybe-clean-stageprofile-target-libffi: clean-stageprofile-target-libffi
+clean-stageprofile: clean-stageprofile-target-libffi
+clean-stageprofile-target-libffi:
+	@if [ $(current_stage) = stageprofile ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stageprofile-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stageprofile-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stagetrain-target-libffi maybe-all-stagetrain-target-libffi
+.PHONY: clean-stagetrain-target-libffi maybe-clean-stagetrain-target-libffi
+maybe-all-stagetrain-target-libffi:
+maybe-clean-stagetrain-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stagetrain-target-libffi: all-stagetrain-target-libffi
+all-stagetrain: all-stagetrain-target-libffi
+TARGET-stagetrain-target-libffi = $(TARGET-target-libffi)
+all-stagetrain-target-libffi: configure-stagetrain-target-libffi
+	@[ $(current_stage) = stagetrain ] || $(MAKE) stagetrain-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEtrain_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGEtrain_TFLAGS)"  \
+		$(TARGET-stagetrain-target-libffi)
+
+maybe-clean-stagetrain-target-libffi: clean-stagetrain-target-libffi
+clean-stagetrain: clean-stagetrain-target-libffi
+clean-stagetrain-target-libffi:
+	@if [ $(current_stage) = stagetrain ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stagetrain-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stagetrain-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stagefeedback-target-libffi maybe-all-stagefeedback-target-libffi
+.PHONY: clean-stagefeedback-target-libffi maybe-clean-stagefeedback-target-libffi
+maybe-all-stagefeedback-target-libffi:
+maybe-clean-stagefeedback-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stagefeedback-target-libffi: all-stagefeedback-target-libffi
+all-stagefeedback: all-stagefeedback-target-libffi
+TARGET-stagefeedback-target-libffi = $(TARGET-target-libffi)
+all-stagefeedback-target-libffi: configure-stagefeedback-target-libffi
+	@[ $(current_stage) = stagefeedback ] || $(MAKE) stagefeedback-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEfeedback_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGEfeedback_TFLAGS)"  \
+		$(TARGET-stagefeedback-target-libffi)
+
+maybe-clean-stagefeedback-target-libffi: clean-stagefeedback-target-libffi
+clean-stagefeedback: clean-stagefeedback-target-libffi
+clean-stagefeedback-target-libffi:
+	@if [ $(current_stage) = stagefeedback ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stagefeedback-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stagefeedback-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stageautoprofile-target-libffi maybe-all-stageautoprofile-target-libffi
+.PHONY: clean-stageautoprofile-target-libffi maybe-clean-stageautoprofile-target-libffi
+maybe-all-stageautoprofile-target-libffi:
+maybe-clean-stageautoprofile-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stageautoprofile-target-libffi: all-stageautoprofile-target-libffi
+all-stageautoprofile: all-stageautoprofile-target-libffi
+TARGET-stageautoprofile-target-libffi = $(TARGET-target-libffi)
+all-stageautoprofile-target-libffi: configure-stageautoprofile-target-libffi
+	@[ $(current_stage) = stageautoprofile ] || $(MAKE) stageautoprofile-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEautoprofile_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$$s/gcc/config/i386/$(AUTO_PROFILE) \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGEautoprofile_TFLAGS)"  \
+		$(TARGET-stageautoprofile-target-libffi)
+
+maybe-clean-stageautoprofile-target-libffi: clean-stageautoprofile-target-libffi
+clean-stageautoprofile: clean-stageautoprofile-target-libffi
+clean-stageautoprofile-target-libffi:
+	@if [ $(current_stage) = stageautoprofile ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stageautoprofile-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stageautoprofile-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+.PHONY: all-stageautofeedback-target-libffi maybe-all-stageautofeedback-target-libffi
+.PHONY: clean-stageautofeedback-target-libffi maybe-clean-stageautofeedback-target-libffi
+maybe-all-stageautofeedback-target-libffi:
+maybe-clean-stageautofeedback-target-libffi:
+@if target-libffi-bootstrap
+maybe-all-stageautofeedback-target-libffi: all-stageautofeedback-target-libffi
+all-stageautofeedback: all-stageautofeedback-target-libffi
+TARGET-stageautofeedback-target-libffi = $(TARGET-target-libffi)
+all-stageautofeedback-target-libffi: configure-stageautofeedback-target-libffi
+	@[ $(current_stage) = stageautofeedback ] || $(MAKE) stageautofeedback-start
+	@r=`${PWD_COMMAND}`; export r; \
+	s=`cd $(srcdir); ${PWD_COMMAND}`; export s; \
+	TFLAGS="$(STAGEautofeedback_TFLAGS)"; \
+	$(NORMAL_TARGET_EXPORTS) \
+	  \
+	cd $(TARGET_SUBDIR)/libffi && \
+	 \
+	$(MAKE) $(BASE_FLAGS_TO_PASS) \
+		CFLAGS="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS="$(LIBCFLAGS_FOR_TARGET)" \
+		CFLAGS_FOR_TARGET="$(CFLAGS_FOR_TARGET)" \
+		CXXFLAGS_FOR_TARGET="$(CXXFLAGS_FOR_TARGET)" \
+		LIBCFLAGS_FOR_TARGET="$(LIBCFLAGS_FOR_TARGET)" \
+		$(EXTRA_TARGET_FLAGS)   \
+		TFLAGS="$(STAGEautofeedback_TFLAGS)" PERF_DATA=perf.data \
+		$(TARGET-stageautofeedback-target-libffi)
+
+maybe-clean-stageautofeedback-target-libffi: clean-stageautofeedback-target-libffi
+clean-stageautofeedback: clean-stageautofeedback-target-libffi
+clean-stageautofeedback-target-libffi:
+	@if [ $(current_stage) = stageautofeedback ]; then \
+	  [ -f $(TARGET_SUBDIR)/libffi/Makefile ] || exit 0; \
+	else \
+	  [ -f $(TARGET_SUBDIR)/stageautofeedback-libffi/Makefile ] || exit 0; \
+	  $(MAKE) stageautofeedback-start; \
+	fi; \
+	cd $(TARGET_SUBDIR)/libffi && \
+	$(MAKE) $(EXTRA_TARGET_FLAGS)   clean
+@endif target-libffi-bootstrap
+
+
+
+
 
 
 .PHONY: check-target-libffi maybe-check-target-libffi
@@ -56251,7 +57038,15 @@
 configure-target-libtermcap: stage_last
 configure-target-winsup: stage_last
 configure-target-libgloss: stage_last
-configure-target-libffi: stage_last
+configure-stage1-target-libffi: maybe-all-stage1-gcc
+configure-stage2-target-libffi: maybe-all-stage2-gcc
+configure-stage3-target-libffi: maybe-all-stage3-gcc
+configure-stage4-target-libffi: maybe-all-stage4-gcc
+configure-stageprofile-target-libffi: maybe-all-stageprofile-gcc
+configure-stagetrain-target-libffi: maybe-all-stagetrain-gcc
+configure-stagefeedback-target-libffi: maybe-all-stagefeedback-gcc
+configure-stageautoprofile-target-libffi: maybe-all-stageautoprofile-gcc
+configure-stageautofeedback-target-libffi: maybe-all-stageautofeedback-gcc
 configure-target-zlib: stage_last
 configure-target-rda: stage_last
 configure-target-libada: stage_last
@@ -57274,10 +58069,28 @@
 all-m4: maybe-all-build-texinfo
 configure-target-fastjar: maybe-configure-target-zlib
 all-target-fastjar: maybe-all-target-zlib
-configure-target-libgo: maybe-configure-target-libffi
 all-target-libgo: maybe-all-target-libbacktrace
-all-target-libgo: maybe-all-target-libffi
 all-target-libgo: maybe-all-target-libatomic
+all-target-libgomp: maybe-all-target-libffi
+all-stage1-target-libgomp: maybe-all-stage1-target-libffi
+all-stage2-target-libgomp: maybe-all-stage2-target-libffi
+all-stage3-target-libgomp: maybe-all-stage3-target-libffi
+all-stage4-target-libgomp: maybe-all-stage4-target-libffi
+all-stageprofile-target-libgomp: maybe-all-stageprofile-target-libffi
+all-stagetrain-target-libgomp: maybe-all-stagetrain-target-libffi
+all-stagefeedback-target-libgomp: maybe-all-stagefeedback-target-libffi
+all-stageautoprofile-target-libgomp: maybe-all-stageautoprofile-target-libffi
+all-stageautofeedback-target-libgomp: maybe-all-stageautofeedback-target-libffi
+configure-target-libgomp: maybe-configure-target-libffi
+configure-stage1-target-libgomp: maybe-configure-stage1-target-libffi
+configure-stage2-target-libgomp: maybe-configure-stage2-target-libffi
+configure-stage3-target-libgomp: maybe-configure-stage3-target-libffi
+configure-stage4-target-libgomp: maybe-configure-stage4-target-libffi
+configure-stageprofile-target-libgomp: maybe-configure-stageprofile-target-libffi
+configure-stagetrain-target-libgomp: maybe-configure-stagetrain-target-libffi
+configure-stagefeedback-target-libgomp: maybe-configure-stagefeedback-target-libffi
+configure-stageautoprofile-target-libgomp: maybe-configure-stageautoprofile-target-libffi
+configure-stageautofeedback-target-libgomp: maybe-configure-stageautofeedback-target-libffi
 configure-target-libphobos: maybe-configure-target-libbacktrace
 configure-target-libphobos: maybe-configure-target-zlib
 all-target-libphobos: maybe-all-target-libbacktrace
@@ -57328,6 +58141,7 @@
 install-target-libgfortran: maybe-install-target-libgcc
 install-target-libphobos: maybe-install-target-libatomic
 install-target-libsanitizer: maybe-install-target-libstdc++-v3
+install-target-libgomp: maybe-install-target-libffi
 install-target-libsanitizer: maybe-install-target-libgcc
 install-target-libvtv: maybe-install-target-libstdc++-v3
 install-target-libvtv: maybe-install-target-libgcc
@@ -57387,7 +58201,9 @@
 all-bison: maybe-all-intl
 all-flex: maybe-all-intl
 all-m4: maybe-all-intl
+configure-target-libgo: maybe-configure-target-libffi
 configure-target-libgo: maybe-all-target-libstdc++-v3
+all-target-libgo: maybe-all-target-libffi
 configure-target-liboffloadmic: maybe-configure-target-libgomp
 all-target-liboffloadmic: maybe-all-target-libgomp
 configure-target-newlib: maybe-all-binutils
@@ -57427,6 +58243,15 @@
 configure-stagefeedback-target-libvtv: maybe-all-stagefeedback-target-libgcc
 configure-stageautoprofile-target-libvtv: maybe-all-stageautoprofile-target-libgcc
 configure-stageautofeedback-target-libvtv: maybe-all-stageautofeedback-target-libgcc
+configure-stage1-target-libffi: maybe-all-stage1-target-libgcc
+configure-stage2-target-libffi: maybe-all-stage2-target-libgcc
+configure-stage3-target-libffi: maybe-all-stage3-target-libgcc
+configure-stage4-target-libffi: maybe-all-stage4-target-libgcc
+configure-stageprofile-target-libffi: maybe-all-stageprofile-target-libgcc
+configure-stagetrain-target-libffi: maybe-all-stagetrain-target-libgcc
+configure-stagefeedback-target-libffi: maybe-all-stagefeedback-target-libgcc
+configure-stageautoprofile-target-libffi: maybe-all-stageautoprofile-target-libgcc
+configure-stageautofeedback-target-libffi: maybe-all-stageautofeedback-target-libgcc
 configure-stage1-target-libgomp: maybe-all-stage1-target-libgcc
 configure-stage2-target-libgomp: maybe-all-stage2-target-libgcc
 configure-stage3-target-libgomp: maybe-all-stage3-target-libgcc
@@ -57498,7 +58323,6 @@
 
 
 configure-target-libffi: maybe-all-target-newlib maybe-all-target-libgloss
-configure-target-libffi: maybe-all-target-libstdc++-v3
 
 configure-target-zlib: maybe-all-target-newlib maybe-all-target-libgloss
 
diff --git a/config/ChangeLog.omp b/config/ChangeLog.omp
new file mode 100644
index 0000000..0a4142d
--- /dev/null
+++ b/config/ChangeLog.omp
@@ -0,0 +1,7 @@
+2019-06-25  Kwok Cheung Yeung  <kcy@codesourcery.com>
+            Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	* gthr.m4 (GCC_AC_THREAD_HEADER): Add case for gcn.
+
diff --git a/config/gthr.m4 b/config/gthr.m4
index 7b29f1f..4b93730 100644
--- a/config/gthr.m4
+++ b/config/gthr.m4
@@ -13,6 +13,7 @@
 case $1 in
     aix)	thread_header=config/rs6000/gthr-aix.h ;;
     dce)	thread_header=config/pa/gthr-dce.h ;;
+    gcn)	thread_header=config/gcn/gthr-gcn.h ;;
     lynx)	thread_header=config/gthr-lynx.h ;;
     mipssde)	thread_header=config/mips/gthr-mipssde.h ;;
     posix)	thread_header=gthr-posix.h ;;
diff --git a/configure b/configure
index abd93a9..ef00d1f 100755
--- a/configure
+++ b/configure
@@ -3466,6 +3466,9 @@
   alpha*-*-*vms*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
+  amdgcn*-*-*)
+    noconfigdirs="$noconfigdirs target-libffi"
+    ;;
   arm*-*-freebsd*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
@@ -3513,11 +3516,19 @@
   ft32-*-*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
+  nvptx-*-*)
+    noconfigdirs="$noconfigdirs target-libffi"
+    ;;
   *-*-lynxos*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
 esac
 
+libgomp_deps="target-libffi"
+if echo " ${noconfigdirs} " | grep " target-libffi " > /dev/null 2>&1 ; then
+   libgomp_deps=""
+fi
+
 # Disable the go frontend on systems where it is known to not work. Please keep
 # this in sync with contrib/config-list.mk.
 case "${target}" in
@@ -6588,6 +6599,15 @@
 # $build_configdirs and $target_configdirs.
 # If we have the source for $noconfigdirs entries, add them to $notsupp.
 
+# libgomp depends on libffi.  Remove it from nonsupp if necessary.
+if ! (echo " $noconfigdirs " | grep " target-libgomp " >/dev/null 2>&1); then
+  if echo " $noconfigdirs " | grep " target-libffi " >/dev/null 2>&1; then
+    if test "x${libgomp_deps}" != x; then
+      noconfigdirs=`echo " $noconfigdirs " | sed -e "s/ target-libffi / /"`
+    fi
+  fi
+fi
+
 notsupp=""
 for dir in . $skipdirs $noconfigdirs ; do
   dirname=`echo $dir | sed -e s/target-//g -e s/build-//g`
@@ -7182,6 +7202,9 @@
 
 # If we are building libgomp, bootstrap it.
 if echo " ${target_configdirs} " | grep " libgomp " > /dev/null 2>&1 ; then
+   if echo " ${target_configdirs} " | grep " libffi " > /dev/null 2>&1 ; then
+     bootstrap_target_libs=${bootstrap_target_libs}target-libffi,
+   fi
   bootstrap_target_libs=${bootstrap_target_libs}target-libgomp,
 fi
 
diff --git a/configure.ac b/configure.ac
index 9db4fd1..5184b82 100644
--- a/configure.ac
+++ b/configure.ac
@@ -748,6 +748,9 @@
   alpha*-*-*vms*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
+  amdgcn*-*-*)
+    noconfigdirs="$noconfigdirs target-libffi"
+    ;;
   arm*-*-freebsd*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
@@ -795,11 +798,19 @@
   ft32-*-*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
+  nvptx-*-*)
+    noconfigdirs="$noconfigdirs target-libffi"
+    ;;
   *-*-lynxos*)
     noconfigdirs="$noconfigdirs target-libffi"
     ;;
 esac
 
+libgomp_deps="target-libffi"
+if echo " ${noconfigdirs} " | grep " target-libffi " > /dev/null 2>&1 ; then
+   libgomp_deps=""
+fi
+
 # Disable the go frontend on systems where it is known to not work. Please keep
 # this in sync with contrib/config-list.mk.
 case "${target}" in
@@ -2178,6 +2189,15 @@
 # $build_configdirs and $target_configdirs.
 # If we have the source for $noconfigdirs entries, add them to $notsupp.
 
+# libgomp depends on libffi.  Remove it from nonsupp if necessary.
+if ! (echo " $noconfigdirs " | grep " target-libgomp " >/dev/null 2>&1); then
+  if echo " $noconfigdirs " | grep " target-libffi " >/dev/null 2>&1; then
+    if test "x${libgomp_deps}" != x; then
+      noconfigdirs=`echo " $noconfigdirs " | sed -e "s/ target-libffi / /"`
+    fi
+  fi
+fi
+
 notsupp=""
 for dir in . $skipdirs $noconfigdirs ; do
   dirname=`echo $dir | sed -e s/target-//g -e s/build-//g`
@@ -2695,6 +2715,9 @@
 
 # If we are building libgomp, bootstrap it.
 if echo " ${target_configdirs} " | grep " libgomp " > /dev/null 2>&1 ; then
+   if echo " ${target_configdirs} " | grep " libffi " > /dev/null 2>&1 ; then
+     bootstrap_target_libs=${bootstrap_target_libs}target-libffi,
+   fi
   bootstrap_target_libs=${bootstrap_target_libs}target-libgomp,
 fi
 
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
new file mode 100644
index 0000000..d024ee7
--- /dev/null
+++ b/gcc/ChangeLog.omp
@@ -0,0 +1,907 @@
+2019-11-18  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	Backport from mainline:
+
+	2019-11-07  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* ira.c (setup_alloc_regs): Setup no_unit_alloc_regs for
+	frame pointer in multiple registers.
+	(ira_setup_eliminable_regset): Setup eliminable_regset,
+	ira_no_alloc_regs and regs_ever_live for frame pointer in
+	multiple registers.
+
+	2019-11-10  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* lra-spills.c (assign_spill_hard_regs): Do not spill into
+	registers in eliminable_regset.
+
+	2019-11-14  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* lra-spills.c (assign_spill_hard_regs): Check that the spill
+	register is suitable for the mode.
+
+	2019-11-15  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* config/gcn/gcn.c (gcn_regno_reg_class): Return VCC_CONDITIONAL_REG
+	register class for VCC_LO and VCC_HI.
+	(gcn_spill_class): Use SGPR_REGS to spill registers in
+	VCC_CONDITIONAL_REG.
+
+	2019-11-15  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* config/gcn/gcn.c (gcn_expand_prologue): Remove initialization and
+	prologue use of v0.
+	(print_operand_address): Use v1 for zero vector offset.
+
+	2019-11-15  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* config/gcn/gcn.c (gcn_init_cumulative_args): Call reinit_regs.
+
+	2019-11-15  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* config/gcn/gcn.c (default_requested_args): New.
+	(gcn_parse_amdgpu_hsa_kernel_attribute): Initialize requested args
+	set with default_requested_args.
+	(gcn_conditional_register_usage): Limit register usage of non-kernel
+	functions.  Reassign fixed registers if a non-standard set of args is
+	requested.
+	* config/gcn/gcn.h (FIXED_REGISTERS): Fix registers according to ABI.
+
+	2019-11-15  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* config/gcn/gcn.c (MAX_NORMAL_SGPR_COUNT, MAX_NORMAL_VGPR_COUNT): New.
+	(gcn_conditional_register_usage): Use constants in place of hard-coded
+	values.
+	(gcn_hsa_declare_function_name): Set lower bound for number of
+	SGPRs/VGPRs in non-leaf kernels to MAX_NORMAL_SGPR_COUNT and
+	MAX_NORMAL_VGPR_COUNT.
+
+	2019-11-15  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* config/gcn/gcn.h (FIXED_REGISTERS): Unfix frame pointer.
+	(CALL_USED_REGISTERS): Make frame pointer callee-saved.
+
+2019-10-16  Julian Brown  <julian@codesourcery.com>
+
+	* config/gcn/gcn-protos.h (gcn_goacc_adjust_gangprivate_decl): Rename
+	to...
+	(gcn_goacc_adjust_private_decl): ...this.
+	* config/gcn/gcn-tree.c (diagnostic-core.h): Include.
+	(gcn_goacc_adjust_gangprivate_decl): Rename to...
+	(gcn_goacc_adjust_private_decl): ...this. Add LEVEL parameter.
+	* config/gcn/gcn.c (TARGET_GOACC_ADJUST_GANGPRIVATE_DECL): Rename to...
+	(TARGET_GOACC_ADJUST_PRIVATE_DECL): ...this.
+	* config/nvptx/nvptx.c (tree-pretty-print.h): Include.
+	(nvptx_goacc_adjust_private_decl): New function.
+	(TARGET_GOACC_ADJUST_PRIVATE_DECL): Define hook using above function.
+	* doc/tm.texi.in (TARGET_GOACC_ADJUST_GANGPRIVATE_DECL): Rename to...
+	(TARGET_GOACC_ADJUST_PRIVATE_DECL): ...this.
+	* doc/tm.texi: Regenerated.
+	* internal-fn.c (expand_UNIQUE): Handle IFN_UNIQUE_OACC_PRIVATE.
+	* internal-fn.h (IFN_UNIQUE_CODES): Add OACC_PRIVATE.
+	* omp-low.c (omp_context): Remove oacc_partitioning_levels field.
+	(lower_oacc_reductions): Add PRIVATE_MARKER parameter.  Insert before
+	fork.
+	(lower_oacc_head_tail): Add PRIVATE_MARKER parameter. Modify its
+	gimple call arguments as appropriate. Don't set
+	oacc_partitioning_levels in omp_context. Pass private_marker to
+	lower_oacc_reductions.
+	(oacc_record_private_var_clauses): Don't check for NULL ctx.
+	(make_oacc_private_marker): New function.
+	(lower_omp_for): Only call oacc_record_vars_in_bind for
+	OpenACC contexts.  Create private marker and pass to
+	lower_oacc_head_tail.
+	(lower_omp_target): Remove unnecessary call to
+	oacc_record_private_var_clauses. Remove call to mark_oacc_gangprivate.
+	Create private marker and pass to lower_oacc_reductions.
+	(process_oacc_gangprivate_1): Remove.
+	(lower_omp_1): Only call oacc_record_vars_in_bind for OpenACC.  Don't
+	iterate over contexts calling process_oacc_gangprivate_1.
+	(omp-offload.c (oacc_loop_xform_head_tail): Treat
+	private-variable markers like fork/join when transforming head/tail
+	sequences.
+	(execute_oacc_device_lower): Use IFN_UNIQUE_OACC_PRIVATE instead of
+	"oacc gangprivate" attributes to determine partitioning level of
+	variables.
+	* omp-sese.c (find_gangprivate_vars): New function.
+	(find_local_vars_to_propagate): Use GANGPRIVATE_VARS parameter instead
+	of "oacc gangprivate" attribute to determine which variables are
+	gang-private.
+	(oacc_do_neutering): Use find_gangprivate_vars.
+	* target.def (adjust_gangprivate_decl): Rename to...
+	(adjust_private_decl): ...this.  Update documentation (briefly).
+
+2019-10-09  Tobias Burnus  <tobias@codesourcery.com>
+
+	* f95-lang.c (LANG_HOOKS_OMP_ARRAY_DATA): Set to gfc_omp_array_data.
+	* trans-array.c (gfc_conv_descriptor_data_get): Handle ref types.
+	* trans-openmp.c (gfc_omp_array_data): New.
+	* trans.h (gfc_omp_array_data): Declare.
+
+2019-09-20  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (localize_reductions): Rewrite references for
+	OMP_CLAUSE_PRIVATE also.
+
+2019-09-17  Tobias Burnus  <tobias@codesourcery.com>
+
+	* config/gcn/gcn.c (gcn_expand_scalar_to_vector_address,
+	gcn_md_reorg): Remove unused statement.
+	(gcn_emutls_var_init): Add missing return - after sorry abort.
+	* config/gcn/gcn.md (movdi_symbol_save_scc): Fix condition.
+	* config/gcn/mkoffload.c (process_obj): Remove unused variables.
+	* gimplify.c (gomp_oacc_needs_data_present): Likewise.
+	(gimplify_adjust_omp_clauses): Fix condition by adding ().
+	* omp-low.c (process_oacc_gangprivate_1): Comment unused
+	parameter name to silence unused warning.
+	* omp-sese.c (omp_sese_number, omp_sese_pseudo): Remove
+	superfluous ().
+	(oacc_do_neutering): Use signed int to avoid a warning.
+	* tree-ssa-structalias.c (find_func_aliases_for_builtin_call,
+	find_func_clobbers): Use unsigned to silence warning.
+
+2019-09-10  Julian Brown  <julian@codesourcery.com>
+
+	* config/gcn/mkoffload.c (process_asm): Remove omp_data_size,
+	gridified_kernel_p, kernel_dependencies_count, kernel_dependencies
+	from emitted hsa_kernel_description struct array.
+
+2019-09-10  Andrew Stubbs  <ams@codesourcery.com>
+
+	* config/gcn/gcn.c (gcn_hsa_declare_function_name): Calculate
+	granulated_sgprs according to architecture.
+
+2019-09-09  Andrew Stubbs  <ams@codesourcery.com>
+
+	* config/gcn/gcn-run.c (heap_region): New global variable.
+	(struct hsa_runtime_fn_info): Add hsa_memory_assign_agent_fn.
+	(init_hsa_runtime_functions): Initialize hsa_memory_assign_agent.
+	(get_kernarg_region): Move contents to ....
+	(get_memory_region): .... here.
+	(get_heap_region): New function.
+	(init_device): Initialize the heap_region.
+	(device_malloc): Add region parameter.
+	(struct kernargs): Move heap ....
+	(heap): ... to global scope.
+	(main): Allocate heap separate to kernargs.
+
+2019-09-06  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	2019-06-06  Andrew Stubbs  <ams@codesourcery.com>
+
+	* config.gcc (amdgcn-*-*): Allow --with-arch=gfx906.
+	* config/gcn/gcn.opt (gpu_type): Add gfx906.
+	* config/gcn/t-gcn-hsa (MULTILIB_OPTIONS): Add gfx906 multilib.
+	(MULTILIB_DIRNAMES): Rename gcn5 to gfx900.
+	Add gfx906.
+
+	2019-06-07  Andrew Stubbs  <ams@codesourcery.com>
+
+	* doc/invoke.texi (AMD GCN Options): Add gfx906.
+
+2019-09-06  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (gimplify_omp_for): Use for_stmt in call to
+	localize_reductions.
+
+2019-09-06  Julian Brown  <julian@codesourcery.com>
+
+	* config/nvptx/nvptx.c (omp-sese.h): Include.
+	(bb_pair_t, bb_pair_vec_t, pseudo_node_t, bracket, bracket_vec_t,
+	bb_sese, bb_sese::~bb_sese, bb_sese::append, bb_sese::remove,
+	BB_SET_SESE, BB_GET_SESE, nvptx_sese_number, nvptx_sese_pseudo,
+	nvptx_sese_color, nvptx_find_sese): Remove.
+	(nvptx_neuter_pars): Call omp_find_sese instead of nvptx_find_sese.
+	* omp-sese.c (omp-sese.h): Include.
+	(struct parallel): Rename to...
+	(struct parallel_g): This.
+	(parallel::parallel, parallel::~parallel): Rename to...
+	(parallel_g::parallel_g, parallel_g::~parallel_g): These.
+	(omp_sese_dump_pars, omp_sese_find_par, omp_sese_discover_pars,
+	populate_single_mode_bitmaps, find_ssa_names_to_propagate,
+	find_partitioned_var_uses, find_local_vars_to_propagate,
+	neuter_worker_single): Update for parallel_g name change.
+	(bb_pair_t, bb_pair_vec_t): Remove.
+	(omp_find_sese): Make global.
+	* omp-sese.h (bb_pair_t, bb_pair_vec_t): New.
+	(omp_find_sese): Add prototype.
+
+2019-09-06  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (gimplify_omp_workshare): Use OMP_CLAUSES, OMP_BODY
+	instead of OMP_TARGET_CLAUSES, OMP_TARGET_BODY.
+
+2019-09-05  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	2019-07-31  Andrew Stubbs  <ams@codesourcery.com>
+
+	gcc/
+	* config/gcn/gcn-valu.md
+	(scatter<mode>_insn_1offset<exec_scatter>): Remove s_waitcnt.
+	(scatter<mode>_insn_1offset_ds<exec_scatter>): Likewise.
+	(scatter<mode>_insn_2offsets<exec_scatter>): Likewise.
+	* config/gcn/gcn.c (gcn_md_reorg): Add delayeduse and reads to
+	struct ilist. Add nops for delayeduse insns.
+	* config/gcn/gcn.md (delayeduse): New attribute.
+	(*movbi): Remove s_waitcnt from stores.
+	(*mov<mode>_insn): Likewise.
+	(*movti_insn): Likewise. Add delayeduse attribute.
+	(sync_compare_and_swap<mode>_insn): Add delayeduse attribute.
+	(atomic_store<mode>): Remove or adjust s_waitcnt.
+
+	2019-09-05  Andrew Stubbs  <ams@codesourcery.com>
+
+	gcc/
+	* config/gcn/gcn.md (*movti_insn): Set delayeduse for global_store.
+	(sync_compare_and_swap<mode>_insn): Likewise.
+
+2019-09-05  Julian Brown  <julian@codesourcery.com>
+
+	* config/gcn/gcn.c (gcn_goacc_validate_dims): Remove
+	no-flag_worker-partitioning assertion.
+	(TARGET_GOACC_WORKER_PARTITIONING): Define target hook to true.
+	* config/gcn/gcn.opt (flag_worker_partitioning): Change default to 1.
+
+2019-09-05  Cesar Philippidis  <cesar@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (privatize_reduction): New struct.
+	(localize_reductions_r, localize_reductions): New functions.
+	(gimplify_omp_for): Call localize_reductions.
+	(gimplify_omp_workshare): Likewise.
+	* omp-low.c (lower_oacc_reductions): Handle localized reductions.
+	Create fewer temp vars.
+	* tree-core.h (omp_clause_code): Add OMP_CLAUSE_REDUCTION_PRIVATE_DECL
+	documentation.
+	* tree.c (omp_clause_num_ops): Bump number of ops for
+	OMP_CLAUSE_REDUCTION to 6.
+	(walk_tree_1): Adjust accordingly.
+	* tree.h (OMP_CLAUSE_REDUCTION_PRIVATE_DECL): Add macro.
+
+2019-09-05  Julian Brown  <julian@codesourcery.com>
+
+	* config/gcn/gcn-protos.h (gcn_goacc_adjust_propagation_record): Rename
+	prototype to...
+	(gcn_goacc_create_propagation_record): This.
+	* config/gcn/gcn-tree.c (gcn_goacc_adjust_propagation_record): Rename
+	function to...
+	(gcn_goacc_create_propagation_record): This.  Adjust comment.
+	* config/gcn/gcn.c (gcn_init_builtins): Override decls for
+        BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START,
+        BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER.
+	(gcn_fork_join): Remove inaccurate comment.
+	(TARGET_GOACC_ADJUST_PROPAGATION_RECORD): Rename to...
+	(TARGET_GOACC_CREATE_PROPAGATION_RECORD): This.
+
+2019-09-05  Julian Brown  <julian@codesourcery.com>
+
+	* Makefile.in (OBJS): Add omp-sese.o.
+	* omp-builtins.def (BUILT_IN_GOACC_BARRIER, BUILT_IN_GOACC_SINGLE_START,
+        BUILT_IN_GOACC_SINGLE_COPY_START, BUILT_IN_GOACC_SINGLE_COPY_END): New
+	builtins.
+	* omp-offload.c (omp-sese.h): Include header.
+	(oacc_loop_xform_head_tail): Call update_stmt for modified builtin
+	calls.
+	(oacc_loop_process): Likewise.
+	(default_goacc_create_propagation_record): New default implementation
+	for TARGET_GOACC_CREATE_PROPAGATION_RECORD hook.
+	(execute_oacc_loop_designation): New.  Split out of oacc_device_lower.
+	(execute_oacc_gimple_workers): New.  Likewise.
+	(execute_oacc_device_lower): Recreate dims array.
+	(pass_data_oacc_loop_designation, pass_data_oacc_gimple_workers): New.
+	(pass_oacc_loop_designation, pass_oacc_gimple_workers): New.
+	(make_pass_oacc_loop_designation, make_pass_oacc_gimple_workers): New.
+	* omp-offload.h (oacc_fn_attrib_level): Add prototype.
+	* omp-sese.c: New file.
+	* omp-sese.h: New file.
+	* passes.def (pass_oacc_loop_designation, pass_oacc_gimple_workers):
+	Add passes.
+	* target.def (worker_partitioning, create_propagation_record): Add
+	target hooks.
+	* targhooks.h (default_goacc_create_propagation_record): Add prototype.
+	* tree-pass.h (make_pass_oacc_loop_designation,
+	make_pass_oacc_gimple_workers): Add prototypes.
+	* doc/tm.texi.in (TARGET_GOACC_WORKER_PARTITIONING,
+	TARGET_GOACC_CREATE_PROPAGATION_RECORD): Add documentation hooks.
+	* doc/tm.texi: Regenerate.
+
+2019-09-05  Julian Brown  <julian@codesourcery.com>
+
+	* omp-offload.c (convert.h): Include.
+	(struct addr_expr_rewrite_info): Add struct.
+	(rewrite_addr_expr): New function.
+	(is_sync_builtin_call): New function.
+	(execute_oacc_device_lower): Support rewriting gang-private variables
+	using target hook, and fix up addr_expr nodes afterwards.
+	* target.def (adjust_gangprivate_decl): New target hook.
+	* doc/tm.texi.in (TARGET_GOACC_ADJUST_GANGPRIVATE_DECL): Document new
+	target hook.
+	* doc/tm.texi: Regenerate.
+
+2019-08-13  Julian Brown  <julian@codesourcery.com>
+
+	* omp-oacc-kernels.c (add_wait): New function, split out of...
+	(add_async_clauses_and_wait): ...here. Call new outlined function.
+	(decompose_kernels_region_body): Add wait at the end of
+	explicitly-asynchronous kernels regions.
+
+2019-08-08  Julian Brown  <julian@codesourcery.com>
+
+	* config/gcn/gcn.c (gcn_goacc_validate_dims): Ensure
+	flag_worker_partitioning is not set.
+	(TARGET_GOACC_WORKER_PARTITIONING): Remove target hook definition.
+	* config/gcn/gcn.opt (macc-experimental-workers): Default to off.
+
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+
+	* builtin-types.def (BT_FN_VOID_INT_INT_OMPFN_SIZE_PTR_PTR_PTR_VAR):
+	Remove.
+	* config/i386/i386.c (ix86_goacc_explode_args): New.
+	(TARGET_GOACC_EXPLODE_ARGS): Define, using above function.
+	* doc/tm.texi: Regenerated.
+	* doc/tm.texi.in: Add TARGET_GOACC_EXPLODE_ARGS hook.
+	* fortran/types.def (BT_FN_VOID_INT_INT_OMPFN_SIZE_PTR_PTR_PTR_VAR):
+	Remove.
+	* omp-builtins.def (GOACC_parallel_keyed_v2): Remove.
+	* omp-expand.c (expand_omp_target): Use explode_args target hook.
+	Use GOMP_LAUNCH_ARGS_EXPLODED launch tag.
+	* omp-low.c (build_receiver_ref, build_sender_ref,
+	create_omp_child_function, scan_omp_target, lower_omp_target): Use
+	explode_args target hook.
+	* target.def (explode_args): New target hook.
+	* tree-ssa-structalias.c (target.h): Include.
+	(find_func_aliases_for_builtin_call): Conditionalise disabling of pass
+	for OpenACC parallel regions based on explode_args target hook.  Remove
+	'params' from BUILT_IN_GOACC_PARALLEL arguments.
+	(find_func_clobbers): Likewise.
+	(ipa_pta_execute): Update for removed 'params' argument.
+
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+	    Andrew Stubbs  <ams@codesourcery.com>
+
+	* config.gcc (amdgcn-*-*): Add default option for gfx906.
+	* config/gcn/mkoffload.c: New.
+	* config/gcn/offload.h: New.
+
+2019-06-25  Kwok Cheung Yeung  <kcy@codesourcery.com>
+            Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	* config.gcc (thread_file): Set to gcn for AMD GCN.
+	* config/gcn/gcn.c (gcn_emutls_var_init): New function.
+	(TARGET_EMUTLS_VAR_INIT): New hook.
+
+2019-05-22  Kwok Cheung Yeung  <kcy@codesourcery.com>
+	    Andrew Stubbs  <amd@codesourcery.com>
+
+	Backport from mainline:
+
+	* config.gcc (gcc_cv_initfini_array): Set for AMD GCN.
+	* config/gcn/gcn-run.c (init_array_kernel, fini_array_kernel): New.
+	(kernel): Rename to...
+	(main_kernel): ... this.
+	(load_image): Load _init_array and _fini_array kernels.
+	(run): Add argument for kernel to run.
+	(main): Run init_array_kernel before main_kernel, and
+	fini_array_kernel after.
+	* config/gcn/gcn.c (gcn_handle_amdgpu_hsa_kernel_attribute): Allow
+	amdgpu_hsa_kernel attribute on functions.
+	(gcn_disable_constructors): Delete.
+	(TARGET_ASM_CONSTRUCTOR, TARGET_ASM_DESTRUCTOR): Delete.
+	* config/gcn/crt0.c (size_t): Define.
+	(_init_array, _fini_array): New.
+	(__preinit_array_start, __preinit_array_end,
+	__init_array_start, __init_array_end,
+	__fini_array_start, __fini_array_end): Declare weak references.
+
+2019-07-10  Cesar Philippidis  <cesar@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (gimplify_adjust_omp_clauses_1): Raise error for
+	assumed-size arrays in map clauses for Fortran/OpenMP.
+	* omp-low.c (lower_omp_target): Set the size of assumed-size Fortran
+	arrays to one to allow use of data already mapped on the offload device.
+
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (insert_struct_comp_map): Handle GOMP_MAP_ATTACH_DETACH.
+	(gimplify_scan_omp_clauses): Separate out handling of OACC_ENTER_DATA
+	and OACC_EXIT_DATA. Remove GOMP_MAP_POINTER and GOMP_MAP_TO_PSET
+	mappings, apart from those following GOMP_MAP_DECLARE_{,DE}ALLOCATE.
+	Handle GOMP_MAP_ATTACH_DETACH.
+	* tree-pretty-print.c (dump_omp_clause): Support GOMP_MAP_ATTACH_DETACH.
+	Print "bias" not "len" for attach/detach clause types.
+
+2019-05-28  Julian Brown  <julian@codesourcery.com>
+
+	* omp-low.c (mark_oacc_gangprivate): Add CTX parameter.  Use to look up
+	correct decl to add attribute to.
+	(lower_omp_for): Move "oacc gangprivate" processing from here...
+	(process_oacc_gangprivate_1): ...to here. New function.
+	(lower_omp_target): Update call to mark_oacc_gangprivate.
+	(execute_lower_omp): Call process_oacc_gangprivate_1 for each OMP
+	context.
+
+2019-05-30  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* tree-vrp.c (extract_range_from_unary_expr): Set a varying range
+	when a reference is converted to an integral type.
+
+2019-05-20  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (gimplify_adjust_omp_clauses_1): Support implied no_alloc
+	and optional arguments based on mappings in enclosing data regions.
+
+2019-05-19  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (oacc_array_mapping_info): Add REF field.
+	(gimplify_scan_omp_clauses): Initialise above field for data blocks
+	passed by reference.
+	(gomp_oacc_needs_data_present): Handle references.
+	(gimplify_adjust_omp_clauses_1): Handle references and optional
+	arguments for variables declared in lexically-enclosing OpenACC data
+	region.
+
+2019-05-16  Julian Brown  <julian@codesourcery.com>
+
+	* omp-oacc-kernels.c (find_omp_for_index_vars_1,
+	find_omp_for_index_vars): New functions.
+	(maybe_build_inner_data_region): Add IDX_VARS argument. Don't add
+	CREATE mapping clauses for loop index variables.  Set TREE_ADDRESSABLE
+	flag on newly-mapped declarations as a side effect.
+	(decompose_kernels_region_body): Call find_omp_for_index_vars.  Don't
+	create PRESENT clause for loop index variables.  Pass index variable
+	set to maybe_build_inner_data_region.
+
+2019-05-03  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* gimplify.c (gomp_oacc_needs_data_present): Return NULL if decl is a
+	Fortran optional argument.
+
+2019-02-01  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* omp-oacc-kernels.c (struct adjust_nested_loop_clauses_wi_info): New.
+	(adjust_nested_loop_clauses, transform_kernels_loop_clauses): Use it.
+
+2019-01-23  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* doc/invoke.texi (-fopenacc-kernels): Update.
+
+2019-01-24  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* omp-oacc-kernels.c (adjust_region_code_walk_stmt_fn)
+	(adjust_region_code): New functions.
+	(make_loops_gang_single): Update.
+	(make_gang_single_region): Rename to...
+	(make_region_seq): ... this, and update.
+	(make_gang_parallel_loop_region): Rename to...
+	(make_region_loop_nest): ... this, and update.
+	(is_unconditional_oacc_for_loop): Remove stmt parameter and check.
+	(decompose_kernels_region_body): Update.
+
+2019-01-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* omp-oacc-kernels.c (add_async_clauses_and_wait): New function...
+	(decompose_kernels_region_body): ... called from here.
+
+2019-01-23  Gergö Barany  <gergo@codesourcery.com>
+
+	* omp-oacc-kernels.c (add_parent_or_loop_num_clause): New function.
+	(adjust_nested_loop_clauses): Likewise.
+	(transform_kernels_loop_clauses, make_gang_parallel_loop_region):
+	Add worker and vector clause parameters, emit error on illegal
+	nesting.
+	(visit_loops_in_gang_single_region): Emit warning on conditionally
+	executed code with a gang clause.
+	(make_loops_gang_single): New function.
+	(decompose_kernels_region_body): Separate out gang/worker/vector clauses
+	for separate handling; add call to make_loops_gang_single.
+	* omp-offload.c (oacc_loop_auto_partitions): Add and propagate
+	is_oacc_gang_single parameter.
+	(oacc_loop_partition): Likewise.
+	(execute_oacc_device_lower): Adjust call to oacc_loop_partition.
+
+2019-01-23  Gergö Barany  <gergo@codesourcery.com>
+
+	* omp-oacc-kernels.c (control_flow_regions): New class.
+	(control_flow_regions::control_flow_regions): New constructor.
+	(control_flow_regions::is_unconditional_oacc_for_loop): New method.
+	(control_flow_regions::find_rep): Likewise.
+	(control_flow_regions::union_reps): Likewise.
+	(control_flow_regions::compute_regions): Likewise.
+	(decompose_kernels_region_body): Use test for conditional execution.
+
+2019-01-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* omp-oacc-kernels.c (top_level_omp_for_in_stmt): New function.
+	(make_gang_single_region): Likewise.
+	(transform_kernels_loop_clauses, make_gang_parallel_loop_region):
+	Likewise.
+	(flatten_binds): Likewise.
+	(make_data_region_try_statement): Likewise.
+	(maybe_build_inner_data_region): Likewise.
+	(decompose_kernels_region_body): Likewise.
+	(transform_kernels_region): Delegate to decompose_kernels_region_body
+	and make_data_region_try_statement.
+
+2019-01-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* Makefile.in: Add...
+	* omp-oacc-kernels.c: ... this new file for the kernels conversion
+	pass.
+	* flag-types.h (enum openacc_kernels): Add "split" style.  Adjust
+	all users.
+	* doc/invoke.texi (-fopenacc-kernels): Update.
+	* passes.def: Add pass_convert_oacc_kernels to pipeline.
+	* tree-pass.h (make_pass_convert_oacc_kernels): Add declaration.
+
+2019-01-23  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* gimple.h (enum gf_mask): Add new target kinds
+	GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED,
+	GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE, and
+	GF_OMP_TARGET_KIND_OACC_DATA_KERNELS.
+	(is_gimple_omp_oacc): Handle new target kinds.
+	(is_gimple_omp_offloaded): Likewise.
+	* gimple-pretty-print.c (dump_gimple_omp_target): Likewise.
+	* omp-expand.c (expand_omp_target): Likewise.
+	(build_omp_regions_1): Likewise.
+	(omp_make_gimple_edges): Likewise.
+	* omp-low.c (is_oacc_parallel_or_serial): Likewise.
+	(was_originally_oacc_kernels): New function.
+	(scan_omp_for): Update check for illegal nesting.
+	(check_omp_nesting_restrictions): Handle new target kinds.
+	(lower_oacc_reductions): Likewise.
+	(lower_omp_target): Likewise.
+	* omp-offload.c (execute_oacc_device_lower): Likewise.
+
+2019-01-30  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* flag-types.h (enum openacc_kernels): New type.
+
+2018-12-19  Julian Brown  <julian@codesourcery.com>
+            Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* omp-low.c (lower_omp_target): Support GOMP_MAP_NO_ALLOC.
+	* tree-pretty-print.c (dump_omp_clause): Likewise.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* gimple.h (gf_mask): Add GF_OMP_TARGET_KIND_OACC_SERIAL
+	enumeration constant.
+	(is_gimple_omp_oacc): Handle GF_OMP_TARGET_KIND_OACC_SERIAL.
+	(is_gimple_omp_offloaded): Likewise.
+	* gimplify.c (omp_region_type): Add ORT_ACC_SERIAL enumeration
+	constant.  Adjust the value of ORT_NONE accordingly.
+	(is_gimple_stmt): Handle OACC_SERIAL.
+	(oacc_default_clause): Handle ORT_ACC_SERIAL.
+	(gomp_needs_data_present): Likewise.
+	(gimplify_adjust_omp_clauses): Likewise.
+	(gimplify_omp_workshare): Handle OACC_SERIAL.
+	(gimplify_expr): Likewise.
+	* omp-expand.c (expand_omp_target): Handle
+	GF_OMP_TARGET_KIND_OACC_SERIAL.
+	(build_omp_regions_1, omp_make_gimple_edges): Likewise.
+	* omp-low.c (is_oacc_parallel): Rename function to...
+	(is_oacc_parallel_or_serial): ... this.  Handle
+	GF_OMP_TARGET_KIND_OACC_SERIAL.
+	(build_receiver_ref): Adjust accordingly.
+	(build_sender_ref): Likewise.
+	(scan_sharing_clauses): Likewise.
+	(create_omp_child_function): Likewise.
+	(scan_omp_for): Likewise.
+	(scan_omp_target): Likewise.
+	(lower_oacc_head_mark): Likewise.
+	(convert_from_firstprivate_int): Likewise.
+	(lower_omp_target): Likewise.
+	(check_omp_nesting_restrictions): Handle
+	GF_OMP_TARGET_KIND_OACC_SERIAL.
+	(lower_oacc_reductions): Likewise.
+	(lower_omp_target): Likewise.
+	* tree-pretty-print.c (dump_generic_node): Handle OACC_SERIAL.
+	* tree.def (OACC_SERIAL): New tree code.
+
+	* doc/generic.texi (OpenACC): Document OACC_SERIAL.
+
+2017-12-21  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* omp-low.c (install_parm_decl): Don't extract identifiers from
+	artifical decls.
+
+2018-12-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* omp-expand.c (expand_omp_target): Handle if_present flag on
+	OpenACC host_data construct.
+
+2018-12-20  Gergö Barany  <gergo@codesourcery.com>
+
+	* omp-low.c (struct omp_context): New fields
+	local_reduction_clauses, outer_reduction_clauses.
+	(new_omp_context): Initialize these.
+	(scan_sharing_clauses): Record reduction clauses on OpenACC
+	constructs.
+	(scan_omp_for): Check reduction clauses for incorrect nesting.
+
+2019-01-31  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* omp-low.c (lower_omp_target): For use_device clauses, generate
+	conditional statements to treat Fortran optional arguments like
+	references if non-null, or propogate null arguments into offloaded
+	code otherwise.
+
+2019-01-31  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* omp-general.c (omp_is_optional_argument): Add comment.  Add extra
+	check for Fortran language.
+
+2019-01-30  Kwok Cheung Yeung <kcy@codesourcery.com>
+
+	* omp-general.c (omp_is_optional_argument): New.
+	* omp-general.h (omp_is_optional_argument): New.
+	* omp-low.c (lower_omp_target): Use size of referenced object when
+	optional argument used as argument to firstprivate.
+
+2018-12-11  Julian Brown  <julian@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* config/nvptx/nvptx.c (tree-hash-traits.h): Include.
+	(gangprivate_shared_size): New global variable.
+	(gangprivate_shared_align): Likewise.
+	(gangprivate_shared_sym): Likewise.
+	(gangprivate_shared_hmap): Likewise.
+	(nvptx_option_override): Initialize gangprivate_shared_sym,
+	gangprivate_shared_align.
+	(nvptx_file_end): Output gangprivate_shared_sym.
+	(nvptx_goacc_expand_accel_var): New function.
+	(nvptx_set_current_function): New function.
+	(TARGET_SET_CURRENT_FUNCTION): Define hook.
+	(TARGET_GOACC_EXPAND_ACCEL): Likewise.
+	* doc/tm.texi (TARGET_GOACC_EXPAND_ACCEL_VAR): Document new hook.
+	* doc/tm.texi.in (TARGET_GOACC_EXPAND_ACCEL_VAR): Likewise.
+	* expr.c (expand_expr_real_1): Remap decls marked with the
+	"oacc gangprivate" attribute.
+	* omp-low.c (omp_context): Add oacc_partitioning_level and
+	oacc_addressable_var_decls fields.
+	(new_omp_context): Initialize oacc_addressable_var_decls in new
+	omp_context.
+	(delete_omp_context): Delete oacc_addressable_var_decls in old
+	omp_context.
+	(lower_oacc_head_tail): Record partitioning-level count in omp context.
+	(oacc_record_private_var_clauses, oacc_record_vars_in_bind)
+	(mark_oacc_gangprivate): New functions.
+	(lower_omp_for): Call oacc_record_private_var_clauses with "for"
+	clauses.  Call mark_oacc_gangprivate for gang-partitioned loops.
+	(lower_omp_target): Call oacc_record_private_var_clauses with "target"
+	clauses.
+	Call mark_oacc_gangprivate for offloaded target regions.
+	(lower_omp_1): Call vars_in_bind for GIMPLE_BIND within OMP regions.
+	* target.def (expand_accel_var): New hook.
+
+2018-09-05  Cesar Philippidis  <cesar@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* gimplify.c (omp_add_variable): Enable firstprivate reduction
+	variables.
+
+2018-09-20  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* omp-low.c (lower_oacc_head_mark): Don't mark OpenACC auto
+	loops as independent inside acc parallel regions.
+
+2018-10-04  Cesar Philippidis  <cesar@codesourcery.com>
+            Julian Brown  <julian@codesourcery.com>
+
+	* omp-low.c (scan_sharing_clauses): Update handling of OpenACC declare
+	create, declare copyin and declare deviceptr to have local lifetimes.
+	(convert_to_firstprivate_int): Handle pointer types.
+	(convert_from_firstprivate_int): Likewise.  Create local storage for
+	the values being pointed to.  Add new orig_type argument.
+	(lower_omp_target): Handle GOMP_MAP_DECLARE_{ALLOCATE,DEALLOCATE}.
+	Add orig_type argument to convert_from_firstprivate_int call.
+	Allow pointer types with GOMP_MAP_FIRSTPRIVATE_INT.  Don't privatize
+	firstprivate VLAs.
+	* tree-pretty-print.c (dump_omp_clause): Handle
+	GOMP_MAP_DECLARE_{ALLOCATE,DEALLOCATE}.
+
+2018-12-22  Cesar Philippidis  <cesar@codesourcery.com>
+            Julian Brown  <julian@codesourcery.com>
+
+	* omp-low.c (maybe_lookup_field_in_outer_ctx): New function.
+	(convert_to_firstprivate_int): New function.
+	(convert_from_firstprivate_int): New function.
+	(lower_omp_target): Enable GOMP_MAP_FIRSTPRIVATE_INT in OpenACC.
+
+2018-08-28  Julian Brown  <julian@codesourcery.com>
+            Cesar Philippidis  <cesar@codesourcery.com>
+
+	* gimplify.c (oacc_array_mapping_info): New struct.
+	(gimplify_omp_ctx): Add decl_data_clause hash map.
+	(new_omp_context): Zero-initialise above.
+	(delete_omp_context): Delete above if allocated.
+	(gimplify_scan_omp_clauses): Scan for array mappings on data constructs,
+	and record in above map.
+	(gomp_oacc_needs_data_present): New function.
+	(gimplify_adjust_omp_clauses_1): Handle data mappings (e.g. array
+	slices) declared in lexically-enclosing data constructs.
+	* omp-low.c (lower_omp_target): Allow decl for bias not to be present
+	in OpenACC context.
+
+2018-10-05  Nathan Sidwell  <nathan@acm.org>
+	    Tom de Vries  <tdevries@suse.de>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+
+	* doc/invoke.texi (fopenacc-dim): Update.
+	* omp-offload.c (oacc_parse_default_dims): Update.
+	(oacc_validate_dims): Emit warnings about strange partitioning choices.
+
+2019-09-20  Chung-Lin Tang <cltang@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* omp-expand.c (struct omp_region): Add inside_kernels_p field.
+	(expand_omp_for_generic): Adjust to generate a 'sequential' loop
+	when GOMP builtin arguments are BUILT_IN_NONE.
+	(expand_omp_for): Use expand_omp_for_generic to generate a
+	non-parallelized loop for OMP_FORs inside OpenACC kernels regions.
+	(expand_omp): Mark inside_kernels_p field true for regions
+	nested inside OpenACC kernels constructs.
+
+2018-09-20  Cesar Philippidis  <cesar@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+
+	* omp-low.c (install_var_field): New base_pointer_restrict
+	argument.
+	(scan_sharing_clauses): Update call to install_var_field.
+	(omp_target_base_pointers_restrict_p): New function.
+	(scan_omp_target): Update call to install_var_field.
+
+2018-10-30  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* config/nvptx/nvptx.c (nvptx_propagate_unified): New.
+	(nvptx_split_blocks): Call it for cond_uni insn.
+	(nvptx_expand_cond_uni): New.
+	(enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
+	(nvptx_init_builtins): Initialize it.
+	(nvptx_expand_builtin):
+	(nvptx_generate_vector_shuffle): Change integral SHIFT operand to
+	tree BITS operand.
+	(nvptx_vector_reduction): New.
+	(nvptx_adjust_reduction_type): New.
+	(nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
+	(nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
+	(nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
+	Use it to adjust the type of ref_to_res.
+	(nvptx_goacc_reduction_teardown):
+	* config/nvptx/nvptx.md (cond_uni): New pattern.
+
+2018-06-29  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+
+	* gimplify.c (enum gimplify_omp_var_data): Add GOVD_DEVICETPR.
+	(oacc_default_clause): Privatize fortran common blocks.
+	(omp_notice_variable): Add GOVD_DEVICEPTR attribute when appropriate.
+	Defer the expansion of DECL_VALUE_EXPR for common block decls.
+	(gimplify_scan_omp_clauses): Add GOVD_DEVICEPTR attribute when
+	appropriate.
+	(gimplify_adjust_omp_clauses_1): Set GOMP_MAP_FORCE_DEVICEPTR for
+	implicit deviceptr mappings.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* omp-general.c (oacc_build_routine_dims): Move some of its
+	processing into...
+	(oacc_verify_routine_clauses): ... this new function.
+	* omp-general.h (oacc_verify_routine_clauses): New prototype.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* tree-core.h (omp_clause_code): Add OMP_CLAUSE_NOHOST.
+	* tree.c (omp_clause_num_ops, omp_clause_code_name, walk_tree_1):
+	Update for these.
+	* tree-pretty-print.c (dump_omp_clause): Handle	OMP_CLAUSE_NOHOST.
+	* gimplify.c (gimplify_scan_omp_clauses)
+	(gimplify_adjust_omp_clauses): Handle OMP_CLAUSE_NOHOST.
+	* tree-nested.c (convert_nonlocal_omp_clauses)
+	(convert_local_omp_clauses): Likewise.
+	* omp-low.c (scan_sharing_clauses): Likewise.
+	* omp-offload.c (maybe_discard_oacc_function): New function.
+	(execute_oacc_device_lower) [!ACCEL_COMPILER]: Handle OpenACC
+	nohost clauses.
+
+2019-02-05  Julian Brown  <julian@codesourcery.com>
+
+	* omp-low.c (scan_sharing_clauses): Disallow dynamic (multidimensional)
+	arrays within structs.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* tree-pretty-print.c (dump_omp_clauses): Add cases for printing
+	GOMP_MAP_DYNAMIC_ARRAY map kinds.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* omp-low.c (dynamic_array_lookup): New function.
+	(dynamic_array_reference_start): Likewise.
+	(scan_for_op): Likewise.
+	(scan_for_reference): Likewise.
+	(da_create_bias): Likewise.
+	(da_dimension_peel): Likewise.
+	(lower_omp_1): Add case to look for start of dynamic array reference,
+	and handle bias adjustments for the code sequence.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* omp-low.c (struct omp_context):
+	Add 'hash_map<tree_operand_hash, tree> *dynamic_arrays' field, also
+	added include of "tree-hash-traits.h".
+	(append_field_to_record_type): New function.
+	(create_dynamic_array_descr_type): Likewise.
+	(create_dynamic_array_descr_init_code): Likewise.
+	(new_omp_context): Add initialize of dynamic_arrays field.
+	(delete_omp_context): Add delete of dynamic_arrays field.
+	(scan_sharing_clauses): For dynamic array map kinds, check for
+	supported dimension structure, and install dynamic array variable into
+	current omp_context.
+	(lower_omp_target): Add handling for dynamic array map kinds.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* gimplify.c (gimplify_scan_omp_clauses): For dynamic array map kinds,
+	make sure bias in each dimension are put into firstprivate variables.
+
+2019-01-31  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (gimplify_scan_omp_clauses): Handle array sections on
+	dereferenced struct members.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (gimplify_omp_var_data): Add GOVD_MAP_HAS_ATTACHMENTS.
+	(insert_struct_component_mapping): Support derived-type member mappings
+	for arrays with descriptors which use GOMP_MAP_TO_PSET.
+	(gimplify_scan_omp_clauses): Rewrite GOMP_MAP_ALWAYS_POINTER to
+	GOMP_MAP_ATTACH for OpenACC struct/derived-type component pointers.
+	Handle pointer mappings that use GOMP_MAP_TO_PSET.  Handle attach/detach
+	clauses.
+	(gimplify_adjust_omp_clauses_1): Skip adjustments for explicit
+	attach/detach clauses.
+	(gimplify_omp_target_update): Handle finalize for detach.
+	* omp-low.c (lower_omp_target): Support GOMP_MAP_ATTACH,
+	GOMP_MAP_DETACH, GOMP_MAP_FORCE_DETACH.
+	* tree-pretty-print.c (dump_omp_clause): Likewise.
+
+2018-11-10  Julian Brown  <julian@codesourcery.com>
+
+	* gimplify.c (insert_struct_comp_map, check_base_and_compare_lt): New.
+	(gimplify_scan_omp_clauses): Outline duplicated code into calls to
+	above two functions.
+
+2015-08-20  Thomas Schwinge  <thomas@codesourcery.com>
+	    Joseph Myers  <joseph@codesourcery.com>
+
+	PR libgomp/81886
+	* doc/invoke.texi (-ffixed-@var{reg}): Document conflict with
+	Fortran options.
+	* gcc.c (add_omp_infile_spec_func, spec_lang_mask_accept): New.
+	(driver_self_specs): Add spec to use %:add-omp-infile().
+	(static_spec_functions): Add add-omp-infile.
+	(struct switchstr): Add lang_mask field.  Expand comment.
+	(struct infile): Add lang_mask field.
+	(add_infile, save_switch, do_spec): Add lang_mask argument.
+	(driver_unknown_option_callback, driver_wrong_lang_callback)
+	(driver_handle_option, process_command, do_self_spec)
+	(driver::do_spec_on_infiles, driver::maybe_run_linker): All
+	callers changed.
+	(give_switch): Check languages of switch against
+	spec_lang_mask_accept.
+	(driver::maybe_putenv_OFFLOAD_TARGETS): Don't free
+	offload_targets.
+	* gcc.h (do_spec): Update prototype.
+
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index abae872..779eadc 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1430,7 +1430,9 @@
 	omp-expand.o \
 	omp-general.o \
 	omp-grid.o \
+	omp-sese.o \
 	omp-low.o \
+	omp-oacc-kernels.o \
 	omp-simd-clone.o \
 	opt-problem.o \
 	optabs.o \
@@ -2559,6 +2561,7 @@
   $(srcdir)/omp-offload.c \
   $(srcdir)/omp-expand.c \
   $(srcdir)/omp-low.c \
+  $(srcdir)/omp-oacc-kernels.c \
   $(srcdir)/targhooks.c $(out_file) $(srcdir)/passes.c $(srcdir)/cgraphunit.c \
   $(srcdir)/cgraphclones.c \
   $(srcdir)/tree-phinodes.c \
diff --git a/gcc/c-family/ChangeLog.omp b/gcc/c-family/ChangeLog.omp
new file mode 100644
index 0000000..d27a965
--- /dev/null
+++ b/gcc/c-family/ChangeLog.omp
@@ -0,0 +1,38 @@
+2019-01-09  Julian Brown  <julian@codesourcery.com>
+
+	* c-cppbuiltin.c (c_cpp_builtins): Update _OPENACC define to 201711.
+
+2019-01-23  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* c.opt (fopenacc-kernels): Default to "split".
+
+2019-01-30  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* c.opt (fopenacc-kernels): New flag.
+
+2018-12-19  Julian Brown  <julian@codesourcery.com>
+            Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* c-pragma.h (pragma_omp_clause): Add
+	PRAGMA_OACC_CLAUSE_NO_CREATE.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* c-pragma.h (pragma_kind): Add PRAGMA_OACC_SERIAL enumeration
+	constant.
+	* c-pragma.c (oacc_pragmas): Add "serial" entry.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-attribs.c (c_common_attribute_table): Set min_len to -1 for
+	"omp declare target".
+	* c-pragma.h (pragma_omp_clause): Add PRAGMA_OACC_CLAUSE_NOHST.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* c-pragma.h (pragma_omp_clause): Add PRAGMA_OACC_CLAUSE_ATTACH and
+	PRAGMA_OACC_CLAUSE_DETACH.
+	* c-common.h (c_omp_map_clause_name): Add prototype.
+	* c-omp.c (c_omp_map_clause_name): New function.
+
diff --git a/gcc/c-family/c-attribs.c b/gcc/c-family/c-attribs.c
index a19e4ca..b2b510a 100644
--- a/gcc/c-family/c-attribs.c
+++ b/gcc/c-family/c-attribs.c
@@ -437,7 +437,7 @@
 			      handle_omp_declare_simd_attribute, NULL },
   { "simd",		      0, 1, true,  false, false, false,
 			      handle_simd_attribute, NULL },
-  { "omp declare target",     0, 0, true, false, false, false,
+  { "omp declare target",     0, -1, true, false, false, false,
 			      handle_omp_declare_target_attribute, NULL },
   { "omp declare target link", 0, 0, true, false, false, false,
 			      handle_omp_declare_target_attribute, NULL },
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index 6837642..a6c13d0 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -1187,6 +1187,7 @@
 extern void c_omp_declare_simd_clauses_to_decls (tree, tree);
 extern bool c_omp_predefined_variable (tree);
 extern enum omp_clause_default_kind c_omp_predetermined_sharing (tree);
+extern const char *c_omp_map_clause_name (tree, bool);
 
 /* Return next tree in the chain for chain_next walking of tree nodes.  */
 static inline tree
diff --git a/gcc/c-family/c-cppbuiltin.c b/gcc/c-family/c-cppbuiltin.c
index d389f8c..674e5e6 100644
--- a/gcc/c-family/c-cppbuiltin.c
+++ b/gcc/c-family/c-cppbuiltin.c
@@ -1407,7 +1407,7 @@
     cpp_define (pfile, "__SSP__=1");
 
   if (flag_openacc)
-    cpp_define (pfile, "_OPENACC=201306");
+    cpp_define (pfile, "_OPENACC=201711");
 
   if (flag_openmp)
     cpp_define (pfile, "_OPENMP=201511");
diff --git a/gcc/c-family/c-omp.c b/gcc/c-family/c-omp.c
index ebe0b4e..d3126e3 100644
--- a/gcc/c-family/c-omp.c
+++ b/gcc/c-family/c-omp.c
@@ -2060,3 +2060,36 @@
 
   return OMP_CLAUSE_DEFAULT_UNSPECIFIED;
 }
+
+/* For OpenACC, the OMP_CLAUSE_MAP_KIND of an OMP_CLAUSE_MAP is used internally
+   to distinguish clauses as seen by the user.  Return the "friendly" clause
+   name for error messages etc., where possible.  See also
+   c/c-parser.c:c_parser_oacc_data_clause and
+   cp/parser.c:cp_parser_oacc_data_clause.  */
+
+const char *
+c_omp_map_clause_name (tree clause, bool oacc)
+{
+  if (oacc && OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_MAP)
+    switch (OMP_CLAUSE_MAP_KIND (clause))
+    {
+    case GOMP_MAP_FORCE_ALLOC:
+    case GOMP_MAP_ALLOC: return "create";
+    case GOMP_MAP_FORCE_TO:
+    case GOMP_MAP_TO: return "copyin";
+    case GOMP_MAP_FORCE_FROM:
+    case GOMP_MAP_FROM: return "copyout";
+    case GOMP_MAP_FORCE_TOFROM:
+    case GOMP_MAP_TOFROM: return "copy";
+    case GOMP_MAP_RELEASE: return "delete";
+    case GOMP_MAP_FORCE_PRESENT: return "present";
+    case GOMP_MAP_ATTACH: return "attach";
+    case GOMP_MAP_FORCE_DETACH:
+    case GOMP_MAP_DETACH: return "detach";
+    case GOMP_MAP_DEVICE_RESIDENT: return "device_resident";
+    case GOMP_MAP_LINK: return "link";
+    case GOMP_MAP_FORCE_DEVICEPTR: return "deviceptr";
+    default: break;
+    }
+  return omp_clause_code_name[OMP_CLAUSE_CODE (clause)];
+}
diff --git a/gcc/c-family/c-pragma.c b/gcc/c-family/c-pragma.c
index fbc734a..c20a8aa 100644
--- a/gcc/c-family/c-pragma.c
+++ b/gcc/c-family/c-pragma.c
@@ -1277,6 +1277,7 @@
   { "loop", PRAGMA_OACC_LOOP },
   { "parallel", PRAGMA_OACC_PARALLEL },
   { "routine", PRAGMA_OACC_ROUTINE },
+  { "serial", PRAGMA_OACC_SERIAL },
   { "update", PRAGMA_OACC_UPDATE },
   { "wait", PRAGMA_OACC_WAIT }
 };
diff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
index 2bbb279..a6bae05 100644
--- a/gcc/c-family/c-pragma.h
+++ b/gcc/c-family/c-pragma.h
@@ -38,6 +38,7 @@
   PRAGMA_OACC_LOOP,
   PRAGMA_OACC_PARALLEL,
   PRAGMA_OACC_ROUTINE,
+  PRAGMA_OACC_SERIAL,
   PRAGMA_OACC_UPDATE,
   PRAGMA_OACC_WAIT,
 
@@ -136,17 +137,21 @@
 
   /* Clauses for OpenACC.  */
   PRAGMA_OACC_CLAUSE_ASYNC,
+  PRAGMA_OACC_CLAUSE_ATTACH,
   PRAGMA_OACC_CLAUSE_AUTO,
   PRAGMA_OACC_CLAUSE_COPY,
   PRAGMA_OACC_CLAUSE_COPYOUT,
   PRAGMA_OACC_CLAUSE_CREATE,
   PRAGMA_OACC_CLAUSE_DELETE,
+  PRAGMA_OACC_CLAUSE_DETACH,
   PRAGMA_OACC_CLAUSE_DEVICEPTR,
   PRAGMA_OACC_CLAUSE_DEVICE_RESIDENT,
   PRAGMA_OACC_CLAUSE_FINALIZE,
   PRAGMA_OACC_CLAUSE_GANG,
   PRAGMA_OACC_CLAUSE_HOST,
   PRAGMA_OACC_CLAUSE_INDEPENDENT,
+  PRAGMA_OACC_CLAUSE_NO_CREATE,
+  PRAGMA_OACC_CLAUSE_NOHOST,
   PRAGMA_OACC_CLAUSE_NUM_GANGS,
   PRAGMA_OACC_CLAUSE_NUM_WORKERS,
   PRAGMA_OACC_CLAUSE_PRESENT,
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index 916cc67..3f0c0aa 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -1668,6 +1668,19 @@
 C ObjC C++ ObjC++ LTO Joined Var(flag_openacc_dims)
 Specify default OpenACC compute dimensions.
 
+fopenacc-kernels=
+C ObjC C++ ObjC++ RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_SPLIT)
+-fopenacc-kernels=[split|parloops]	Configure OpenACC 'kernels' constructs handling.
+
+Enum
+Name(openacc_kernels) Type(enum openacc_kernels)
+
+EnumValue
+Enum(openacc_kernels) String(split) Value(OPENACC_KERNELS_SPLIT)
+
+EnumValue
+Enum(openacc_kernels) String(parloops) Value(OPENACC_KERNELS_PARLOOPS)
+
 fopenmp
 C ObjC C++ ObjC++ LTO Var(flag_openmp)
 Enable OpenMP (implies -frecursive in Fortran).
diff --git a/gcc/c/ChangeLog.omp b/gcc/c/ChangeLog.omp
new file mode 100644
index 0000000..e5ac8c4
--- /dev/null
+++ b/gcc/c/ChangeLog.omp
@@ -0,0 +1,96 @@
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* c-typeck.c (handle_omp_array_sections): Use GOMP_MAP_ATTACH_DETACH
+	for OpenACC attach/detach operations.
+
+2018-12-19  Julian Brown  <julian@codesourcery.com>
+            Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* c-parser.c (c_parser_omp_clause_name): Support no_create.
+	(c_parser_oacc_data_clause): Likewise.
+	(c_parser_oacc_all_clauses): Likewise.
+	(OACC_DATA_CLAUSE_MASK, OACC_KERNELS_CLAUSE_MASK)
+	(OACC_PARALLEL_CLAUSE_MASK, OACC_SERIAL_CLAUSE_MASK): Add
+	PRAGMA_OACC_CLAUSE_NO_CREATE.
+	* c-typeck.c (handle_omp_array_sections): Support
+	GOMP_MAP_NO_ALLOC.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* c-parser.c (OACC_SERIAL_CLAUSE_MASK): New macro.
+	(OACC_SERIAL_CLAUSE_DEVICE_TYPE_MASK): Likewise.
+	(c_parser_oacc_kernels_parallel): Rename function to...
+	(c_parser_oacc_compute): ... this.  Handle PRAGMA_OACC_SERIAL.
+	(c_parser_omp_construct): Update accordingly.
+
+2018-12-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* c-parser.c (OACC_HOST_DATA_CLAUSE_MASK): Add PRAGMA_OACC_CLAUSE_IF
+	and PRAGMA_OACC_CLAUSE_IF_PRESENT.
+
+2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
+            Nathan Sidwell  <nathan@acm.org>
+            Julian Brown  <julian@codesourcery.com>
+
+        * c-parser.c (c_parser_omp_variable_list): New c_omp_region_type
+        argument.  Use it to specialize handling of OMP_CLAUSE_REDUCTION for
+        OpenACC.
+	(c_parser_oacc_data_clause): Add region-type argument.
+	(c_parser_oacc_data_clause_deviceptr): Likewise.
+        (c_parser_omp_clause_reduction): Change is_omp boolean parameter to
+        c_omp_region_type.  Update call to c_parser_omp_variable_list.
+        (c_parser_oacc_all_clauses): Update calls to
+        c_parser_omp_clause_reduction.
+        (c_parser_omp_all_clauses): Likewise.
+	(c_parser_oacc_cache): Update call to c_parser_omp_var_list_parens.
+        * c-typeck.c (c_finish_omp_clauses): Emit an error on orphan OpenACC
+        gang reductions.  Suppress user-defined reduction error for OpenACC.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-parser.c (c_parser_oacc_routine): Normalize order of clauses.
+	(c_finish_oacc_routine): Call oacc_verify_routine_clauses.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-parser.c (c_parser_omp_clause_name): Handle "nohost".
+	(c_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_NOHOST.
+	(c_parser_oacc_routine, c_finish_oacc_routine): Update.
+	* c-typeck.c (c_finish_omp_clauses): Handle OMP_CLAUSE_NOHOST.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* c-typeck.c (handle_omp_array_sections_1): Add 'bool &non_contiguous'
+	parameter, adjust recursive call site, add cases for allowing
+	pointer based multi-dimensional arrays for OpenACC.
+	(handle_omp_array_sections): Adjust handle_omp_array_sections_1 call,
+	handle non-contiguous case to create dynamic array map.
+
+2019-01-31  Julian Brown  <julian@codesourcery.com>
+
+	* c-typeck.c (handle_omp_array_sections_1): Handle chained dereferences.
+	(c_finish_omp_clauses): Likewise.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* c-parser.c (c_parser_omp_clause_name): Add parsing of attach and
+	detach clauses.
+	(c_parser_omp_variable_list): Add ALLOW_DEREF optional parameter.
+	Allow deref (->) in variable lists if true.
+	(c_parser_omp_var_list_parens): Add ALLOW_DEREF optional parameter.
+	Pass to c_parser_omp_variable_list.
+	(c_parser_oacc_data_clause): Support attach and detach clauses.  Update
+	call to c_parser_omp_variable_list.
+	(c_parser_oacc_all_clauses): Support attach and detach clauses.
+	(OACC_DATA_CLAUSE_MASK, OACC_ENTER_DATA_CLAUSE_MASK)
+	(OACC_KERNELS_CLAUSE_MASK, OACC_PARALLEL_CLAUSE_MASK): Add
+	PRAGMA_OACC_CLAUSE_ATTACH.
+	(OACC_EXIT_DATA_CLAUSE_MASK): Add PRAGMA_OACC_CLAUSE_DETACH.
+	* c-typeck.c (handle_omp_array_sections_1): Reject subarrays for attach
+	and detach.  Support deref.
+	(c_oacc_check_attachments): New function.
+	(c_finish_omp_clauses): Check attach/detach arguments for being
+	pointers using above.  Support deref.
+
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index aafe8d1..523381d 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -11688,6 +11688,8 @@
 	    result = PRAGMA_OMP_CLAUSE_ALIGNED;
 	  else if (!strcmp ("async", p))
 	    result = PRAGMA_OACC_CLAUSE_ASYNC;
+	  else if (!strcmp ("attach", p))
+	    result = PRAGMA_OACC_CLAUSE_ATTACH;
 	  break;
 	case 'c':
 	  if (!strcmp ("collapse", p))
@@ -11710,6 +11712,8 @@
 	    result = PRAGMA_OACC_CLAUSE_DELETE;
 	  else if (!strcmp ("depend", p))
 	    result = PRAGMA_OMP_CLAUSE_DEPEND;
+	  else if (!strcmp ("detach", p))
+	    result = PRAGMA_OACC_CLAUSE_DETACH;
 	  else if (!strcmp ("device", p))
 	    result = PRAGMA_OMP_CLAUSE_DEVICE;
 	  else if (!strcmp ("deviceptr", p))
@@ -11768,7 +11772,9 @@
 	    result = PRAGMA_OMP_CLAUSE_MERGEABLE;
 	  break;
 	case 'n':
-	  if (!strcmp ("nogroup", p))
+	  if (!strcmp ("no_create", p))
+	    result = PRAGMA_OACC_CLAUSE_NO_CREATE;
+	  else if (!strcmp ("nogroup", p))
 	    result = PRAGMA_OMP_CLAUSE_NOGROUP;
 	  else if (!strcmp ("nontemporal", p))
 	    result = PRAGMA_OMP_CLAUSE_NONTEMPORAL;
@@ -11776,6 +11782,8 @@
 	    result = PRAGMA_OMP_CLAUSE_NOTINBRANCH;
 	  else if (!strcmp ("nowait", p))
 	    result = PRAGMA_OMP_CLAUSE_NOWAIT;
+	  else if (!strcmp ("nohost", p))
+	    result = PRAGMA_OACC_CLAUSE_NOHOST;
 	  else if (!strcmp ("num_gangs", p))
 	    result = PRAGMA_OACC_CLAUSE_NUM_GANGS;
 	  else if (!strcmp ("num_tasks", p))
@@ -11954,12 +11962,17 @@
    If KIND is nonzero, CLAUSE_LOC is the location of the clause.
 
    If KIND is zero, create a TREE_LIST with the decl in TREE_PURPOSE;
-   return the list created.  */
+   return the list created.
+
+   The optional ALLOW_DEREF argument is true if list items can use the deref
+   (->) operator.  */
 
 static tree
 c_parser_omp_variable_list (c_parser *parser,
 			    location_t clause_loc,
-			    enum omp_clause_code kind, tree list)
+			    enum omp_clause_code kind, tree list,
+			    enum c_omp_region_type ort = C_ORT_OMP,
+			    bool allow_deref = false)
 {
   auto_vec<c_token> tokens;
   unsigned int tokens_avail = 0;
@@ -12086,9 +12099,13 @@
 	    case OMP_CLAUSE_MAP:
 	    case OMP_CLAUSE_FROM:
 	    case OMP_CLAUSE_TO:
-	      while (c_parser_next_token_is (parser, CPP_DOT))
+	      while (c_parser_next_token_is (parser, CPP_DOT)
+		     || (allow_deref
+			 && c_parser_next_token_is (parser, CPP_DEREF)))
 		{
 		  location_t op_loc = c_parser_peek_token (parser)->location;
+		  if (c_parser_next_token_is (parser, CPP_DEREF))
+		    t = build_simple_mem_ref (t);
 		  c_parser_consume_token (parser);
 		  if (!c_parser_next_token_is (parser, CPP_NAME))
 		    {
@@ -12108,7 +12125,8 @@
 	    case OMP_CLAUSE_REDUCTION:
 	    case OMP_CLAUSE_IN_REDUCTION:
 	    case OMP_CLAUSE_TASK_REDUCTION:
-	      while (c_parser_next_token_is (parser, CPP_OPEN_SQUARE))
+	      while ((ort != C_ORT_ACC || kind != OMP_CLAUSE_REDUCTION)
+		     && c_parser_next_token_is (parser, CPP_OPEN_SQUARE))
 		{
 		  tree low_bound = NULL_TREE, length = NULL_TREE;
 
@@ -12210,11 +12228,14 @@
 }
 
 /* Similarly, but expect leading and trailing parenthesis.  This is a very
-   common case for OpenACC and OpenMP clauses.  */
+   common case for OpenACC and OpenMP clauses.  The optional ALLOW_DEREF
+   argument is true if list items can use the deref (->) operator.  */
 
 static tree
 c_parser_omp_var_list_parens (c_parser *parser, enum omp_clause_code kind,
-			      tree list)
+			      tree list,
+			      enum c_omp_region_type ort = C_ORT_OMP,
+			      bool allow_deref = false)
 {
   /* The clauses location.  */
   location_t loc = c_parser_peek_token (parser)->location;
@@ -12222,19 +12243,25 @@
   matching_parens parens;
   if (parens.require_open (parser))
     {
-      list = c_parser_omp_variable_list (parser, loc, kind, list);
+      list = c_parser_omp_variable_list (parser, loc, kind, list, ort,
+					 allow_deref);
       parens.skip_until_found_close (parser);
     }
   return list;
 }
 
-/* OpenACC 2.0:
+/* OpenACC 2.5:
+   attach ( variable-list )
    copy ( variable-list )
    copyin ( variable-list )
    copyout ( variable-list )
    create ( variable-list )
    delete ( variable-list )
-   present ( variable-list ) */
+   detach ( variable-list )
+   present ( variable-list )
+
+   OpenACC 2.6:
+   no_create ( variable-list ) */
 
 static tree
 c_parser_oacc_data_clause (c_parser *parser, pragma_omp_clause c_kind,
@@ -12243,6 +12270,9 @@
   enum gomp_map_kind kind;
   switch (c_kind)
     {
+    case PRAGMA_OACC_CLAUSE_ATTACH:
+      kind = GOMP_MAP_ATTACH;
+      break;
     case PRAGMA_OACC_CLAUSE_COPY:
       kind = GOMP_MAP_TOFROM;
       break;
@@ -12258,6 +12288,9 @@
     case PRAGMA_OACC_CLAUSE_DELETE:
       kind = GOMP_MAP_RELEASE;
       break;
+    case PRAGMA_OACC_CLAUSE_DETACH:
+      kind = GOMP_MAP_DETACH;
+      break;
     case PRAGMA_OACC_CLAUSE_DEVICE:
       kind = GOMP_MAP_FORCE_TO;
       break;
@@ -12270,6 +12303,9 @@
     case PRAGMA_OACC_CLAUSE_LINK:
       kind = GOMP_MAP_LINK;
       break;
+    case PRAGMA_OACC_CLAUSE_NO_CREATE:
+      kind = GOMP_MAP_NO_ALLOC;
+      break;
     case PRAGMA_OACC_CLAUSE_PRESENT:
       kind = GOMP_MAP_FORCE_PRESENT;
       break;
@@ -12277,7 +12313,8 @@
       gcc_unreachable ();
     }
   tree nl, c;
-  nl = c_parser_omp_var_list_parens (parser, OMP_CLAUSE_MAP, list);
+  nl = c_parser_omp_var_list_parens (parser, OMP_CLAUSE_MAP, list, C_ORT_ACC,
+				     true);
 
   for (c = nl; c != list; c = OMP_CLAUSE_CHAIN (c))
     OMP_CLAUSE_SET_MAP_KIND (c, kind);
@@ -12297,7 +12334,8 @@
   /* Can't use OMP_CLAUSE_MAP here (that is, can't use the generic
      c_parser_oacc_data_clause), as for PRAGMA_OACC_CLAUSE_DEVICEPTR,
      variable-list must only allow for pointer variables.  */
-  vars = c_parser_omp_var_list_parens (parser, OMP_CLAUSE_ERROR, NULL);
+  vars = c_parser_omp_var_list_parens (parser, OMP_CLAUSE_ERROR, NULL,
+				       C_ORT_ACC);
   for (t = vars; t && t; t = TREE_CHAIN (t))
     {
       tree v = TREE_PURPOSE (t);
@@ -13559,7 +13597,7 @@
 
 static tree
 c_parser_omp_clause_reduction (c_parser *parser, enum omp_clause_code kind,
-			       bool is_omp, tree list)
+			       enum c_omp_region_type ort, tree list)
 {
   location_t clause_loc = c_parser_peek_token (parser)->location;
   matching_parens parens;
@@ -13570,7 +13608,7 @@
       enum tree_code code = ERROR_MARK;
       tree reduc_id = NULL_TREE;
 
-      if (kind == OMP_CLAUSE_REDUCTION && is_omp)
+      if (kind == OMP_CLAUSE_REDUCTION && ort == C_ORT_OMP)
 	{
 	  if (c_parser_next_token_is_keyword (parser, RID_DEFAULT)
 	      && c_parser_peek_2nd_token (parser)->type == CPP_COMMA)
@@ -13655,7 +13693,8 @@
 	{
 	  tree nl, c;
 
-	  nl = c_parser_omp_variable_list (parser, clause_loc, kind, list);
+	  nl = c_parser_omp_variable_list (parser, clause_loc, kind, list, ort);
+
 	  for (c = nl; c != list; c = OMP_CLAUSE_CHAIN (c))
 	    {
 	      tree d = OMP_CLAUSE_DECL (c), type;
@@ -14864,6 +14903,10 @@
 						 clauses);
 	  c_name = "auto";
 	  break;
+	case PRAGMA_OACC_CLAUSE_ATTACH:
+	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "attach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_COLLAPSE:
 	  clauses = c_parser_omp_clause_collapse (parser, clauses);
 	  c_name = "collapse";
@@ -14892,6 +14935,10 @@
 	  clauses = c_parser_omp_clause_default (parser, clauses, true);
 	  c_name = "default";
 	  break;
+	case PRAGMA_OACC_CLAUSE_DETACH:
+	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "detach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_DEVICE:
 	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "device";
@@ -14940,6 +14987,15 @@
 	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "link";
 	  break;
+	case PRAGMA_OACC_CLAUSE_NO_CREATE:
+	  clauses = c_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "no_create";
+	  break;
+	case PRAGMA_OACC_CLAUSE_NOHOST:
+	  clauses = c_parser_oacc_simple_clause (here, OMP_CLAUSE_NOHOST,
+						 clauses);
+	  c_name = "nohost";
+	  break;
 	case PRAGMA_OACC_CLAUSE_NUM_GANGS:
 	  clauses = c_parser_oacc_single_int_clause (parser,
 						     OMP_CLAUSE_NUM_GANGS,
@@ -14963,7 +15019,7 @@
 	case PRAGMA_OACC_CLAUSE_REDUCTION:
 	  clauses
 	    = c_parser_omp_clause_reduction (parser, OMP_CLAUSE_REDUCTION,
-					     false, clauses);
+					     C_ORT_ACC, clauses);
 	  c_name = "reduction";
 	  break;
 	case PRAGMA_OACC_CLAUSE_SEQ:
@@ -15092,7 +15148,7 @@
 	case PRAGMA_OMP_CLAUSE_IN_REDUCTION:
 	  clauses
 	    = c_parser_omp_clause_reduction (parser, OMP_CLAUSE_IN_REDUCTION,
-					     true, clauses);
+					     C_ORT_OMP, clauses);
 	  c_name = "in_reduction";
 	  break;
 	case PRAGMA_OMP_CLAUSE_LASTPRIVATE:
@@ -15130,7 +15186,7 @@
 	case PRAGMA_OMP_CLAUSE_REDUCTION:
 	  clauses
 	    = c_parser_omp_clause_reduction (parser, OMP_CLAUSE_REDUCTION,
-					     true, clauses);
+					     C_ORT_OMP, clauses);
 	  c_name = "reduction";
 	  break;
 	case PRAGMA_OMP_CLAUSE_SCHEDULE:
@@ -15144,7 +15200,7 @@
 	case PRAGMA_OMP_CLAUSE_TASK_REDUCTION:
 	  clauses
 	    = c_parser_omp_clause_reduction (parser, OMP_CLAUSE_TASK_REDUCTION,
-					     true, clauses);
+					     C_ORT_OMP, clauses);
 	  c_name = "task_reduction";
 	  break;
 	case PRAGMA_OMP_CLAUSE_UNTIED:
@@ -15348,7 +15404,8 @@
 {
   tree stmt, clauses;
 
-  clauses = c_parser_omp_var_list_parens (parser, OMP_CLAUSE__CACHE_, NULL);
+  clauses = c_parser_omp_var_list_parens (parser, OMP_CLAUSE__CACHE_, NULL,
+					  C_ORT_ACC);
   clauses = c_finish_omp_clauses (clauses, C_ORT_ACC);
 
   c_parser_skip_to_pragma_eol (parser);
@@ -15370,12 +15427,14 @@
 */
 
 #define OACC_DATA_CLAUSE_MASK						\
-	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT))
 
 static tree
@@ -15553,6 +15612,7 @@
 #define OACC_ENTER_DATA_CLAUSE_MASK					\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
@@ -15562,6 +15622,7 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DELETE) 		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DETACH) 		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FINALIZE) 		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
@@ -15617,7 +15678,9 @@
 */
 
 #define OACC_HOST_DATA_CLAUSE_MASK					\
-	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE) )
+        ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE)          \
+        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)                  \
+        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF_PRESENT) )
 
 static tree
 c_parser_oacc_host_data (location_t loc, c_parser *parser, bool *if_p)
@@ -15691,11 +15754,17 @@
    # pragma acc parallel oacc-parallel-clause[optseq] new-line
      structured-block
 
+   OpenACC 2.6:
+
+   # pragma acc serial oacc-serial-clause[optseq] new-line
+     structured-block
+
    LOC is the location of the #pragma token.
 */
 
 #define OACC_KERNELS_CLAUSE_MASK					\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
@@ -15703,6 +15772,7 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEFAULT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_WORKERS)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT)		\
@@ -15711,6 +15781,7 @@
 
 #define OACC_PARALLEL_CLAUSE_MASK					\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
@@ -15718,6 +15789,7 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEFAULT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRIVATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FIRSTPRIVATE)	\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)		\
@@ -15727,10 +15799,26 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH)	\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
+#define OACC_SERIAL_CLAUSE_MASK					\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEFAULT)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRIVATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FIRSTPRIVATE)	\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_REDUCTION)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
+
 static tree
-c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser,
-				enum pragma_kind p_kind, char *p_name,
-				bool *if_p)
+c_parser_oacc_compute (location_t loc, c_parser *parser,
+		       enum pragma_kind p_kind, char *p_name, bool *if_p)
 {
   omp_clause_mask mask;
   enum tree_code code;
@@ -15746,6 +15834,11 @@
       mask = OACC_PARALLEL_CLAUSE_MASK;
       code = OACC_PARALLEL;
       break;
+    case PRAGMA_OACC_SERIAL:
+      strcat (p_name, " serial");
+      mask = OACC_SERIAL_CLAUSE_MASK;
+      code = OACC_SERIAL;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -15782,7 +15875,8 @@
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_GANG)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WORKER)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR)		\
-	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_SEQ) )
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_SEQ)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NOHOST) )
 
 /* Parse an OpenACC routine directive.  For named directives, we apply
    immediately to the named function.  For unnamed ones we then parse
@@ -15832,6 +15926,9 @@
       data.clauses
 	= c_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
 				     "#pragma acc routine");
+      /* The clauses are in reverse order; fix that to make later diagnostic
+	 emission easier.  */
+      data.clauses = nreverse (data.clauses);
 
       if (TREE_CODE (decl) != FUNCTION_DECL)
 	{
@@ -15846,6 +15943,9 @@
       data.clauses
 	= c_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
 				     "#pragma acc routine");
+      /* The clauses are in reverse order; fix that to make later diagnostic
+	 emission easier.  */
+      data.clauses = nreverse (data.clauses);
 
       /* Emit a helpful diagnostic if there's another pragma following this
 	 one.  Also don't allow a static assertion declaration, as in the
@@ -15909,6 +16009,8 @@
       return;
     }
 
+  oacc_verify_routine_clauses (&data->clauses, data->loc);
+
   if (oacc_get_fn_attrib (fndecl))
     {
       error_at (data->loc,
@@ -15935,7 +16037,7 @@
   /* Add an "omp declare target" attribute.  */
   DECL_ATTRIBUTES (fndecl)
     = tree_cons (get_identifier ("omp declare target"),
-		 NULL_TREE, DECL_ATTRIBUTES (fndecl));
+		 data->clauses, DECL_ATTRIBUTES (fndecl));
 
   /* Remember that we've used this "#pragma acc routine".  */
   data->fndecl_seen = true;
@@ -19537,9 +19639,9 @@
       break;
     case PRAGMA_OACC_KERNELS:
     case PRAGMA_OACC_PARALLEL:
+    case PRAGMA_OACC_SERIAL:
       strcpy (p_name, "#pragma acc");
-      stmt = c_parser_oacc_kernels_parallel (loc, parser, p_kind, p_name,
-					     if_p);
+      stmt = c_parser_oacc_compute (loc, parser, p_kind, p_name, if_p);
       break;
     case PRAGMA_OACC_LOOP:
       strcpy (p_name, "#pragma acc");
diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index c0582a5..83fda52 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -12859,7 +12859,7 @@
 static tree
 handle_omp_array_sections_1 (tree c, tree t, vec<tree> &types,
 			     bool &maybe_zero_len, unsigned int &first_non_one,
-			     enum c_omp_region_type ort)
+			     bool &non_contiguous, enum c_omp_region_type ort)
 {
   tree ret, low_bound, length, type;
   if (TREE_CODE (t) != TREE_LIST)
@@ -12875,7 +12875,6 @@
 	  return error_mark_node;
 	}
       if (TREE_CODE (t) == COMPONENT_REF
-	  && ort == C_ORT_OMP
 	  && (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
 	      || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_TO
 	      || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FROM))
@@ -12896,6 +12895,15 @@
 		  return error_mark_node;
 		}
 	      t = TREE_OPERAND (t, 0);
+	      if (ort == C_ORT_ACC && TREE_CODE (t) == MEM_REF)
+		{
+		  if (maybe_ne (mem_ref_offset (t), 0))
+	            error_at (OMP_CLAUSE_LOCATION (c),
+			      "cannot dereference %qE in %qs clause", t,
+			      omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+		  else
+		    t = TREE_OPERAND (t, 0);
+		}
 	    }
 	}
       if (!VAR_P (t) && TREE_CODE (t) != PARM_DECL)
@@ -12944,7 +12952,8 @@
     }
 
   ret = handle_omp_array_sections_1 (c, TREE_CHAIN (t), types,
-				     maybe_zero_len, first_non_one, ort);
+				     maybe_zero_len, first_non_one,
+				     non_contiguous, ort);
   if (ret == error_mark_node || ret == NULL_TREE)
     return ret;
 
@@ -12981,7 +12990,18 @@
     length = fold_convert (sizetype, length);
   if (low_bound == NULL_TREE)
     low_bound = integer_zero_node;
-
+  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+      && (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH
+	  || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_DETACH))
+    {
+      if (length != integer_one_node)
+	{
+	  error_at (OMP_CLAUSE_LOCATION (c),
+		    "expected single pointer in %qs clause",
+		    c_omp_map_clause_name (c, ort == C_ORT_ACC));
+	  return error_mark_node;
+	}
+    }
   if (length != NULL_TREE)
     {
       if (!integer_nonzerop (length))
@@ -13110,6 +13130,21 @@
 		    }
 		}
 	    }
+
+	  /* For OpenACC, if the low_bound/length suggest this is a subarray,
+	     and is referenced through by a pointer, then mark this as
+	     non-contiguous.  */
+	  if (ort == C_ORT_ACC
+	      && types.length () > 0
+	      && (TREE_CODE (low_bound) != INTEGER_CST
+		  || integer_nonzerop (low_bound)
+		  || (length && (TREE_CODE (length) != INTEGER_CST
+				 || !tree_int_cst_equal (size, length)))))
+	    {
+	      tree x = types.last ();
+	      if (TREE_CODE (x) == POINTER_TYPE)
+		non_contiguous = true;
+	    }
 	}
       else if (length == NULL_TREE)
 	{
@@ -13153,13 +13188,16 @@
       /* If there is a pointer type anywhere but in the very first
 	 array-section-subscript, the array section can't be contiguous.  */
       if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_DEPEND
-	  && TREE_CODE (TREE_CHAIN (t)) == TREE_LIST)
+	  && TREE_CODE (TREE_CHAIN (t)) == TREE_LIST
+	  && ort != C_ORT_ACC)
 	{
 	  error_at (OMP_CLAUSE_LOCATION (c),
 		    "array section is not contiguous in %qs clause",
 		    omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
 	  return error_mark_node;
 	}
+      else if (TREE_CODE (TREE_CHAIN (t)) == TREE_LIST)
+	non_contiguous = true;
     }
   else
     {
@@ -13187,6 +13225,7 @@
 {
   bool maybe_zero_len = false;
   unsigned int first_non_one = 0;
+  bool non_contiguous = false;
   auto_vec<tree, 10> types;
   tree *tp = &OMP_CLAUSE_DECL (c);
   if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_DEPEND
@@ -13196,7 +13235,7 @@
     tp = &TREE_VALUE (*tp);
   tree first = handle_omp_array_sections_1 (c, *tp, types,
 					    maybe_zero_len, first_non_one,
-					    ort);
+					    non_contiguous, ort);
   if (first == error_mark_node)
     return true;
   if (first == NULL_TREE)
@@ -13229,6 +13268,7 @@
       unsigned int num = types.length (), i;
       tree t, side_effects = NULL_TREE, size = NULL_TREE;
       tree condition = NULL_TREE;
+      tree da_dims = NULL_TREE;
 
       if (int_size_in_bytes (TREE_TYPE (first)) <= 0)
 	maybe_zero_len = true;
@@ -13252,6 +13292,13 @@
 	    length = fold_convert (sizetype, length);
 	  if (low_bound == NULL_TREE)
 	    low_bound = integer_zero_node;
+
+	  if (non_contiguous)
+	    {
+	      da_dims = tree_cons (low_bound, length, da_dims);
+	      continue;
+	    }
+
 	  if (!maybe_zero_len && i > first_non_one)
 	    {
 	      if (integer_nonzerop (low_bound))
@@ -13348,6 +13395,14 @@
 		size = size_binop (MULT_EXPR, size, l);
 	    }
 	}
+      if (non_contiguous)
+	{
+	  int kind = OMP_CLAUSE_MAP_KIND (c);
+	  OMP_CLAUSE_SET_MAP_KIND (c, kind | GOMP_MAP_DYNAMIC_ARRAY);
+	  OMP_CLAUSE_DECL (c) = t;
+	  OMP_CLAUSE_SIZE (c) = da_dims;
+	  return false;
+	}
       if (side_effects)
 	size = build2 (COMPOUND_EXPR, sizetype, side_effects, size);
       if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
@@ -13400,6 +13455,7 @@
 	switch (OMP_CLAUSE_MAP_KIND (c))
 	  {
 	  case GOMP_MAP_ALLOC:
+	  case GOMP_MAP_NO_ALLOC:
 	  case GOMP_MAP_TO:
 	  case GOMP_MAP_FROM:
 	  case GOMP_MAP_TOFROM:
@@ -13421,7 +13477,11 @@
       if (ort != C_ORT_OMP && ort != C_ORT_ACC)
 	OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_POINTER);
       else if (TREE_CODE (t) == COMPONENT_REF)
-	OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_ALWAYS_POINTER);
+	{
+	  gomp_map_kind k = (ort == C_ORT_ACC) ? GOMP_MAP_ATTACH_DETACH
+					       : GOMP_MAP_ALWAYS_POINTER;
+	  OMP_CLAUSE_SET_MAP_KIND (c2, k);
+	}
       else
 	OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_FIRSTPRIVATE_POINTER);
       if (OMP_CLAUSE_MAP_KIND (c2) != GOMP_MAP_FIRSTPRIVATE_POINTER
@@ -13658,6 +13718,35 @@
   return ret;
 }
 
+/* Ensure that pointers are used in OpenACC attach and detach clauses.
+   Return true if an error has been detected.  */
+
+static bool
+c_oacc_check_attachments (tree c)
+{
+  if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP)
+    return false;
+
+  /* OpenACC attach / detach clauses must be pointers.  */
+  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH
+      || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_DETACH)
+    {
+      tree t = OMP_CLAUSE_DECL (c);
+
+      while (TREE_CODE (t) == TREE_LIST)
+	t = TREE_CHAIN (t);
+
+      if (TREE_CODE (TREE_TYPE (t)) != POINTER_TYPE)
+	{
+	  error_at (OMP_CLAUSE_LOCATION (c), "expected pointer in %qs clause",
+		    c_omp_map_clause_name (c, true));
+	  return true;
+	}
+    }
+
+  return false;
+}
+
 /* For all elements of CLAUSES, validate them against their constraints.
    Remove any elements from the list that are invalid.  */
 
@@ -13679,6 +13768,7 @@
   bool last_iterators_remove = false;
   tree *nogroup_seen = NULL;
   bool reduction_seen = false;
+  bool oacc_gang_seen = false;
 
   bitmap_obstack_initialize (NULL);
   bitmap_initialize (&generic_head, &bitmap_default_obstack);
@@ -13693,10 +13783,15 @@
 
   if (ort & C_ORT_ACC)
     for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
-	{
+      switch (OMP_CLAUSE_CODE (c))
+        {
+	case OMP_CLAUSE_ASYNC:
 	  oacc_async = true;
 	  break;
+	case OMP_CLAUSE_GANG:
+	  oacc_gang_seen = true;
+	  break;
+	default:;
 	}
 
   for (pc = &clauses, c = clauses; c ; c = *pc)
@@ -13717,6 +13812,13 @@
 	  goto check_dup_generic;
 
 	case OMP_CLAUSE_REDUCTION:
+	  if (oacc_gang_seen && oacc_get_fn_attrib (current_function_decl))
+	    {
+	      error_at (OMP_CLAUSE_LOCATION (c),
+			"gang reduction on an orphan loop");
+	      remove = true;
+	      break;
+	    }
 	  reduction_seen = true;
 	  /* FALLTHRU */
 	case OMP_CLAUSE_IN_REDUCTION:
@@ -13859,8 +13961,11 @@
 	    }
 	  else if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c) == error_mark_node)
 	    {
-	      error_at (OMP_CLAUSE_LOCATION (c),
-			"user defined reduction not found for %qE", t);
+	      /* There are no user-defined reductions in OpenACC (as of
+		 2.6).  */
+	      if (ort & C_ORT_OMP)
+		error_at (OMP_CLAUSE_LOCATION (c),
+			  "user defined reduction not found for %qE", t);
 	      remove = true;
 	      break;
 	    }
@@ -14382,6 +14487,8 @@
 			}
 		    }
 		}
+	      if (c_oacc_check_attachments (c))
+		remove = true;
 	      break;
 	    }
 	  if (t == error_mark_node)
@@ -14389,8 +14496,13 @@
 	      remove = true;
 	      break;
 	    }
+	  /* OpenACC attach / detach clauses must be pointers.  */
+	  if (c_oacc_check_attachments (c))
+	    {
+	      remove = true;
+	      break;
+	    }
 	  if (TREE_CODE (t) == COMPONENT_REF
-	      && (ort & C_ORT_OMP)
 	      && OMP_CLAUSE_CODE (c) != OMP_CLAUSE__CACHE_)
 	    {
 	      if (DECL_BIT_FIELD (TREE_OPERAND (t, 1)))
@@ -14425,6 +14537,15 @@
 		      break;
 		    }
 		  t = TREE_OPERAND (t, 0);
+		  if (ort == C_ORT_ACC && TREE_CODE (t) == MEM_REF)
+	            {
+		      if (maybe_ne (mem_ref_offset (t), 0))
+			error_at (OMP_CLAUSE_LOCATION (c),
+				  "cannot dereference %qE in %qs clause", t,
+				  omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+		      else
+			t = TREE_OPERAND (t, 0);
+		    }
 		}
 	      if (remove)
 		break;
@@ -14655,6 +14776,7 @@
 	case OMP_CLAUSE_GANG:
 	case OMP_CLAUSE_WORKER:
 	case OMP_CLAUSE_VECTOR:
+	case OMP_CLAUSE_NOHOST:
 	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_IF_PRESENT:
 	case OMP_CLAUSE_FINALIZE:
diff --git a/gcc/config.gcc b/gcc/config.gcc
index b2282ec..9ef0552 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -1429,6 +1429,9 @@
 		extra_programs="${extra_programs} mkoffload\$(exeext)"
 		tm_file="${tm_file} gcn/offload.h"
 	fi
+	# Force .init_array support.
+	gcc_cv_initfini_array=yes
+	thread_file=gcn
 	;;
 moxie-*-elf)
 	gas=yes
@@ -4137,7 +4140,7 @@
 		for which in arch tune; do
 			eval "val=\$with_$which"
 			case ${val} in
-			"" | carrizo | fiji | gfx900 )
+			"" | carrizo | fiji | gfx900 | gfx906 )
 				# OK
 				;;
 			*)
diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index da7faf2..e33c059 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -37,9 +37,9 @@
 extern rtx gcn_full_exec_reg ();
 extern rtx gcn_gen_undef (machine_mode);
 extern bool gcn_global_address_p (rtx);
-extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+extern tree gcn_goacc_create_propagation_record (tree record_type, bool sender,
 						 const char *name);
-extern void gcn_goacc_adjust_gangprivate_decl (tree var);
+extern void gcn_goacc_adjust_private_decl (tree var, int level);
 extern void gcn_goacc_reduction (gcall *call);
 extern bool gcn_hard_regno_rename_ok (unsigned int from_reg,
 				      unsigned int to_reg);
diff --git a/gcc/config/gcn/gcn-run.c b/gcc/config/gcn/gcn-run.c
index d3def13..cf4870f 100644
--- a/gcc/config/gcn/gcn-run.c
+++ b/gcc/config/gcn/gcn-run.c
@@ -66,10 +66,13 @@
 
 hsa_agent_t device = { 0 };
 hsa_queue_t *queue = NULL;
-uint64_t kernel = 0;
+uint64_t init_array_kernel = 0;
+uint64_t fini_array_kernel = 0;
+uint64_t main_kernel = 0;
 hsa_executable_t executable = { 0 };
 
 hsa_region_t kernargs_region = { 0 };
+hsa_region_t heap_region = { 0 };
 uint32_t kernarg_segment_size = 0;
 uint32_t group_segment_size = 0;
 uint32_t private_segment_size = 0;
@@ -133,6 +136,8 @@
 					hsa_signal_t *signal);
   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
 					  void **ptr);
+  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
+					      hsa_access_permission_t access);
   hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
 				      size_t size);
   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
@@ -202,6 +207,7 @@
   DLSYM_FN (hsa_executable_freeze)
   DLSYM_FN (hsa_signal_create)
   DLSYM_FN (hsa_memory_allocate)
+  DLSYM_FN (hsa_memory_assign_agent)
   DLSYM_FN (hsa_memory_copy)
   DLSYM_FN (hsa_memory_free)
   DLSYM_FN (hsa_signal_destroy)
@@ -280,7 +286,8 @@
    suitable one has been found.  */
 
 static hsa_status_t
-get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused)))
+get_memory_region (hsa_region_t region, hsa_region_t *retval,
+		   hsa_region_global_flag_t kind)
 {
   /* Reject non-global regions.  */
   hsa_region_segment_t segment;
@@ -292,9 +299,9 @@
   hsa_region_global_flag_t flags;
   hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
 				  &flags);
-  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
+  if (flags & kind)
     {
-      kernargs_region = region;
+      *retval = region;
       return HSA_STATUS_INFO_BREAK;
     }
 
@@ -302,6 +309,20 @@
   return HSA_STATUS_SUCCESS;
 }
 
+static hsa_status_t
+get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
+{
+  return get_memory_region (region, &kernargs_region,
+			    HSA_REGION_GLOBAL_FLAG_KERNARG);
+}
+
+static hsa_status_t
+get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
+{
+  return get_memory_region (region, &heap_region,
+			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
+}
+
 /* Initialize the HSA Runtime library and GPU device.  */
 
 static void
@@ -336,6 +357,13 @@
 						  NULL),
 	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
 	    "Locate kernargs memory");
+
+  /* Select a memory region for the kernel heap.
+     The call-back function, get_heap_region, does the selection.  */
+  XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
+						  NULL),
+	    status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
+	    "Locate device memory");
 }
 
 
@@ -427,14 +455,30 @@
   XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
 	"Freeze GCN executable");
 
-  /* Locate the "main" function, and read the kernel's properties.  */
+  /* Locate the "_init_array" function, and read the kernel's properties.  */
   hsa_executable_symbol_t symbol;
+  XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
+					      device, 0, &symbol),
+	"Find '_init_array' function");
+  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
+	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
+	"Extract '_init_array' kernel object kernel object");
+
+  /* Locate the "_fini_array" function, and read the kernel's properties.  */
+  XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
+					      device, 0, &symbol),
+	"Find '_fini_array' function");
+  XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
+	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
+	"Extract '_fini_array' kernel object kernel object");
+
+  /* Locate the "main" function, and read the kernel's properties.  */
   XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
 					      device, 0, &symbol),
 	"Find 'main' function");
   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
-	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel),
-	"Extract kernel object");
+	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
+	"Extract 'main' kernel object");
   XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
 	    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
 	     &kernarg_segment_size),
@@ -575,10 +619,10 @@
    __flat_scalar GCN address space).  */
 
 static void *
-device_malloc (size_t size)
+device_malloc (size_t size, hsa_region_t region)
 {
   void *result;
-  XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result),
+  XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
 	"Allocate device memory");
   return result;
 }
@@ -616,14 +660,14 @@
     } queue[1024];
     unsigned int consumed;
   } output_data;
-
-  struct heap
-  {
-    int64_t size;
-    char data[0];
-  } heap;
 };
 
+struct heap
+{
+  int64_t size;
+  char data[0];
+} heap;
+
 /* Print any console output from the kernel.
    We print all entries from "consumed" to the next entry without a "written"
    flag, or "next_output" is reached.  The buffer is circular, but the
@@ -684,7 +728,7 @@
 /* Execute an already-loaded kernel on the device.  */
 
 static void
-run (void *kernargs)
+run (uint64_t kernel, void *kernargs)
 {
   /* A "signal" is used to launch and monitor the kernel.  */
   hsa_signal_t signal;
@@ -793,13 +837,19 @@
 
   /* Allocate device memory for both function parameters and the argv
      data.  */
-  size_t heap_size = 10 * 1024 * 1024;	/* 10MB.  */
-  struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size);
+  struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
+					     kernargs_region);
   struct argdata
   {
     int64_t argv_data[kernel_argc];
     char strings[args_size];
-  } *args = device_malloc (sizeof (struct argdata));
+  } *args = device_malloc (sizeof (struct argdata), kernargs_region);
+
+  size_t heap_size = 10 * 1024 * 1024;	/* 10MB.  */
+  struct heap *heap = device_malloc (heap_size, heap_region);
+  XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
+					    HSA_ACCESS_PERMISSION_RW),
+	"Assign heap to device agent");
 
   /* Write the data to the target.  */
   kernargs->argc = kernel_argc;
@@ -819,14 +869,20 @@
       memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
       offset += arg_len;
     }
-  kernargs->heap_ptr = (int64_t) &kernargs->heap;
-  kernargs->heap.size = heap_size;
+  kernargs->heap_ptr = (int64_t) heap;
+  hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
+
+  /* Run constructors on the GPU.  */
+  run (init_array_kernel, kernargs);
 
   /* Run the kernel on the GPU.  */
-  run (kernargs);
+  run (main_kernel, kernargs);
   unsigned int return_value =
     (unsigned int) kernargs->output_data.return_value;
 
+  /* Run destructors on the GPU.  */
+  run (fini_array_kernel, kernargs);
+
   unsigned int upper = (return_value & ~0xffff) >> 16;
   if (upper == 0xcafe)
     {
diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c
index c6b6302..db8e290 100644
--- a/gcc/config/gcn/gcn-tree.c
+++ b/gcc/config/gcn/gcn-tree.c
@@ -44,6 +44,7 @@
 #include "cgraph.h"
 #include "targhooks.h"
 #include "langhooks-def.h"
+#include "diagnostic-core.h"
 
 /* }}}  */
 /* {{{ OMP GCN pass.
@@ -667,12 +668,12 @@
     }
 }
 
-/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD.
+/* Implement TARGET_GOACC_CREATE_PROPAGATION_RECORD.
  
-   Tweak (worker) propagation record, e.g. to put it in shared memory.  */
+   Create (worker) propagation record in shared memory.  */
 
 tree
-gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+gcn_goacc_create_propagation_record (tree record_type, bool sender,
 				     const char *name)
 {
   tree type = record_type;
@@ -697,8 +698,11 @@
 }
 
 void
-gcn_goacc_adjust_gangprivate_decl (tree var)
+gcn_goacc_adjust_private_decl (tree var, int level)
 {
+  if (level != GOMP_DIM_GANG)
+    return;
+
   tree type = TREE_TYPE (var);
   tree lds_type = build_qualified_type (type,
 		    TYPE_QUALS_NO_ADDR_SPACE (type)
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 3cc59dd..3a2287d 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -863,15 +863,12 @@
     if (AS_FLAT_P (as))
       {
 	if (TARGET_GCN5_PLUS)
-	  sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;"
-		   "s_waitcnt\texpcnt(0)", glc);
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s", glc);
 	else
-	  sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\texpcnt(0)",
-		   glc);
+	  sprintf (buf, "flat_store%%s2\t%%0, %%2%s", glc);
       }
     else if (AS_GLOBAL_P (as))
-      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;"
-	       "s_waitcnt\texpcnt(0)", glc);
+      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s", glc);
     else
       gcc_unreachable ();
 
@@ -895,7 +892,7 @@
   {
     addr_space_t as = INTVAL (operands[3]);
     static char buf[200];
-    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\texpcnt(0)",
+    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s",
 	     (AS_GDS_P (as) ? " gds" : ""));
     return buf;
   }
@@ -929,8 +926,8 @@
 	/* Work around assembler bug in which a 64-bit register is expected,
 	but a 32-bit value would be correct.  */
 	int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
-	sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;"
-		      "s_waitcnt\texpcnt(0)", reg, reg + 1, glc);
+	sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s",
+		 reg, reg + 1, glc);
       }
     else
       gcc_unreachable ();
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 99fa45e..2fcc9c5 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -75,6 +75,12 @@
 
 #define LDS_SIZE 65536
 
+/* The number of registers usable by normal non-kernel functions.
+   The SGPR count includes any special extra registers such as VCC.  */
+
+#define MAX_NORMAL_SGPR_COUNT	64
+#define MAX_NORMAL_VGPR_COUNT	24
+
 /* }}}  */
 /* {{{ Initialization and options.  */
 
@@ -191,6 +197,17 @@
   {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
 };
 
+static const long default_requested_args
+	= (1 << PRIVATE_SEGMENT_BUFFER_ARG)
+	  | (1 << DISPATCH_PTR_ARG)
+	  | (1 << QUEUE_PTR_ARG)
+	  | (1 << KERNARG_SEGMENT_PTR_ARG)
+	  | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
+	  | (1 << WORKGROUP_ID_X_ARG)
+	  | (1 << WORK_ITEM_ID_X_ARG)
+	  | (1 << WORK_ITEM_ID_Y_ARG)
+	  | (1 << WORK_ITEM_ID_Z_ARG);
+
 /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
    This function also sets the default values for some arguments.
  
@@ -201,10 +218,7 @@
 				       tree list)
 {
   bool err = false;
-  args->requested = ((1 << PRIVATE_SEGMENT_BUFFER_ARG)
-		     | (1 << QUEUE_PTR_ARG)
-		     | (1 << KERNARG_SEGMENT_PTR_ARG)
-		     | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG));
+  args->requested = default_requested_args;
   args->nargs = 0;
 
   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
@@ -242,8 +256,6 @@
       args->requested |= (1 << a);
       args->order[args->nargs++] = a;
     }
-  args->requested |= (1 << WORKGROUP_ID_X_ARG);
-  args->requested |= (1 << WORK_ITEM_ID_Z_ARG);
 
   /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
      WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
@@ -253,10 +265,6 @@
   if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
     args->requested |= (1 << WORK_ITEM_ID_X_ARG);
 
-  /* Always enable this so that kernargs is in a predictable place for
-     gomp_print, etc.  */
-  args->requested |= (1 << DISPATCH_PTR_ARG);
-
   int sgpr_regno = FIRST_SGPR_REG;
   args->nsgprs = 0;
   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
@@ -305,9 +313,7 @@
 gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
 					tree args, int, bool *no_add_attrs)
 {
-  if (FUNC_OR_METHOD_TYPE_P (*node)
-      && TREE_CODE (*node) != FIELD_DECL
-      && TREE_CODE (*node) != TYPE_DECL)
+  if (!FUNC_OR_METHOD_TYPE_P (*node))
     {
       warning (OPT_Wattributes, "%qE attribute only applies to functions",
 	       name);
@@ -464,6 +470,9 @@
     {
     case SCC_REG:
       return SCC_CONDITIONAL_REG;
+    case VCC_LO_REG:
+    case VCC_HI_REG:
+      return VCC_CONDITIONAL_REG;
     case VCCZ_REG:
       return VCCZ_CONDITIONAL_REG;
     case EXECZ_REG:
@@ -631,7 +640,8 @@
 static reg_class_t
 gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
 {
-  if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c))
+  if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
+      || c == VCC_CONDITIONAL_REG)
     return SGPR_REGS;
   else
     return NO_REGS;
@@ -1768,7 +1778,6 @@
 	  /* tmp[:] += zext (mem_base)  */
 	  if (exec)
 	    {
-	      rtx undef_di = gcn_gen_undef (DImode);
 	      emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
 						     vcc, undef_v64si, exec));
 	      emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
@@ -2043,27 +2052,36 @@
 static void
 gcn_conditional_register_usage (void)
 {
-  int i;
+  if (!cfun || !cfun->machine)
+    return;
 
-  /* FIXME: Do we need to reset fixed_regs?  */
-
-/* Limit ourselves to 1/16 the register file for maximimum sized workgroups.
-   There are enough SGPRs not to limit those.
-   TODO: Adjust this more dynamically.  */
-  for (i = FIRST_VGPR_REG + 64; i <= LAST_VGPR_REG; i++)
-    fixed_regs[i] = 1, call_used_regs[i] = 1;
-
-  if (!cfun || !cfun->machine || cfun->machine->normal_function)
+  if (cfun->machine->normal_function)
     {
-      /* Normal functions can't know what kernel argument registers are
-         live, so just fix the bottom 16 SGPRs, and bottom 3 VGPRs.  */
-      for (i = 0; i < 16; i++)
-	fixed_regs[FIRST_SGPR_REG + i] = 1;
-      for (i = 0; i < 3; i++)
-	fixed_regs[FIRST_VGPR_REG + i] = 1;
+      /* Restrict the set of SGPRs and VGPRs used by non-kernel functions.  */
+      for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2);
+	   i <= LAST_SGPR_REG; i++)
+	fixed_regs[i] = 1, call_used_regs[i] = 1;
+
+      for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
+	   i <= LAST_VGPR_REG; i++)
+	fixed_regs[i] = 1, call_used_regs[i] = 1;
+
       return;
     }
 
+  /* If the set of requested args is the default set, nothing more needs to
+     be done.  */
+  if (cfun->machine->args.requested == default_requested_args)
+    return;
+
+  /* Requesting a set of args different from the default violates the ABI.  */
+  if (!leaf_function_p ())
+    warning (0, "A non-default set of initial values has been requested, "
+		"which violates the ABI!");
+
+  for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
+    fixed_regs[i] = 0;
+
   /* Fix the runtime argument register containing values that may be
      needed later.  DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
      needed after the prologue so there's no need to fix them.  */
@@ -2071,10 +2089,10 @@
     fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
   if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
     {
+      /* The upper 32-bits of the 64-bit descriptor are not used, so allow
+	the containing registers to be used for other purposes.  */
       fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
       fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
-      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 2] = 1;
-      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 3] = 1;
     }
   if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
     {
@@ -2444,6 +2462,8 @@
   cfun->machine->args = cum->args;
   if (!caller && cfun->machine->normal_function)
     gcn_detect_incoming_pointer_arg (fndecl);
+
+  reinit_regs ();
 }
 
 static bool
@@ -2779,15 +2799,6 @@
 				     cfun->machine->args.
 				     reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
 
-      if (TARGET_GCN5_PLUS)
-	{
-	  /* v0 is reserved for constant zero so that "global"
-	     memory instructions can have a nul-offset without
-	     causing reloads.  */
-	  emit_insn (gen_vec_duplicatev64si
-		     (gen_rtx_REG (V64SImode, VGPR_REGNO (0)), const0_rtx));
-	}
-
       if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
 	{
 	  rtx fs_init_lo =
@@ -2846,8 +2857,6 @@
 		  gen_int_mode (LDS_SIZE, SImode));
 
   emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
-  if (TARGET_GCN5_PLUS)
-    emit_insn (gen_prologue_use (gen_rtx_REG (SImode, VGPR_REGNO (0))));
 
   if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
     {
@@ -3165,18 +3174,15 @@
 	  || (to == DFmode && (from == SImode || from == SFmode)));
 }
 
-/* Implement both TARGET_ASM_CONSTRUCTOR and TARGET_ASM_DESTRUCTOR.
+/* Implement TARGET_EMUTLS_VAR_INIT.
 
-   The current loader does not support running code outside "main".  This
-   hook implementation can be replaced or removed when that changes.  */
+   Disable emutls (gthr-gcn.h does not support it, yet).  */
 
-void
-gcn_disable_constructors (rtx symbol, int priority __attribute__ ((unused)))
+tree
+gcn_emutls_var_init (tree, tree decl, tree)
 {
-  tree d = SYMBOL_REF_DECL (symbol);
-  location_t l = d ? DECL_SOURCE_LOCATION (d) : UNKNOWN_LOCATION;
-
-  sorry_at (l, "GCN does not support static constructors or destructors");
+  sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
+  return NULL_TREE;
 }
 
 /* }}}  */
@@ -3478,8 +3484,6 @@
       TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
     }
 
-/* FIXME: remove the ifdef once OpenACC support is merged upstream.  */
-#ifdef BUILT_IN_GOACC_SINGLE_START
   /* These builtins need to take/return an LDS pointer: override the generic
      versions here.  */
 
@@ -3496,7 +3500,6 @@
 
   set_builtin_decl (BUILT_IN_GOACC_BARRIER,
 		    gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
-#endif
 }
 
 /* Expand the CMP_SWAP GCN builtins.  We have our own versions that do
@@ -4305,8 +4308,6 @@
 {
   basic_block bb;
   rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
-  rtx exec_lo_reg = gen_rtx_REG (SImode, EXEC_LO_REG);
-  rtx exec_hi_reg = gen_rtx_REG (SImode, EXEC_HI_REG);
   regset_head live;
 
   INIT_REG_SET (&live);
@@ -4522,7 +4523,9 @@
   {
     rtx_insn *insn;
     attr_unit unit;
+    attr_delayeduse delayeduse;
     HARD_REG_SET writes;
+    HARD_REG_SET reads;
     int age;
   } back[max_waits];
   int oldest = 0;
@@ -4541,6 +4544,7 @@
 
       attr_type itype = get_attr_type (insn);
       attr_unit iunit = get_attr_unit (insn);
+      attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
       HARD_REG_SET ireads, iwrites;
       CLEAR_HARD_REG_SET (ireads);
       CLEAR_HARD_REG_SET (iwrites);
@@ -4616,6 +4620,14 @@
 		  (regs, reg_class_contents[(int) VGPR_REGS]))
 		nops_rqd = 2 - prev_insn->age;
 	    }
+
+	  /* Store that requires input registers are not overwritten by
+	     following instruction.  */
+	  if ((prev_insn->age + nops_rqd) < 1
+	      && prev_insn->delayeduse == DELAYEDUSE_YES
+	      && ((hard_reg_set_intersect_p
+		   (prev_insn->reads, iwrites))))
+	    nops_rqd = 1 - prev_insn->age;
 	}
 
       /* Insert the required number of NOPs.  */
@@ -4643,7 +4655,9 @@
       /* Track the current instruction as a previous instruction.  */
       back[oldest].insn = insn;
       back[oldest].unit = iunit;
+      back[oldest].delayeduse = idelayeduse;
       COPY_HARD_REG_SET (back[oldest].writes, iwrites);
+      COPY_HARD_REG_SET (back[oldest].reads, ireads);
       back[oldest].age = 0;
       oldest = (oldest + 1) % max_waits;
 
@@ -4773,8 +4787,6 @@
 gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
 	       bool ARG_UNUSED (is_fork))
 {
-  /* GCN does not use the fork/join concept invented for NVPTX.
-     Instead we use standard autovectorization.  */
   return false;
 }
 
@@ -4880,12 +4892,20 @@
   if (!leaf_function_p ())
     {
       /* We can't know how many registers function calls might use.  */
-      if (vgpr < 64)
-	vgpr = 64;
-      if (sgpr + extra_regs < 102)
-	sgpr = 102 - extra_regs;
+      if (vgpr < MAX_NORMAL_VGPR_COUNT)
+	vgpr = MAX_NORMAL_VGPR_COUNT;
+      if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT)
+	sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs;
     }
 
+  /* GFX8 allocates SGPRs in blocks of 8.
+     GFX9 uses blocks of 16.  */
+  int granulated_sgprs;
+  if (TARGET_GCN3)
+    granulated_sgprs = (sgpr + extra_regs + 7) / 8 - 1;
+  else if (TARGET_GCN5)
+    granulated_sgprs = 2 * ((sgpr + extra_regs + 15) / 16 - 1);
+
   fputs ("\t.align\t256\n", file);
   fputs ("\t.type\t", file);
   assemble_name (file, name);
@@ -4924,7 +4944,7 @@
 	   "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
 	   (vgpr - 1) / 4,
 	   /* Must match wavefront_sgpr_count */
-	   (sgpr + extra_regs + 7) / 8 - 1,
+	   granulated_sgprs,
 	   /* The total number of SGPR user data registers requested.  This
 	      number must match the number of user data registers enabled.  */
 	   cfun->machine->args.nsgprs);
@@ -5299,9 +5319,9 @@
 	      /* The assembler requires a 64-bit VGPR pair here, even though
 	         the offset should be only 32-bit.  */
 	      if (vgpr_offset == NULL_RTX)
-		/* In this case, the vector offset is zero, so we use v0,
-		   which is initialized by the kernel prologue to zero.  */
-		fprintf (file, "v[0:1]");
+		/* In this case, the vector offset is zero, so we use the first
+		   lane of v1, which is initialized to zero.  */
+		fprintf (file, "v[1:2]");
 	      else if (REG_P (vgpr_offset)
 		       && VGPR_REGNO_P (REGNO (vgpr_offset)))
 		{
@@ -5995,10 +6015,6 @@
 #define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
 #undef  TARGET_ASM_ALIGNED_DI_OP
 #define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
-#undef  TARGET_ASM_CONSTRUCTOR
-#define TARGET_ASM_CONSTRUCTOR gcn_disable_constructors
-#undef  TARGET_ASM_DESTRUCTOR
-#define TARGET_ASM_DESTRUCTOR gcn_disable_constructors
 #undef  TARGET_ASM_FILE_START
 #define TARGET_ASM_FILE_START output_file_start
 #undef  TARGET_ASM_FUNCTION_PROLOGUE
@@ -6027,6 +6043,8 @@
 #define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
 #undef  TARGET_DEBUG_UNWIND_INFO
 #define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
+#undef  TARGET_EMUTLS_VAR_INIT
+#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN gcn_expand_builtin
 #undef  TARGET_FUNCTION_ARG
@@ -6039,11 +6057,11 @@
 #define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
 #undef  TARGET_GIMPLIFY_VA_ARG_EXPR
 #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
-#undef  TARGET_GOACC_ADJUST_PROPAGATION_RECORD
-#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
-  gcn_goacc_adjust_propagation_record
-#undef  TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
-#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
+#undef  TARGET_GOACC_CREATE_PROPAGATION_RECORD
+#define TARGET_GOACC_CREATE_PROPAGATION_RECORD \
+  gcn_goacc_create_propagation_record
+#undef  TARGET_GOACC_ADJUST_PRIVATE_DECL
+#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
 #undef  TARGET_GOACC_FORK_JOIN
 #define TARGET_GOACC_FORK_JOIN gcn_fork_join
 #undef  TARGET_GOACC_REDUCTION
diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h
index b3b2d1a..e60b431 100644
--- a/gcc/config/gcn/gcn.h
+++ b/gcc/config/gcn/gcn.h
@@ -160,9 +160,9 @@
 
 #define FIXED_REGISTERS {			    \
     /* Scalars.  */				    \
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		    \
+    1, 1, 0, 0, 1, 1, 0, 0, 1, 1,		    \
 /*		fp    sp    lr.  */		    \
-    0, 0, 0, 0, 1, 1, 1, 1, 0, 0,		    \
+    1, 1, 0, 0, 0, 0, 1, 1, 0, 0,		    \
 /*  exec_save, cc_save */			    \
     1, 1, 1, 1, 0, 0, 0, 0, 0, 0,		    \
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		    \
@@ -180,7 +180,7 @@
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,		    \
     /* VGRPs */					    \
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+    0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
@@ -203,7 +203,7 @@
 #define CALL_USED_REGISTERS {			    \
     /* Scalars.  */				    \
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 		    \
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 		    \
+    1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 		    \
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 		    \
     1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 		    \
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 		    \
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 7e5cf17..36908ba 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -285,6 +285,11 @@
 
 (define_attr "laneselect" "yes,no" (const_string "no"))
 
+; Identify instructions that require a "Manually Inserted Wait State" if
+; their inputs are overwritten by subsequent instructions.
+
+(define_attr "delayeduse" "yes,no" (const_string "no"))
+
 ;; }}}
 ;; {{{ Iterators useful across the wole machine description
 
@@ -475,15 +480,15 @@
     case 6:
       return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
     case 7:
-      return "s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)";
+      return "s_store_dword\t%1, %A0";
     case 8:
       return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
     case 9:
-      return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+      return "flat_store_dword\t%A0, %1%O0%g0";
     case 10:
       return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
     case 11:
-      return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+      return "global_store_dword\t%A0, %1%O0%g0";
     default:
       gcc_unreachable ();
     }
@@ -506,20 +511,20 @@
   s_movk_i32\t%0, %1
   s_mov_b32\t%0, %1
   s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
-  s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\texpcnt(0)
+  s_buffer_store%s1\t%1, s[0:3], %0
   s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dword\t%1, %A0
   v_mov_b32\t%0, %1
   v_readlane_b32\t%0, %1, 0
   v_writelane_b32\t%0, %1, 0
   flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store_dword\t%A0, %1%O0%g0
   v_mov_b32\t%0, %1
-  ds_write_b32\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write_b32\t%A0, %1%O0
   ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   s_mov_b32\t%0, %1
   global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store_dword\t%A0, %1%O0%g0"
   [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
 		     flat,vop1,ds,ds,sop1,flat,flat")
    (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*")
@@ -541,12 +546,12 @@
   v_readlane_b32\t%0, %1, 0
   v_writelane_b32\t%0, %1, 0
   flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store%s0\t%A0, %1%O0%g0
   v_mov_b32\t%0, %1
-  ds_write%b0\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write%b0\t%A0, %1%O0
   ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store%s0\t%A0, %1%O0%g0"
   [(set_attr "type"
 	     "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
    (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
@@ -564,18 +569,18 @@
   s_mov_b64\t%0, %1
   s_mov_b64\t%0, %1
   #
-  s_store_dwordx2\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dwordx2\t%1, %A0
   s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
   #
   #
   #
   #
   flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
-  flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
-  ds_write_b64\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  flat_store_dwordx2\t%A0, %1%O0%g0
+  ds_write_b64\t%A0, %1%O0
   ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
   global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  global_store_dwordx2\t%A0, %1%O0%g0"
   "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
     && !gcn_sgpr_move_p (operands[0], operands[1]))
    || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
@@ -617,16 +622,16 @@
   ""
   "@
   #
-  s_store_dwordx4\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_store_dwordx4\t%1, %A0
   s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_store_dwordx4\t%A0, %1%O0%g0
   flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
   #
   #
   #
-  global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  global_store_dwordx4\t%A0, %1%O0%g0
   global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  ds_write_b128\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_write_b128\t%A0, %1%O0
   ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
   "reload_completed
    && REG_P (operands[0])
@@ -647,6 +652,7 @@
   }
   [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
 		     ds,ds")
+   (set_attr "delayeduse" "*,*,yes,*,*,*,*,*,yes,*,*,*")
    (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
 
 ;; }}}
@@ -1612,7 +1618,8 @@
    global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
   [(set_attr "type" "smem,flat,flat")
    (set_attr "length" "12")
-   (set_attr "gcn_version" "gcn5,*,gcn5")])
+   (set_attr "gcn_version" "gcn5,*,gcn5")
+   (set_attr "delayeduse" "*,yes,yes")])
 
 (define_insn "sync_compare_and_swap<mode>_lds_insn"
   [(set (match_operand:SIDI 0 "register_operand"    "= v")
@@ -1715,14 +1722,11 @@
 	switch (which_alternative)
 	  {
 	  case 0:
-	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
-		   "s_waitcnt\texpcnt(0)";
+	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
 	  case 1:
-	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)";
+	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc";
 	  case 2:
-	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
-	           "s_waitcnt\texpcnt(0)";
+	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc";
 	  }
 	break;
       case MEMMODEL_ACQ_REL:
@@ -1732,13 +1736,13 @@
 	  {
 	  case 0:
 	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;s_dcache_inv_vol";
+		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
 	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
 	  case 2:
 	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
-		   "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
 	  }
 	break;
       }
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index 2fd3996..bdc878f 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -34,6 +34,9 @@
 EnumValue
 Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA)
 
+EnumValue
+Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA)
+
 march=
 Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_CARRIZO)
 Specify the name of the target GPU.
diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c
new file mode 100644
index 0000000..c96ed23
--- /dev/null
+++ b/gcc/config/gcn/mkoffload.c
@@ -0,0 +1,696 @@
+/* Offload image generation tool for AMD GCN.
+
+   Copyright (C) 2014-2019 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Munges GCN assembly into a C source file defining the GCN code as a
+   string.
+
+   This is not a complete assembler.  We presume the source is well
+   formed from the compiler and can die horribly if it is not.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "obstack.h"
+#include "diagnostic.h"
+#include "intl.h"
+#include <libgen.h>
+#include "collect-utils.h"
+#include "gomp-constants.h"
+
+const char tool_name[] = "gcn mkoffload";
+
+#define COMMENT_PREFIX "#"
+
+struct id_map
+{
+  id_map *next;
+  char *gcn_name;
+};
+
+static id_map *func_ids, **funcs_tail = &func_ids;
+static id_map *var_ids, **vars_tail = &var_ids;
+
+/* Files to unlink.  */
+static const char *gcn_s1_name;
+static const char *gcn_s2_name;
+static const char *gcn_o_name;
+static const char *gcn_cfile_name;
+
+enum offload_abi offload_abi = OFFLOAD_ABI_UNSET;
+
+/* Delete tempfiles.  */
+
+void
+tool_cleanup (bool from_signal ATTRIBUTE_UNUSED)
+{
+  if (gcn_cfile_name)
+    maybe_unlink (gcn_cfile_name);
+  if (gcn_s1_name)
+    maybe_unlink (gcn_s1_name);
+  if (gcn_s2_name)
+    maybe_unlink (gcn_s2_name);
+  if (gcn_o_name)
+    maybe_unlink (gcn_o_name);
+}
+
+static void
+mkoffload_cleanup (void)
+{
+  tool_cleanup (false);
+}
+
+/* Unlink FILE unless requested otherwise.  */
+
+void
+maybe_unlink (const char *file)
+{
+  if (!save_temps)
+    {
+      if (unlink_if_ordinary (file) && errno != ENOENT)
+	fatal_error (input_location, "deleting file %s: %m", file);
+    }
+  else if (verbose)
+    fprintf (stderr, "[Leaving %s]\n", file);
+}
+
+/* Add or change the value of an environment variable, outputting the
+   change to standard error if in verbose mode.  */
+
+static void
+xputenv (const char *string)
+{
+  if (verbose)
+    fprintf (stderr, "%s\n", string);
+  putenv (CONST_CAST (char *, string));
+}
+
+/* Read the whole input file.  It will be NUL terminated (but
+   remember, there could be a NUL in the file itself.  */
+
+static const char *
+read_file (FILE *stream, size_t *plen)
+{
+  size_t alloc = 16384;
+  size_t base = 0;
+  char *buffer;
+
+  if (!fseek (stream, 0, SEEK_END))
+    {
+      /* Get the file size.  */
+      long s = ftell (stream);
+      if (s >= 0)
+	alloc = s + 100;
+      fseek (stream, 0, SEEK_SET);
+    }
+  buffer = XNEWVEC (char, alloc);
+
+  for (;;)
+    {
+      size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
+
+      if (!n)
+	break;
+      base += n;
+      if (base + 1 == alloc)
+	{
+	  alloc *= 2;
+	  buffer = XRESIZEVEC (char, buffer, alloc);
+	}
+    }
+  buffer[base] = 0;
+  *plen = base;
+  return buffer;
+}
+
+/* Parse STR, saving found tokens into PVALUES and return their number.
+   Tokens are assumed to be delimited by ':'.  */
+
+static unsigned
+parse_env_var (const char *str, char ***pvalues)
+{
+  const char *curval, *nextval;
+  char **values;
+  unsigned num = 1, i;
+
+  curval = strchr (str, ':');
+  while (curval)
+    {
+      num++;
+      curval = strchr (curval + 1, ':');
+    }
+
+  values = (char **) xmalloc (num * sizeof (char *));
+  curval = str;
+  nextval = strchr (curval, ':');
+  if (nextval == NULL)
+    nextval = strchr (curval, '\0');
+
+  for (i = 0; i < num; i++)
+    {
+      int l = nextval - curval;
+      values[i] = (char *) xmalloc (l + 1);
+      memcpy (values[i], curval, l);
+      values[i][l] = 0;
+      curval = nextval + 1;
+      nextval = strchr (curval, ':');
+      if (nextval == NULL)
+	nextval = strchr (curval, '\0');
+    }
+  *pvalues = values;
+  return num;
+}
+
+/* Auxiliary function that frees elements of PTR and PTR itself.
+   N is number of elements to be freed.  If PTR is NULL, nothing is freed.
+   If an element is NULL, subsequent elements are not freed.  */
+
+static void
+free_array_of_ptrs (void **ptr, unsigned n)
+{
+  unsigned i;
+  if (!ptr)
+    return;
+  for (i = 0; i < n; i++)
+    {
+      if (!ptr[i])
+	break;
+      free (ptr[i]);
+    }
+  free (ptr);
+  return;
+}
+
+/* Check whether NAME can be accessed in MODE.  This is like access,
+   except that it never considers directories to be executable.  */
+
+static int
+access_check (const char *name, int mode)
+{
+  if (mode == X_OK)
+    {
+      struct stat st;
+
+      if (stat (name, &st) < 0 || S_ISDIR (st.st_mode))
+	return -1;
+    }
+
+  return access (name, mode);
+}
+
+/* Parse an input assembler file, extract the offload tables etc.,
+   and output (1) the assembler code, minus the tables (which can contain
+   problematic relocations), and (2) a C file with the offload tables
+   encoded as structured data.  */
+
+static void
+process_asm (FILE *in, FILE *out, FILE *cfile)
+{
+  int fn_count = 0, var_count = 0, dims_count = 0;
+  struct obstack fns_os, vars_os, varsizes_os, dims_os;
+  obstack_init (&fns_os);
+  obstack_init (&vars_os);
+  obstack_init (&varsizes_os);
+  obstack_init (&dims_os);
+
+  struct oaccdims
+  {
+    int d[3];
+    char *name;
+  } dim;
+
+  /* Always add _init_array and _fini_array as kernels.  */
+  obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
+  obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
+  fn_count += 2;
+
+  char buf[1000];
+  enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+  while (fgets (buf, sizeof (buf), in))
+    {
+      switch (state)
+	{
+	case IN_CODE:
+	  {
+	    if (sscanf (buf, " ;; OPENACC-DIMS: %d, %d, %d : %ms\n",
+			&dim.d[0], &dim.d[1], &dim.d[2], &dim.name) == 4)
+	      {
+		obstack_grow (&dims_os, &dim, sizeof (dim));
+		dims_count++;
+	      }
+	    break;
+	  }
+	case IN_VARS:
+	  {
+	    char *varname;
+	    unsigned varsize;
+	    if (sscanf (buf, " .8byte %ms\n", &varname))
+	      {
+		obstack_ptr_grow (&vars_os, varname);
+		fgets (buf, sizeof (buf), in);
+		if (!sscanf (buf, " .8byte %u\n", &varsize))
+		  abort ();
+		obstack_int_grow (&varsizes_os, varsize);
+		var_count++;
+
+		/* The HSA Runtime cannot locate the symbol if it is not
+		   exported from the kernel.  */
+		fprintf (out, "\t.global %s\n", varname);
+	      }
+	    break;
+	  }
+	case IN_FUNCS:
+	  {
+	    char *funcname;
+	    if (sscanf (buf, "\t.8byte\t%ms\n", &funcname))
+	      {
+		obstack_ptr_grow (&fns_os, funcname);
+		fn_count++;
+		continue;
+	      }
+	    break;
+	  }
+	}
+
+      char dummy;
+      if (sscanf (buf, " .section .gnu.offload_vars%c", &dummy) > 0)
+	state = IN_VARS;
+      else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
+	state = IN_FUNCS;
+      else if (sscanf (buf, " .section %c", &dummy) > 0
+	       || sscanf (buf, " .text%c", &dummy) > 0
+	       || sscanf (buf, " .bss%c", &dummy) > 0
+	       || sscanf (buf, " .data%c", &dummy) > 0
+	       || sscanf (buf, " .ident %c", &dummy) > 0)
+	state = IN_CODE;
+
+      if (state == IN_CODE)
+	fputs (buf, out);
+    }
+
+  char **fns = XOBFINISH (&fns_os, char **);
+  struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+
+  fprintf (cfile, "#include <stdlib.h>\n");
+  fprintf (cfile, "#include <stdbool.h>\n\n");
+
+  char **vars = XOBFINISH (&vars_os, char **);
+  unsigned *varsizes = XOBFINISH (&varsizes_os, unsigned *);
+  fprintf (cfile,
+	   "static const struct global_var_info {\n"
+	   "  const char *name;\n"
+	   "  void *address;\n"
+	   "} vars[] = {\n");
+  int i;
+  for (i = 0; i < var_count; ++i)
+    {
+      const char *sep = i < var_count - 1 ? "," : " ";
+      fprintf (cfile, "  { \"%s\", NULL }%s /* size: %u */\n", vars[i], sep,
+	       varsizes[i]);
+    }
+  fprintf (cfile, "};\n\n");
+
+  obstack_free (&vars_os, NULL);
+  obstack_free (&varsizes_os, NULL);
+
+  /* Dump out function idents.  */
+  fprintf (cfile, "static const struct hsa_kernel_description {\n"
+	   "  const char *name;\n"
+	   "  int oacc_dims[3];\n"
+	   "} gcn_kernels[] = {\n  ");
+  dim.d[0] = dim.d[1] = dim.d[2] = 0;
+  const char *comma;
+  for (comma = "", i = 0; i < fn_count; comma = ",\n  ", i++)
+    {
+      /* Find if we recorded dimensions for this function.  */
+      int *d = dim.d;		/* Previously zeroed.  */
+      for (int j = 0; j < dims_count; j++)
+	if (strcmp (fns[i], dims[j].name) == 0)
+	  {
+	    d = dims[j].d;
+	    break;
+	  }
+
+      fprintf (cfile, "%s{\"%s\", {%d, %d, %d}}", comma,
+	       fns[i], d[0], d[1], d[2]);
+
+      free (fns[i]);
+    }
+  fprintf (cfile, "\n};\n\n");
+
+  obstack_free (&fns_os, NULL);
+  for (i = 0; i < dims_count; i++)
+    free (dims[i].name);
+  obstack_free (&dims_os, NULL);
+}
+
+/* Embed an object file into a C source file.  */
+
+static void
+process_obj (FILE *in, FILE *cfile)
+{
+  size_t len = 0;
+  const char *input = read_file (in, &len);
+
+  /* Dump out an array containing the binary.
+     FIXME: do this with objcopy.  */
+  fprintf (cfile, "static unsigned char gcn_code[] = {");
+  for (size_t i = 0; i < len; i += 17)
+    {
+      fprintf (cfile, "\n\t");
+      for (size_t j = i; j < i + 17 && j < len; j++)
+	fprintf (cfile, "%3u,", (unsigned char) input[j]);
+    }
+  fprintf (cfile, "\n};\n\n");
+
+  fprintf (cfile,
+	   "static const struct gcn_image {\n"
+	   "  char magic[4];\n"
+	   "  size_t size;\n"
+	   "  void *image;\n"
+	   "} gcn_image = {\n"
+	   "  \"GCN\",\n"
+	   "  %zu,\n"
+	   "  gcn_code\n"
+	   "};\n\n",
+	   len);
+
+  fprintf (cfile,
+	   "static const struct gcn_image_desc {\n"
+	   "  const struct gcn_image *gcn_image;\n"
+	   "  unsigned kernel_count;\n"
+	   "  const struct hsa_kernel_description *kernel_infos;\n"
+	   "  unsigned global_variable_count;\n"
+	   "  const struct global_var_info *global_variables;\n"
+	   "} target_data = {\n"
+	   "  &gcn_image,\n"
+	   "  sizeof (gcn_kernels) / sizeof (gcn_kernels[0]),\n"
+	   "  gcn_kernels,\n"
+	   "  sizeof (vars) / sizeof (vars[0]),\n"
+	   "  vars\n"
+	   "};\n\n");
+
+  fprintf (cfile,
+	   "#ifdef __cplusplus\n"
+	   "extern \"C\" {\n"
+	   "#endif\n"
+	   "extern void GOMP_offload_register_ver"
+	   " (unsigned, const void *, int, const void *);\n"
+	   "extern void GOMP_offload_unregister_ver"
+	   " (unsigned, const void *, int, const void *);\n"
+	   "#ifdef __cplusplus\n"
+	   "}\n"
+	   "#endif\n\n");
+
+  fprintf (cfile, "extern const void *const __OFFLOAD_TABLE__[];\n\n");
+
+  fprintf (cfile, "static __attribute__((constructor)) void init (void)\n"
+	   "{\n"
+	   "  GOMP_offload_register_ver (%#x, __OFFLOAD_TABLE__,"
+	   " %d/*GCN*/, &target_data);\n"
+	   "};\n",
+	   GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+	   GOMP_DEVICE_GCN);
+
+  fprintf (cfile, "static __attribute__((destructor)) void fini (void)\n"
+	   "{\n"
+	   "  GOMP_offload_unregister_ver (%#x, __OFFLOAD_TABLE__,"
+	   " %d/*GCN*/, &target_data);\n"
+	   "};\n",
+	   GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+	   GOMP_DEVICE_GCN);
+}
+
+/* Compile a C file using the host compiler.  */
+
+static void
+compile_native (const char *infile, const char *outfile, const char *compiler)
+{
+  const char *collect_gcc_options = getenv ("COLLECT_GCC_OPTIONS");
+  if (!collect_gcc_options)
+    fatal_error (input_location,
+		 "environment variable COLLECT_GCC_OPTIONS must be set");
+
+  struct obstack argv_obstack;
+  obstack_init (&argv_obstack);
+  obstack_ptr_grow (&argv_obstack, compiler);
+  if (save_temps)
+    obstack_ptr_grow (&argv_obstack, "-save-temps");
+  if (verbose)
+    obstack_ptr_grow (&argv_obstack, "-v");
+  switch (offload_abi)
+    {
+    case OFFLOAD_ABI_LP64:
+      obstack_ptr_grow (&argv_obstack, "-m64");
+      break;
+    case OFFLOAD_ABI_ILP32:
+      obstack_ptr_grow (&argv_obstack, "-m32");
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  obstack_ptr_grow (&argv_obstack, infile);
+  obstack_ptr_grow (&argv_obstack, "-c");
+  obstack_ptr_grow (&argv_obstack, "-o");
+  obstack_ptr_grow (&argv_obstack, outfile);
+  obstack_ptr_grow (&argv_obstack, NULL);
+
+  const char **new_argv = XOBFINISH (&argv_obstack, const char **);
+  fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true);
+  obstack_free (&argv_obstack, NULL);
+}
+
+int
+main (int argc, char **argv)
+{
+  FILE *in = stdin;
+  FILE *out = stdout;
+  FILE *cfile = stdout;
+  const char *outname = 0, *offloadsrc = 0;
+
+  progname = "mkoffload";
+  diagnostic_initialize (global_dc, 0);
+
+  if (atexit (mkoffload_cleanup) != 0)
+    fatal_error (input_location, "atexit failed");
+
+  char *collect_gcc = getenv ("COLLECT_GCC");
+  if (collect_gcc == NULL)
+    fatal_error (input_location, "COLLECT_GCC must be set.");
+  const char *gcc_path = dirname (ASTRDUP (collect_gcc));
+  const char *gcc_exec = basename (ASTRDUP (collect_gcc));
+
+  size_t len = (strlen (gcc_path) + 1 + strlen (GCC_INSTALL_NAME) + 1);
+  char *driver = XALLOCAVEC (char, len);
+
+  if (strcmp (gcc_exec, collect_gcc) == 0)
+    /* collect_gcc has no path, so it was found in PATH.  Make sure we also
+       find accel-gcc in PATH.  */
+    gcc_path = NULL;
+
+  int driver_used = 0;
+  if (gcc_path != NULL)
+    driver_used = sprintf (driver, "%s/", gcc_path);
+  sprintf (driver + driver_used, "%s", GCC_INSTALL_NAME);
+
+  bool found = false;
+  if (gcc_path == NULL)
+    found = true;
+  else if (access_check (driver, X_OK) == 0)
+    found = true;
+  else
+    {
+      /* Don't use alloca pointer with XRESIZEVEC.  */
+      driver = NULL;
+      /* Look in all COMPILER_PATHs for GCC_INSTALL_NAME.  */
+      char **paths = NULL;
+      unsigned n_paths;
+      n_paths = parse_env_var (getenv ("COMPILER_PATH"), &paths);
+      for (unsigned i = 0; i < n_paths; i++)
+	{
+	  len = strlen (paths[i]) + 1 + strlen (GCC_INSTALL_NAME) + 1;
+	  driver = XRESIZEVEC (char, driver, len);
+	  sprintf (driver, "%s/%s", paths[i], GCC_INSTALL_NAME);
+	  if (access_check (driver, X_OK) == 0)
+	    {
+	      found = true;
+	      break;
+	    }
+	}
+      free_array_of_ptrs ((void **) paths, n_paths);
+    }
+
+  if (!found)
+    fatal_error (input_location,
+		 "offload compiler %s not found", GCC_INSTALL_NAME);
+
+  /* We may be called with all the arguments stored in some file and
+     passed with @file.  Expand them into argv before processing.  */
+  expandargv (&argc, &argv);
+
+  /* Scan the argument vector.  */
+  bool fopenmp = false;
+  bool fopenacc = false;
+  for (int i = 1; i < argc; i++)
+    {
+#define STR "-foffload-abi="
+      if (strncmp (argv[i], STR, strlen (STR)) == 0)
+	{
+	  if (strcmp (argv[i] + strlen (STR), "lp64") == 0)
+	    offload_abi = OFFLOAD_ABI_LP64;
+	  else if (strcmp (argv[i] + strlen (STR), "ilp32") == 0)
+	    offload_abi = OFFLOAD_ABI_ILP32;
+	  else
+	    fatal_error (input_location,
+			 "unrecognizable argument of option " STR);
+	}
+#undef STR
+      else if (strcmp (argv[i], "-fopenmp") == 0)
+	fopenmp = true;
+      else if (strcmp (argv[i], "-fopenacc") == 0)
+	fopenacc = true;
+      else if (strcmp (argv[i], "-save-temps") == 0)
+	save_temps = true;
+      else if (strcmp (argv[i], "-v") == 0)
+	verbose = true;
+    }
+  if (!(fopenacc ^ fopenmp))
+    fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
+
+  const char *abi;
+  switch (offload_abi)
+    {
+    case OFFLOAD_ABI_LP64:
+      abi = "-m64";
+      break;
+    case OFFLOAD_ABI_ILP32:
+      abi = "-m32";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  gcn_s1_name = make_temp_file (".mkoffload.1.s");
+  gcn_s2_name = make_temp_file (".mkoffload.2.s");
+  gcn_o_name = make_temp_file (".mkoffload.hsaco");
+  gcn_cfile_name = make_temp_file (".c");
+
+  /* Build arguments for compiler pass.  */
+  struct obstack cc_argv_obstack;
+  obstack_init (&cc_argv_obstack);
+  obstack_ptr_grow (&cc_argv_obstack, driver);
+  obstack_ptr_grow (&cc_argv_obstack, "-S");
+
+  if (save_temps)
+    obstack_ptr_grow (&cc_argv_obstack, "-save-temps");
+  if (verbose)
+    obstack_ptr_grow (&cc_argv_obstack, "-v");
+  obstack_ptr_grow (&cc_argv_obstack, abi);
+  obstack_ptr_grow (&cc_argv_obstack, "-xlto");
+  if (fopenmp)
+    obstack_ptr_grow (&cc_argv_obstack, "-mgomp");
+
+  for (int ix = 1; ix != argc; ix++)
+    {
+      if (!strcmp (argv[ix], "-o") && ix + 1 != argc)
+	outname = argv[++ix];
+      else
+	{
+	  obstack_ptr_grow (&cc_argv_obstack, argv[ix]);
+
+	  if (argv[ix][0] != '-')
+	    offloadsrc = argv[ix];
+	}
+    }
+
+  obstack_ptr_grow (&cc_argv_obstack, "-o");
+  obstack_ptr_grow (&cc_argv_obstack, gcn_s1_name);
+  obstack_ptr_grow (&cc_argv_obstack,
+		    concat ("-mlocal-symbol-id=", offloadsrc, NULL));
+  obstack_ptr_grow (&cc_argv_obstack, NULL);
+  const char **cc_argv = XOBFINISH (&cc_argv_obstack, const char **);
+
+  /* Build arguments for assemble/link pass.  */
+  struct obstack ld_argv_obstack;
+  obstack_init (&ld_argv_obstack);
+  obstack_ptr_grow (&ld_argv_obstack, driver);
+  obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name);
+  obstack_ptr_grow (&ld_argv_obstack, "-lgomp");
+
+  for (int i = 1; i < argc; i++)
+    if (strncmp (argv[i], "-l", 2) == 0
+	|| strncmp (argv[i], "-Wl", 3) == 0
+	|| strncmp (argv[i], "-march", 6) == 0)
+      obstack_ptr_grow (&ld_argv_obstack, argv[i]);
+
+  obstack_ptr_grow (&ld_argv_obstack, "-o");
+  obstack_ptr_grow (&ld_argv_obstack, gcn_o_name);
+  obstack_ptr_grow (&ld_argv_obstack, NULL);
+  const char **ld_argv = XOBFINISH (&ld_argv_obstack, const char **);
+
+  /* Clean up unhelpful environment variables.  */
+  char *execpath = getenv ("GCC_EXEC_PREFIX");
+  char *cpath = getenv ("COMPILER_PATH");
+  char *lpath = getenv ("LIBRARY_PATH");
+  unsetenv ("GCC_EXEC_PREFIX");
+  unsetenv ("COMPILER_PATH");
+  unsetenv ("LIBRARY_PATH");
+
+  /* Run the compiler pass.  */
+  fork_execute (cc_argv[0], CONST_CAST (char **, cc_argv), true);
+  obstack_free (&cc_argv_obstack, NULL);
+
+  in = fopen (gcn_s1_name, "r");
+  if (!in)
+    fatal_error (input_location, "cannot open intermediate gcn asm file");
+
+  out = fopen (gcn_s2_name, "w");
+  if (!out)
+    fatal_error (input_location, "cannot open '%s'", gcn_s2_name);
+
+  cfile = fopen (gcn_cfile_name, "w");
+  if (!cfile)
+    fatal_error (input_location, "cannot open '%s'", gcn_cfile_name);
+
+  process_asm (in, out, cfile);
+
+  fclose (in);
+  fclose (out);
+
+  /* Run the assemble/link pass.  */
+  fork_execute (ld_argv[0], CONST_CAST (char **, ld_argv), true);
+  obstack_free (&ld_argv_obstack, NULL);
+
+  in = fopen (gcn_o_name, "r");
+  if (!in)
+    fatal_error (input_location, "cannot open intermediate gcn obj file");
+
+  process_obj (in, cfile);
+
+  fclose (in);
+  fclose (cfile);
+
+  xputenv (concat ("GCC_EXEC_PREFIX=", execpath, NULL));
+  xputenv (concat ("COMPILER_PATH=", cpath, NULL));
+  xputenv (concat ("LIBRARY_PATH=", lpath, NULL));
+
+  compile_native (gcn_cfile_name, outname, collect_gcc);
+
+  return 0;
+}
diff --git a/gcc/config/gcn/offload.h b/gcc/config/gcn/offload.h
new file mode 100644
index 0000000..795ee3f
--- /dev/null
+++ b/gcc/config/gcn/offload.h
@@ -0,0 +1,35 @@
+/* Support for AMD GCN offloading.
+
+   Copyright (C) 2014-2019 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_GCN_OFFLOAD_H
+#define GCC_GCN_OFFLOAD_H
+
+/* Support for OpenACC acc_on_device.  */
+
+#include "gomp-constants.h"
+
+#define ACCEL_COMPILER_acc_device GOMP_DEVICE_GCN
+
+#endif
diff --git a/gcc/config/gcn/t-gcn-hsa b/gcc/config/gcn/t-gcn-hsa
index 085ba429..1600a58 100644
--- a/gcc/config/gcn/t-gcn-hsa
+++ b/gcc/config/gcn/t-gcn-hsa
@@ -42,8 +42,8 @@
 gcn-run$(exeext): gcn-run.o
 	+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl
 
-MULTILIB_OPTIONS = march=gfx900
-MULTILIB_DIRNAMES = gcn5
+MULTILIB_OPTIONS = march=gfx900 march=gfx906
+MULTILIB_DIRNAMES = gfx900 gfx906
 
 PASSES_EXTRA += $(srcdir)/config/gcn/gcn-passes.def
 gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1bca5a7..1dda118 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -51253,6 +51253,35 @@
   return ROUND_UP (bytes, UNITS_PER_WORD);
 }
 
+/* Return TRUE if offloaded OpenACC target-code regions should have their
+   parameters passed as separate function arguments, rather than in an array.
+   This can be a performance win on some (NVidia) GPUs.  */
+
+bool
+ix86_goacc_explode_args (void)
+{
+#ifdef OFFLOAD_TARGETS
+  const char *offload_targets = OFFLOAD_TARGETS;
+  if (strstr (offload_targets, "nvptx"))
+    {
+      if (strchr (offload_targets, ','))
+	{
+	  static bool warned_ptx_args = false;
+	  if (!warned_ptx_args)
+	    {
+	      warning (0, "NVidia PTX parameter-passing optimization disabled "
+		       "with multiple offload targets");
+	      warned_ptx_args = true;
+	    }
+	  return false;
+	}
+      return true;
+    }
+
+#endif
+  return false;
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -52028,6 +52057,9 @@
 #define TARGET_GET_MULTILIB_ABI_NAME \
   ix86_get_multilib_abi_name
 
+#undef TARGET_GOACC_EXPLODE_ARGS
+#define TARGET_GOACC_EXPLODE_ARGS ix86_goacc_explode_args
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 1986e79..bf93886 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -74,6 +74,9 @@
 #include "cfgloop.h"
 #include "fold-const.h"
 #include "intl.h"
+#include "tree-hash-traits.h"
+#include "omp-sese.h"
+#include "tree-pretty-print.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -166,6 +169,12 @@
 static unsigned vector_red_partition;
 static GTY(()) rtx vector_red_sym;
 
+/* Shared memory block for gang-private variables.  */
+static unsigned gangprivate_shared_size;
+static unsigned gangprivate_shared_align;
+static GTY(()) rtx gangprivate_shared_sym;
+static hash_map<tree_decl_hash, unsigned int> gangprivate_shared_hmap;
+
 /* Global lock variable, needed for 128bit worker & gang reductions.  */
 static GTY(()) tree global_lock_var;
 
@@ -247,6 +256,10 @@
   vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
   vector_red_partition = 0;
 
+  gangprivate_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gangprivate_shared");
+  SET_SYMBOL_DATA_AREA (gangprivate_shared_sym, DATA_AREA_SHARED);
+  gangprivate_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
   diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
   diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
   diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
@@ -3010,6 +3023,52 @@
   return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
 }
 
+/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
+   mark that as unified.  We expect to be in a single block.  */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+  rtx_insn *probe = unified;
+  rtx cond_reg = SET_DEST (PATTERN (unified));
+  rtx pat = NULL_RTX;
+
+  /* Find the comparison.  (We could skip this and simply scan to he
+     blocks' terminating branch, if we didn't care for self
+     checking.)  */
+  for (;;)
+    {
+      probe = next_real_insn (probe);
+      if (!probe)
+	break;
+      pat = PATTERN (probe);
+
+      if (GET_CODE (pat) == SET
+	  && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+	  && XEXP (SET_SRC (pat), 0) == cond_reg)
+	break;
+      gcc_assert (NONJUMP_INSN_P (probe));
+    }
+  gcc_assert (pat);
+  rtx pred_reg = SET_DEST (pat);
+
+  /* Find the branch.  */
+  do
+    probe = NEXT_INSN (probe);
+  while (!JUMP_P (probe));
+
+  pat = PATTERN (probe);
+  rtx itec = XEXP (SET_SRC (pat), 0);
+  gcc_assert (XEXP (itec, 0) == pred_reg);
+
+  /* Mark the branch's condition as unified.  */
+  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+			       UNSPEC_BR_UNIFIED);
+  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+  gcc_assert (ok);
+}
+
 /* Loop structure of the function.  The entire function is described as
    a NULL loop.  */
 
@@ -3111,6 +3170,9 @@
 	    continue;
 	  switch (recog_memoized (insn))
 	    {
+	    case CODE_FOR_cond_uni:
+	      nvptx_propagate_unified (insn);
+	      /* FALLTHROUGH */
 	    default:
 	      seen_insn = true;
 	      continue;
@@ -3320,625 +3382,6 @@
   return par;
 }
 
-/* Analyse a group of BBs within a partitioned region and create N
-   Single-Entry-Single-Exit regions.  Some of those regions will be
-   trivial ones consisting of a single BB.  The blocks of a
-   partitioned region might form a set of disjoint graphs -- because
-   the region encloses a differently partitoned sub region.
-
-   We use the linear time algorithm described in 'Finding Regions Fast:
-   Single Entry Single Exit and control Regions in Linear Time'
-   Johnson, Pearson & Pingali.  That algorithm deals with complete
-   CFGs, where a back edge is inserted from END to START, and thus the
-   problem becomes one of finding equivalent loops.
-
-   In this case we have a partial CFG.  We complete it by redirecting
-   any incoming edge to the graph to be from an arbitrary external BB,
-   and similarly redirecting any outgoing edge to be to  that BB.
-   Thus we end up with a closed graph.
-
-   The algorithm works by building a spanning tree of an undirected
-   graph and keeping track of back edges from nodes further from the
-   root in the tree to nodes nearer to the root in the tree.  In the
-   description below, the root is up and the tree grows downwards.
-
-   We avoid having to deal with degenerate back-edges to the same
-   block, by splitting each BB into 3 -- one for input edges, one for
-   the node itself and one for the output edges.  Such back edges are
-   referred to as 'Brackets'.  Cycle equivalent nodes will have the
-   same set of brackets.
-   
-   Determining bracket equivalency is done by maintaining a list of
-   brackets in such a manner that the list length and final bracket
-   uniquely identify the set.
-
-   We use coloring to mark all BBs with cycle equivalency with the
-   same color.  This is the output of the 'Finding Regions Fast'
-   algorithm.  Notice it doesn't actually find the set of nodes within
-   a particular region, just unorderd sets of nodes that are the
-   entries and exits of SESE regions.
-   
-   After determining cycle equivalency, we need to find the minimal
-   set of SESE regions.  Do this with a DFS coloring walk of the
-   complete graph.  We're either 'looking' or 'coloring'.  When
-   looking, and we're in the subgraph, we start coloring the color of
-   the current node, and remember that node as the start of the
-   current color's SESE region.  Every time we go to a new node, we
-   decrement the count of nodes with thet color.  If it reaches zero,
-   we remember that node as the end of the current color's SESE region
-   and return to 'looking'.  Otherwise we color the node the current
-   color.
-
-   This way we end up with coloring the inside of non-trivial SESE
-   regions with the color of that region.  */
-
-/* A pair of BBs.  We use this to represent SESE regions.  */
-typedef std::pair<basic_block, basic_block> bb_pair_t;
-typedef auto_vec<bb_pair_t> bb_pair_vec_t;
-
-/* A node in the undirected CFG.  The discriminator SECOND indicates just
-   above or just below the BB idicated by FIRST.  */
-typedef std::pair<basic_block, int> pseudo_node_t;
-
-/* A bracket indicates an edge towards the root of the spanning tree of the
-   undirected graph.  Each bracket has a color, determined
-   from the currrent set of brackets.  */
-struct bracket
-{
-  pseudo_node_t back; /* Back target */
-
-  /* Current color and size of set.  */
-  unsigned color;
-  unsigned size;
-
-  bracket (pseudo_node_t back_)
-  : back (back_), color (~0u), size (~0u)
-  {
-  }
-
-  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
-  {
-    if (length != size)
-      {
-	size = length;
-	color = color_counts.length ();
-	color_counts.quick_push (0);
-      }
-    color_counts[color]++;
-    return color;
-  }
-};
-
-typedef auto_vec<bracket> bracket_vec_t;
-
-/* Basic block info for finding SESE regions.    */
-
-struct bb_sese
-{
-  int node;  /* Node number in spanning tree.  */
-  int parent; /* Parent node number.  */
-
-  /* The algorithm splits each node A into Ai, A', Ao. The incoming
-     edges arrive at pseudo-node Ai and the outgoing edges leave at
-     pseudo-node Ao.  We have to remember which way we arrived at a
-     particular node when generating the spanning tree.  dir > 0 means
-     we arrived at Ai, dir < 0 means we arrived at Ao.  */
-  int dir;
-
-  /* Lowest numbered pseudo-node reached via a backedge from thsis
-     node, or any descendant.  */
-  pseudo_node_t high;
-
-  int color;  /* Cycle-equivalence color  */
-
-  /* Stack of brackets for this node.  */
-  bracket_vec_t brackets;
-
-  bb_sese (unsigned node_, unsigned p, int dir_)
-  :node (node_), parent (p), dir (dir_)
-  {
-  }
-  ~bb_sese ();
-
-  /* Push a bracket ending at BACK.  */
-  void push (const pseudo_node_t &back)
-  {
-    if (dump_file)
-      fprintf (dump_file, "Pushing backedge %d:%+d\n",
-	       back.first ? back.first->index : 0, back.second);
-    brackets.safe_push (bracket (back));
-  }
-  
-  void append (bb_sese *child);
-  void remove (const pseudo_node_t &);
-
-  /* Set node's color.  */
-  void set_color (auto_vec<unsigned> &color_counts)
-  {
-    color = brackets.last ().get_color (color_counts, brackets.length ());
-  }
-};
-
-bb_sese::~bb_sese ()
-{
-}
-
-/* Destructively append CHILD's brackets.  */
-
-void
-bb_sese::append (bb_sese *child)
-{
-  if (int len = child->brackets.length ())
-    {
-      int ix;
-
-      if (dump_file)
-	{
-	  for (ix = 0; ix < len; ix++)
-	    {
-	      const pseudo_node_t &pseudo = child->brackets[ix].back;
-	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
-		       child->node, pseudo.first ? pseudo.first->index : 0,
-		       pseudo.second);
-	    }
-	}
-      if (!brackets.length ())
-	std::swap (brackets, child->brackets);
-      else
-	{
-	  brackets.reserve (len);
-	  for (ix = 0; ix < len; ix++)
-	    brackets.quick_push (child->brackets[ix]);
-	}
-    }
-}
-
-/* Remove brackets that terminate at PSEUDO.  */
-
-void
-bb_sese::remove (const pseudo_node_t &pseudo)
-{
-  unsigned removed = 0;
-  int len = brackets.length ();
-
-  for (int ix = 0; ix < len; ix++)
-    {
-      if (brackets[ix].back == pseudo)
-	{
-	  if (dump_file)
-	    fprintf (dump_file, "Removing backedge %d:%+d\n",
-		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
-	  removed++;
-	}
-      else if (removed)
-	brackets[ix-removed] = brackets[ix];
-    }
-  while (removed--)
-    brackets.pop ();
-}
-
-/* Accessors for BB's aux pointer.  */
-#define BB_SET_SESE(B, S) ((B)->aux = (S))
-#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
-
-/* DFS walk creating SESE data structures.  Only cover nodes with
-   BB_VISITED set.  Append discovered blocks to LIST.  We number in
-   increments of 3 so that the above and below pseudo nodes can be
-   implicitly numbered too.  */
-
-static int
-nvptx_sese_number (int n, int p, int dir, basic_block b,
-		   auto_vec<basic_block> *list)
-{
-  if (BB_GET_SESE (b))
-    return n;
-
-  if (dump_file)
-    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
-	     b->index, n, p, dir);
-  
-  BB_SET_SESE (b, new bb_sese (n, p, dir));
-  p = n;
-      
-  n += 3;
-  list->quick_push (b);
-
-  /* First walk the nodes on the 'other side' of this node, then walk
-     the nodes on the same side.  */
-  for (unsigned ix = 2; ix; ix--)
-    {
-      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
-      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
-		       : offsetof (edge_def, src));
-      edge e;
-      edge_iterator (ei);
-
-      FOR_EACH_EDGE (e, ei, edges)
-	{
-	  basic_block target = *(basic_block *)((char *)e + offset);
-	  
-	  if (target->flags & BB_VISITED)
-	    n = nvptx_sese_number (n, p, dir, target, list);
-	}
-      dir = -dir;
-    }
-  return n;
-}
-
-/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
-   EDGES are the outgoing edges and OFFSET is the offset to the src
-   or dst block on the edges.   */
-
-static void
-nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
-		   vec<edge, va_gc> *edges, size_t offset)
-{
-  edge e;
-  edge_iterator (ei);
-  int hi_back = depth;
-  pseudo_node_t node_back (0, depth);
-  int hi_child = depth;
-  pseudo_node_t node_child (0, depth);
-  basic_block child = NULL;
-  unsigned num_children = 0;
-  int usd = -dir * sese->dir;
-
-  if (dump_file)
-    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
-	     me->index, sese->node, dir);
-
-  if (dir < 0)
-    {
-      /* This is the above pseudo-child.  It has the BB itself as an
-	 additional child node.  */
-      node_child = sese->high;
-      hi_child = node_child.second;
-      if (node_child.first)
-	hi_child += BB_GET_SESE (node_child.first)->node;
-      num_children++;
-    }
-
-  /* Examine each edge.
-     - if it is a child (a) append its bracket list and (b) record
-          whether it is the child with the highest reaching bracket.
-     - if it is an edge to ancestor, record whether it's the highest
-          reaching backlink.  */
-  FOR_EACH_EDGE (e, ei, edges)
-    {
-      basic_block target = *(basic_block *)((char *)e + offset);
-
-      if (bb_sese *t_sese = BB_GET_SESE (target))
-	{
-	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
-	    {
-	      /* Child node.  Append its bracket list. */
-	      num_children++;
-	      sese->append (t_sese);
-
-	      /* Compare it's hi value.  */
-	      int t_hi = t_sese->high.second;
-
-	      if (basic_block child_hi_block = t_sese->high.first)
-		t_hi += BB_GET_SESE (child_hi_block)->node;
-
-	      if (hi_child > t_hi)
-		{
-		  hi_child = t_hi;
-		  node_child = t_sese->high;
-		  child = target;
-		}
-	    }
-	  else if (t_sese->node < sese->node + dir
-		   && !(dir < 0 && sese->parent == t_sese->node))
-	    {
-	      /* Non-parental ancestor node -- a backlink.  */
-	      int d = usd * t_sese->dir;
-	      int back = t_sese->node + d;
-	
-	      if (hi_back > back)
-		{
-		  hi_back = back;
-		  node_back = pseudo_node_t (target, d);
-		}
-	    }
-	}
-      else
-	{ /* Fallen off graph, backlink to entry node.  */
-	  hi_back = 0;
-	  node_back = pseudo_node_t (0, 0);
-	}
-    }
-
-  /* Remove any brackets that terminate at this pseudo node.  */
-  sese->remove (pseudo_node_t (me, dir));
-
-  /* Now push any backlinks from this pseudo node.  */
-  FOR_EACH_EDGE (e, ei, edges)
-    {
-      basic_block target = *(basic_block *)((char *)e + offset);
-      if (bb_sese *t_sese = BB_GET_SESE (target))
-	{
-	  if (t_sese->node < sese->node + dir
-	      && !(dir < 0 && sese->parent == t_sese->node))
-	    /* Non-parental ancestor node - backedge from me.  */
-	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
-	}
-      else
-	{
-	  /* back edge to entry node */
-	  sese->push (pseudo_node_t (0, 0));
-	}
-    }
-  
- /* If this node leads directly or indirectly to a no-return region of
-     the graph, then fake a backedge to entry node.  */
-  if (!sese->brackets.length () || !edges || !edges->length ())
-    {
-      hi_back = 0;
-      node_back = pseudo_node_t (0, 0);
-      sese->push (node_back);
-    }
-
-  /* Record the highest reaching backedge from us or a descendant.  */
-  sese->high = hi_back < hi_child ? node_back : node_child;
-
-  if (num_children > 1)
-    {
-      /* There is more than one child -- this is a Y shaped piece of
-	 spanning tree.  We have to insert a fake backedge from this
-	 node to the highest ancestor reached by not-the-highest
-	 reaching child.  Note that there may be multiple children
-	 with backedges to the same highest node.  That's ok and we
-	 insert the edge to that highest node.  */
-      hi_child = depth;
-      if (dir < 0 && child)
-	{
-	  node_child = sese->high;
-	  hi_child = node_child.second;
-	  if (node_child.first)
-	    hi_child += BB_GET_SESE (node_child.first)->node;
-	}
-
-      FOR_EACH_EDGE (e, ei, edges)
-	{
-	  basic_block target = *(basic_block *)((char *)e + offset);
-
-	  if (target == child)
-	    /* Ignore the highest child. */
-	    continue;
-
-	  bb_sese *t_sese = BB_GET_SESE (target);
-	  if (!t_sese)
-	    continue;
-	  if (t_sese->parent != sese->node)
-	    /* Not a child. */
-	    continue;
-
-	  /* Compare its hi value.  */
-	  int t_hi = t_sese->high.second;
-
-	  if (basic_block child_hi_block = t_sese->high.first)
-	    t_hi += BB_GET_SESE (child_hi_block)->node;
-
-	  if (hi_child > t_hi)
-	    {
-	      hi_child = t_hi;
-	      node_child = t_sese->high;
-	    }
-	}
-      
-      sese->push (node_child);
-    }
-}
-
-
-/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
-   proceed to successors.  Set SESE entry and exit nodes of
-   REGIONS.  */
-
-static void
-nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
-		  basic_block block, int coloring)
-{
-  bb_sese *sese = BB_GET_SESE (block);
-
-  if (block->flags & BB_VISITED)
-    {
-      /* If we've already encountered this block, either we must not
-	 be coloring, or it must have been colored the current color.  */
-      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
-      return;
-    }
-  
-  block->flags |= BB_VISITED;
-
-  if (sese)
-    {
-      if (coloring < 0)
-	{
-	  /* Start coloring a region.  */
-	  regions[sese->color].first = block;
-	  coloring = sese->color;
-	}
-
-      if (!--color_counts[sese->color] && sese->color == coloring)
-	{
-	  /* Found final block of SESE region.  */
-	  regions[sese->color].second = block;
-	  coloring = -1;
-	}
-      else
-	/* Color the node, so we can assert on revisiting the node
-	   that the graph is indeed SESE.  */
-	sese->color = coloring;
-    }
-  else
-    /* Fallen off the subgraph, we cannot be coloring.  */
-    gcc_assert (coloring < 0);
-
-  /* Walk each successor block.  */
-  if (block->succs && block->succs->length ())
-    {
-      edge e;
-      edge_iterator ei;
-      
-      FOR_EACH_EDGE (e, ei, block->succs)
-	nvptx_sese_color (color_counts, regions, e->dest, coloring);
-    }
-  else
-    gcc_assert (coloring < 0);
-}
-
-/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
-   end up with NULL entries in it.  */
-
-static void
-nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
-{
-  basic_block block;
-  int ix;
-
-  /* First clear each BB of the whole function.  */ 
-  FOR_ALL_BB_FN (block, cfun)
-    {
-      block->flags &= ~BB_VISITED;
-      BB_SET_SESE (block, 0);
-    }
-
-  /* Mark blocks in the function that are in this graph.  */
-  for (ix = 0; blocks.iterate (ix, &block); ix++)
-    block->flags |= BB_VISITED;
-
-  /* Counts of nodes assigned to each color.  There cannot be more
-     colors than blocks (and hopefully there will be fewer).  */
-  auto_vec<unsigned> color_counts;
-  color_counts.reserve (blocks.length ());
-
-  /* Worklist of nodes in the spanning tree.  Again, there cannot be
-     more nodes in the tree than blocks (there will be fewer if the
-     CFG of blocks is disjoint).  */
-  auto_vec<basic_block> spanlist;
-  spanlist.reserve (blocks.length ());
-
-  /* Make sure every block has its cycle class determined.  */
-  for (ix = 0; blocks.iterate (ix, &block); ix++)
-    {
-      if (BB_GET_SESE (block))
-	/* We already met this block in an earlier graph solve.  */
-	continue;
-
-      if (dump_file)
-	fprintf (dump_file, "Searching graph starting at %d\n", block->index);
-      
-      /* Number the nodes reachable from block initial DFS order.  */
-      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
-
-      /* Now walk in reverse DFS order to find cycle equivalents.  */
-      while (spanlist.length ())
-	{
-	  block = spanlist.pop ();
-	  bb_sese *sese = BB_GET_SESE (block);
-
-	  /* Do the pseudo node below.  */
-	  nvptx_sese_pseudo (block, sese, depth, +1,
-			     sese->dir > 0 ? block->succs : block->preds,
-			     (sese->dir > 0 ? offsetof (edge_def, dest)
-			      : offsetof (edge_def, src)));
-	  sese->set_color (color_counts);
-	  /* Do the pseudo node above.  */
-	  nvptx_sese_pseudo (block, sese, depth, -1,
-			     sese->dir < 0 ? block->succs : block->preds,
-			     (sese->dir < 0 ? offsetof (edge_def, dest)
-			      : offsetof (edge_def, src)));
-	}
-      if (dump_file)
-	fprintf (dump_file, "\n");
-    }
-
-  if (dump_file)
-    {
-      unsigned count;
-      const char *comma = "";
-      
-      fprintf (dump_file, "Found %d cycle equivalents\n",
-	       color_counts.length ());
-      for (ix = 0; color_counts.iterate (ix, &count); ix++)
-	{
-	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
-
-	  comma = "";
-	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
-	    if (BB_GET_SESE (block)->color == ix)
-	      {
-		block->flags |= BB_VISITED;
-		fprintf (dump_file, "%s%d", comma, block->index);
-		comma=",";
-	      }
-	  fprintf (dump_file, "}");
-	  comma = ", ";
-	}
-      fprintf (dump_file, "\n");
-   }
-  
-  /* Now we've colored every block in the subgraph.  We now need to
-     determine the minimal set of SESE regions that cover that
-     subgraph.  Do this with a DFS walk of the complete function.
-     During the walk we're either 'looking' or 'coloring'.  When we
-     reach the last node of a particular color, we stop coloring and
-     return to looking.  */
-
-  /* There cannot be more SESE regions than colors.  */
-  regions.reserve (color_counts.length ());
-  for (ix = color_counts.length (); ix--;)
-    regions.quick_push (bb_pair_t (0, 0));
-
-  for (ix = 0; blocks.iterate (ix, &block); ix++)
-    block->flags &= ~BB_VISITED;
-
-  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
-
-  if (dump_file)
-    {
-      const char *comma = "";
-      int len = regions.length ();
-      
-      fprintf (dump_file, "SESE regions:");
-      for (ix = 0; ix != len; ix++)
-	{
-	  basic_block from = regions[ix].first;
-	  basic_block to = regions[ix].second;
-
-	  if (from)
-	    {
-	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
-	      if (to != from)
-		fprintf (dump_file, "->%d", to->index);
-
-	      int color = BB_GET_SESE (from)->color;
-
-	      /* Print the blocks within the region (excluding ends).  */
-	      FOR_EACH_BB_FN (block, cfun)
-		{
-		  bb_sese *sese = BB_GET_SESE (block);
-
-		  if (sese && sese->color == color
-		      && block != from && block != to)
-		    fprintf (dump_file, ".%d", block->index);
-		}
-	      fprintf (dump_file, "}");
-	    }
-	  comma = ",";
-	}
-      fprintf (dump_file, "\n\n");
-    }
-  
-  for (ix = 0; blocks.iterate (ix, &block); ix++)
-    delete BB_GET_SESE (block);
-}
-
-#undef BB_SET_SESE
-#undef BB_GET_SESE
-
 /* Propagate live state at the start of a partitioned region.  IS_CALL
    indicates whether the propagation is for a (partitioned) call
    instruction.  BLOCK provides the live register information, and
@@ -4761,7 +4204,7 @@
 	  /* Neuter whole SESE regions.  */
 	  bb_pair_vec_t regions;
 
-	  nvptx_find_sese (par->blocks, regions);
+	  omp_find_sese (par->blocks, regions);
 	  len = regions.length ();
 	  for (ix = 0; ix != len; ix++)
 	    {
@@ -5237,6 +4680,10 @@
     write_shared_buffer (asm_out_file, vector_red_sym,
 			 vector_red_align, vector_red_size);
 
+  if (gangprivate_shared_size)
+    write_shared_buffer (asm_out_file, gangprivate_shared_sym,
+			 gangprivate_shared_align, gangprivate_shared_size);
+
   if (need_softstack_decl)
     {
       write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
@@ -5368,6 +4815,10 @@
 			 NULL_RTX, mode, EXPAND_NORMAL);
   rtx pat;
 
+  /* 'mem' might be a PARM_DECL.  If so, convert it to a register.  */
+  if (!REG_P (mem))
+    mem = copy_to_mode_reg (GET_MODE (mem), mem);
+
   mem = gen_rtx_MEM (mode, mem);
   if (!REG_P (cmp))
     cmp = copy_to_mode_reg (mode, cmp);
@@ -5384,6 +4835,21 @@
   return target;
 }
 
+/* Expander for the compare unified builtin.  */
+
+static rtx
+nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+    return target;
+
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
+			 NULL_RTX, mode, EXPAND_NORMAL);
+
+  emit_insn (gen_cond_uni (target, src));
+
+  return target;
+}
 
 /* Codes for all the NVPTX builtins.  */
 enum nvptx_builtins
@@ -5394,6 +4860,7 @@
   NVPTX_BUILTIN_VECTOR_ADDR,
   NVPTX_BUILTIN_CMP_SWAP,
   NVPTX_BUILTIN_CMP_SWAPLL,
+  NVPTX_BUILTIN_COND_UNI,
   NVPTX_BUILTIN_MAX
 };
 
@@ -5433,6 +4900,7 @@
        (PTRVOID, ST, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
+  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
 
 #undef DEF
 #undef ST
@@ -5468,6 +4936,9 @@
     case NVPTX_BUILTIN_CMP_SWAPLL:
       return nvptx_expand_cmp_swap (exp, target, mode, ignore);
 
+    case NVPTX_BUILTIN_COND_UNI:
+      return nvptx_expand_cond_uni (exp, target, mode, ignore);
+
     default: gcc_unreachable ();
     }
 }
@@ -5787,7 +5258,7 @@
 
 static void
 nvptx_generate_vector_shuffle (location_t loc,
-			       tree dest_var, tree var, unsigned shift,
+			       tree dest_var, tree var, tree bits,
 			       gimple_seq *seq)
 {
   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
@@ -5810,7 +5281,6 @@
     }
   
   tree call = nvptx_builtin_decl (fn, true);
-  tree bits = build_int_cst (unsigned_type_node, shift);
   tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
   tree expr;
 
@@ -6086,6 +5556,129 @@
     return nvptx_lockfull_update (loc, gsi, ptr, var, op);
 }
 
+/* Emit a vector-level reduction loop.  OLD_VAR is the incoming
+   variable to reduce (valid in each vector), OP is the reduction
+   operator.  Return the reduced value (an SSA var).
+
+   The code we generate looks like:
+      unsigned old_shift = DIM_SIZE(VECTOR);
+      do
+	{
+	  shift = PHI (old_shift, new_shift);
+	  var = PHI (old_var, new_var);
+	  new_shift = shift >> 1;
+	  other_var = VSHUFFLE (var, new_shift);
+	  new_var = var OP other_var;
+	  cond_var = builtin_cond_uni (new_shift);
+	}
+	while (cond_var > 1);
+
+  The builtin_cond_ini expands to a cond_uni instruction, which is
+  processed in nvpts_split_blocks to mark the loop's terminating
+  branch instruction.  */
+
+static tree
+nvptx_vector_reduction (location_t loc, gimple_stmt_iterator *gsi,
+			tree old_var, tree_code op)
+{
+  tree var_type = TREE_TYPE (old_var);
+
+  /*  Emit old_shift = DIM_SIZE(VECTOR) */
+  tree old_shift = make_ssa_name (integer_type_node);
+  tree dim = build_int_cst (integer_type_node, GOMP_DIM_VECTOR);
+  gcall *call = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, dim);
+  gimple_set_lhs (call, old_shift);
+  gimple_set_location (call, loc);
+  gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+  /* Split the block just after the init stmts.  */
+  basic_block pre_bb = gsi_bb (*gsi);
+  edge pre_edge = split_block (pre_bb, call);
+  pre_edge->probability = profile_probability::even ();
+  basic_block loop_bb = pre_edge->dest;
+  pre_bb = pre_edge->src;
+  /* Reset the iterator.  */
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  tree shift = make_ssa_name (integer_type_node);
+  tree new_shift = make_ssa_name (integer_type_node);
+  tree var = make_ssa_name (var_type);
+  tree other_var = make_ssa_name (var_type);
+  tree new_var = make_ssa_name (var_type);
+
+  /* Build and insert the loop body.  */
+  gimple_seq loop_seq = NULL;
+
+  /* new_shift = shift >> 1 */
+  tree shift_expr = fold_build2 (RSHIFT_EXPR, integer_type_node,
+				 shift, integer_one_node);
+  gimplify_assign (new_shift, shift_expr, &loop_seq);
+
+  /* other_var = shuffle (var, shift) */
+  nvptx_generate_vector_shuffle (loc, other_var, var, new_shift, &loop_seq);
+  /* new_var = var  OP other_var */
+  tree red_expr = fold_build2 (op, var_type, var, other_var);
+  gimplify_assign (new_var, red_expr, &loop_seq);
+
+  /* Mark the iterator variable as unified.  */
+  tree cond_var = make_ssa_name (integer_type_node);
+  tree uni_fn = nvptx_builtin_decl (NVPTX_BUILTIN_COND_UNI, true);
+  tree uni_expr = build_call_expr_loc (loc, uni_fn, 1, new_shift);
+  gimplify_assign (cond_var,  uni_expr, &loop_seq);
+
+  gcond *cond = gimple_build_cond (LE_EXPR, cond_var, integer_one_node,
+				   NULL_TREE, NULL_TREE);
+  gimple_seq_add_stmt (&loop_seq, cond);
+
+  gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
+
+  /* Split the block just after the loop stmts.  */
+  edge post_edge = split_block (loop_bb, cond);
+  post_edge->probability = profile_probability::even ();
+  basic_block post_bb = post_edge->dest;
+  loop_bb = post_edge->src;
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  /* Create the loop.  */
+  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
+  loop_edge->probability = profile_probability::even ();
+  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
+  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
+
+  gphi *shift_phi = create_phi_node (shift, loop_bb);
+  add_phi_arg (shift_phi, old_shift, pre_edge, loc);
+  add_phi_arg (shift_phi, new_shift, loop_edge, loc);
+
+  gphi *var_phi = create_phi_node (var, loop_bb);
+  add_phi_arg (var_phi, old_var, pre_edge, loc);
+  add_phi_arg (var_phi, new_var, loop_edge, loc);
+
+  loop *loop = alloc_loop ();
+  loop->header = loop_bb;
+  loop->latch = loop_bb;
+  add_loop (loop, loop_bb->loop_father);
+
+  return new_var;
+}
+
+/* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data
+   mappings gets retyped to (void *).  Adjust the type of VAR to TYPE
+   as appropriate.  */
+
+static tree
+nvptx_adjust_reduction_type (tree var, tree type, gimple_seq *seq)
+{
+  if (TREE_TYPE (TREE_TYPE (var)) == type)
+    return var;
+
+  tree ptype = build_pointer_type (type);
+  tree t = make_ssa_name (ptype);
+  tree expr = fold_build1 (NOP_EXPR, ptype, var);
+  gimple_seq_add_stmt (seq, gimple_build_assign (t, expr));
+  return t;
+}
+
 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
 
 static void
@@ -6105,7 +5698,11 @@
       tree ref_to_res = gimple_call_arg (call, 1);
 
       if (!integer_zerop (ref_to_res))
-	var = build_simple_mem_ref (ref_to_res);
+	{
+	  ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+						    &seq);
+	  var = build_simple_mem_ref (ref_to_res);
+	}
     }
   
   if (level == GOMP_DIM_WORKER
@@ -6233,22 +5830,7 @@
   push_gimplify_context (true);
 
   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
-    {
-      /* Emit binary shuffle tree.  TODO. Emit this as an actual loop,
-	 but that requires a method of emitting a unified jump at the
-	 gimple level.  */
-      for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
-	{
-	  tree other_var = make_ssa_name (TREE_TYPE (var));
-	  nvptx_generate_vector_shuffle (gimple_location (call),
-					 other_var, var, shfl, &seq);
-
-	  r = make_ssa_name (TREE_TYPE (var));
-	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
-					   var, other_var), &seq);
-	  var = r;
-	}
-    }
+    r = nvptx_vector_reduction (gimple_location (call), &gsi, var, op);
   else
     {
       tree accum = NULL_TREE;
@@ -6267,7 +5849,11 @@
       else if (integer_zerop (ref_to_res))
 	r = var;
       else
-	accum = ref_to_res;
+	{
+	  ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+						    &seq);
+	  accum = ref_to_res;
+	}
 
       if (accum)
 	{
@@ -6318,7 +5904,11 @@
       tree ref_to_res = gimple_call_arg (call, 1);
 
       if (!integer_zerop (ref_to_res))
-	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
+	{
+	  ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+						    &seq);
+	  gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
+	}
     }
 
   if (lhs)
@@ -6430,6 +6020,60 @@
   return false;
 }
 
+/* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL.  Set "oacc gangprivate"
+   attribute for gang-private variable declarations.  */
+
+void
+nvptx_goacc_adjust_private_decl (tree decl, int level)
+{
+  if (level != GOMP_DIM_GANG)
+    return;
+
+  if (!lookup_attribute ("oacc gangprivate", DECL_ATTRIBUTES (decl)))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Setting 'oacc gangprivate' attribute for decl:");
+	  print_generic_decl (dump_file, decl, TDF_SLIM);
+	  fputc ('\n', dump_file);
+	}
+      tree id = get_identifier ("oacc gangprivate");
+      DECL_ATTRIBUTES (decl) = tree_cons (id, NULL, DECL_ATTRIBUTES (decl));
+    }
+}
+
+/* Implement TARGET_GOACC_EXPAND_ACCEL_VAR.  Place "oacc gangprivate"
+   variables in shared memory.  */
+
+static rtx
+nvptx_goacc_expand_accel_var (tree var)
+{
+  if (VAR_P (var)
+      && lookup_attribute ("oacc gangprivate", DECL_ATTRIBUTES (var)))
+    {
+      unsigned int offset, *poffset;
+      poffset = gangprivate_shared_hmap.get (var);
+      if (poffset)
+	offset = *poffset;
+      else
+	{
+	  unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
+	  gangprivate_shared_size
+	    = (gangprivate_shared_size + align - 1) & ~(align - 1);
+	  if (gangprivate_shared_align < align)
+	    gangprivate_shared_align = align;
+
+	  offset = gangprivate_shared_size;
+	  bool existed = gangprivate_shared_hmap.put (var, offset);
+	  gcc_assert (!existed);
+	  gangprivate_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));
+	}
+      rtx addr = plus_constant (Pmode, gangprivate_shared_sym, offset);
+      return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
+    }
+  return NULL_RTX;
+}
+
 static GTY(()) tree nvptx_previous_fndecl;
 
 static void
@@ -6438,6 +6082,7 @@
   if (!fndecl || fndecl == nvptx_previous_fndecl)
     return;
 
+  gangprivate_shared_hmap.empty ();
   nvptx_previous_fndecl = fndecl;
   vector_red_partition = 0;
   oacc_bcast_partition = 0;
@@ -6579,6 +6224,12 @@
 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
 
+#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
+#define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl
+
+#undef TARGET_GOACC_EXPAND_ACCEL_VAR
+#define TARGET_GOACC_EXPAND_ACCEL_VAR nvptx_goacc_expand_accel_var
+
 #undef TARGET_SET_CURRENT_FUNCTION
 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
 
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 1a090a47..b7bcaec 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -602,6 +602,13 @@
   "%J0\\tbra.uni\\t%l1;"
   [(set_attr "predicable" "false")])
 
+(define_insn "cond_uni"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+          (unspec:SI [(match_operand:SI 1 "nvptx_nonmemory_operand" "R")]
+		      UNSPEC_BR_UNIFIED))]
+  ""
+  "%.\\tmov%t0\\t%0, %1; // unified")
+
 (define_expand "cbranch<mode>4"
   [(set (pc)
 	(if_then_else (match_operator 0 "nvptx_comparison_operator"
diff --git a/gcc/cp/ChangeLog.omp b/gcc/cp/ChangeLog.omp
new file mode 100644
index 0000000..00ee6b4
--- /dev/null
+++ b/gcc/cp/ChangeLog.omp
@@ -0,0 +1,126 @@
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* semantics.c (handle_omp_array_sections): Likewise.
+	(finish_omp_clauses): Handle GOMP_MAP_ATTACH_DETACH.
+
+2019-07-09  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+	2019-07-08  Jakub Jelinek  <jakub@redhat.com>
+
+	PR c++/91110
+	* decl2.c (cp_omp_mappable_type_1): Don't emit any note for
+	error_mark_node type.
+
+2019-07-04  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+	2019-07-04  Andrew Stubbs  <ams@codesourcery.com>
+
+	* cp-tree.h (cp_omp_emit_unmappable_type_notes): New prototype.
+	* decl.c (cp_finish_decl): Call cp_omp_emit_unmappable_type_notes.
+	* decl2.c (cp_omp_mappable_type): Move contents to ...
+	(cp_omp_mappable_type_1):  ... here and add note output.
+	(cp_omp_emit_unmappable_type_notes): New function.
+	* semantics.c (finish_omp_clauses): Call
+	cp_omp_emit_unmappable_type_notes in four places.
+
+2018-12-19  Julian Brown  <julian@codesourcery.com>
+            Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* parser.c (cp_parser_omp_clause_name): Support no_create.
+	(cp_parser_oacc_data_clause): Likewise.
+	(cp_parser_oacc_all_clauses): Likewise.
+	(OACC_DATA_CLAUSE_MASK, OACC_KERNELS_CLAUSE_MASK)
+	(OACC_PARALLEL_CLAUSE_MASK, OACC_SERIAL_CLAUSE_MASK): Add
+	PRAGMA_OACC_CLAUSE_NO_CREATE.
+	* semantics.c (handle_omp_array_sections): Support no_create.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* constexpr.c (potential_constant_expression_1): Handle
+	OACC_SERIAL.
+	* parser.c (OACC_SERIAL_CLAUSE_MASK): New macro.
+	(OACC_SERIAL_CLAUSE_DEVICE_TYPE_MASK): Likewise.
+	(cp_parser_oacc_kernels_parallel): Rename function to...
+	(cp_parser_oacc_compute): ... this.  Handle PRAGMA_OACC_SERIAL.
+	(cp_parser_omp_construct): Update accordingly.
+	(cp_parser_pragma): Handle PRAGMA_OACC_SERIAL.  Fix alphabetic
+	order.
+	* pt.c (tsubst_expr): Handle OACC_SERIAL.
+
+2018-12-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* parser.c (OACC_HOST_DATA_CLAUSE_MASK): Likewise.
+
+2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
+            Nathan Sidwell  <nathan@acm.org>
+            Julian Brown  <julian@codesourcery.com>
+
+        * parser.c (cp_parser_omp_var_list_no_open):  New c_omp_region_type
+        argument.  Use it to specialize handling of OMP_CLAUSE_REDUCTION for
+        OpenACC.
+	(cp_parser_omp_var_list): Add c_omp_region_type argument. Update call
+	to cp_parser_omp_var_list_parens.
+	(cp_parser_oacc_data_clause): Update call to cp_parser_omp_var_list.
+        (cp_parser_omp_clause_reduction): Change is_omp boolean parameter to
+        c_omp_region_type.  Update call to cp_parser_omp_var_list_no_open.
+        (cp_parser_oacc_all_clauses): Update call to
+        cp_parser_omp_clause_reduction.
+        (cp_parser_omp_all_clauses): Likewise.
+        * semantics.c (finish_omp_reduction_clause): Add c_omp_region_type
+	argument.  Suppress user-defined reduction error for OpenACC.
+	(finish_omp_clauses): Emit an error on orphan OpenACC gang reductions.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* parser.c (cp_parser_oacc_routine)
+	(cp_parser_late_parsing_oacc_routine): Normalize order of clauses.
+	(cp_finalize_oacc_routine): Call oacc_verify_routine_clauses.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* parser.c (cp_parser_omp_clause_name): Handle "nohost".
+	(cp_parser_oacc_all_clauses): Handle PRAGMA_OACC_CLAUSE_NOHOST,
+	(cp_parser_oacc_routine, cp_finalize_oacc_routine): Update.
+	* pt.c (tsubst_omp_clauses): Handle OMP_CLAUSE_NOHOST.
+	* semantics.c (finish_omp_clauses): Handle OMP_CLAUSE_NOHOST.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* semantics.c (handle_omp_array_sections_1): Add 'bool &non_contiguous'
+	parameter, adjust recursive call site, add cases for allowing
+	pointer based multi-dimensional arrays for OpenACC.
+	(handle_omp_array_sections): Adjust handle_omp_array_sections_1 call,
+	handle non-contiguous case to create dynamic array map.
+
+2019-01-31  Julian Brown  <julian@codesourcery.com>
+
+	* semantics.c (handle_omp_array_sections_1): Handle array section on
+	dereferenced struct member.
+	(finish_omp_clauses): Don't error on multiple dereferenced struct
+	elements with the same base.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* parser.c (cp_parser_omp_clause_name): Support attach and detach
+	clauses.
+	(cp_parser_omp_var_list_no_open): Add ALLOW_DEREF optional parameter.
+	Parse deref if true.
+	(cp_parser_omp_var_list): Add ALLOW_DEREF optional parameter.  Pass to
+	cp_parser_omp_var_list_no_open.
+	(cp_parser_oacc_data_clause): Support attach and detach clauses.
+	Update call to cp_parser_omp_var_list_no_open.
+	(cp_parser_oacc_all_clauses): Support attach and detach.
+	(OACC_DATA_CLAUSE_MASK, OACC_ENTER_DATA_CLAUSE_MASK)
+	(OACC_KERNELS_CLAUSE_MASK, OACC_PARALLEL_CLAUSE_MASK): Add
+	PRAGMA_OACC_CLAUSE_ATTACH.
+	(OACC_EXIT_DATA_CLAUSE_MASK): Add PRAGMA_OACC_CLAUSE_DETACH.
+	* semantics.c (handle_omp_array_sections_1): Reject subarrays for
+	attach and detach.
+	(cp_oacc_check_attachments): New function.
+	(finish_omp_clauses): Use above function.  Allow structure fields and
+	class members to appear in OpenACC data clauses.  Support deref.
+
diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index bab5935..7ec586a 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -6448,6 +6448,7 @@
     case OMP_DEPOBJ:
     case OACC_PARALLEL:
     case OACC_KERNELS:
+    case OACC_SERIAL:
     case OACC_DATA:
     case OACC_HOST_DATA:
     case OACC_LOOP:
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index f7c3eea..8886c77 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -6532,6 +6532,7 @@
 extern tree vtv_start_verification_constructor_init_function (void);
 extern tree vtv_finish_verification_constructor_init_function (tree);
 extern bool cp_omp_mappable_type		(tree);
+extern bool cp_omp_emit_unmappable_type_notes	(tree);
 extern void cp_check_const_attributes (tree);
 
 /* in error.c */
diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 39d5558..e39e3e0b 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -7419,8 +7419,11 @@
 			    DECL_ATTRIBUTES (decl));
       complete_type (TREE_TYPE (decl));
       if (!cp_omp_mappable_type (TREE_TYPE (decl)))
-	error ("%q+D in declare target directive does not have mappable type",
-	       decl);
+	{
+	  error ("%q+D in declare target directive does not have mappable"
+		 " type", decl);
+	  cp_omp_emit_unmappable_type_notes (TREE_TYPE (decl));
+	}
       else if (!lookup_attribute ("omp declare target",
 				  DECL_ATTRIBUTES (decl))
 	       && !lookup_attribute ("omp declare target link",
diff --git a/gcc/cp/decl2.c b/gcc/cp/decl2.c
index a46cbce..5381c3d 100644
--- a/gcc/cp/decl2.c
+++ b/gcc/cp/decl2.c
@@ -1406,32 +1406,68 @@
     }
 }
 
-/* Return true if TYPE is an OpenMP mappable type.  */
-bool
-cp_omp_mappable_type (tree type)
+/* Return true if TYPE is an OpenMP mappable type.
+   If NOTES is non-zero, emit a note message for each problem.  */
+static bool
+cp_omp_mappable_type_1 (tree type, bool notes)
 {
+  bool result = true;
+
   /* Mappable type has to be complete.  */
   if (type == error_mark_node || !COMPLETE_TYPE_P (type))
-    return false;
+    {
+      if (notes && type != error_mark_node)
+	{
+	  tree decl = TYPE_MAIN_DECL (type);
+	  inform ((decl ? DECL_SOURCE_LOCATION (decl) : input_location),
+		  "incomplete type %qT is not mappable", type);
+	}
+      result = false;
+    }
   /* Arrays have mappable type if the elements have mappable type.  */
   while (TREE_CODE (type) == ARRAY_TYPE)
     type = TREE_TYPE (type);
   /* A mappable type cannot contain virtual members.  */
   if (CLASS_TYPE_P (type) && CLASSTYPE_VTABLES (type))
-    return false;
+    {
+      if (notes)
+	inform (DECL_SOURCE_LOCATION (TYPE_MAIN_DECL (type)),
+		"type %qT with virtual members is not mappable", type);
+      result = false;
+    }
   /* All data members must be non-static.  */
   if (CLASS_TYPE_P (type))
     {
       tree field;
       for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
 	if (VAR_P (field))
-	  return false;
+	  {
+	    if (notes)
+	      inform (DECL_SOURCE_LOCATION (field),
+		      "static field %qD is not mappable", field);
+	    result = false;
+	  }
 	/* All fields must have mappable types.  */
 	else if (TREE_CODE (field) == FIELD_DECL
-		 && !cp_omp_mappable_type (TREE_TYPE (field)))
-	  return false;
+		 && !cp_omp_mappable_type_1 (TREE_TYPE (field), notes))
+	  result = false;
     }
-  return true;
+  return result;
+}
+
+/* Return true if TYPE is an OpenMP mappable type.  */
+bool
+cp_omp_mappable_type (tree type)
+{
+  return cp_omp_mappable_type_1 (type, false);
+}
+
+/* Return true if TYPE is an OpenMP mappable type.
+   Emit an error messages if not.  */
+bool
+cp_omp_emit_unmappable_type_notes (tree type)
+{
+  return cp_omp_mappable_type_1 (type, true);
 }
 
 /* Return the last pushed declaration for the symbol DECL or NULL
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index e1c02d7..52800c1 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -32285,6 +32285,8 @@
 	    result = PRAGMA_OMP_CLAUSE_ALIGNED;
 	  else if (!strcmp ("async", p))
 	    result = PRAGMA_OACC_CLAUSE_ASYNC;
+	  else if (!strcmp ("attach", p))
+	    result = PRAGMA_OACC_CLAUSE_ATTACH;
 	  break;
 	case 'c':
 	  if (!strcmp ("collapse", p))
@@ -32305,6 +32307,8 @@
 	    result = PRAGMA_OMP_CLAUSE_DEFAULTMAP;
 	  else if (!strcmp ("depend", p))
 	    result = PRAGMA_OMP_CLAUSE_DEPEND;
+	  else if (!strcmp ("detach", p))
+	    result = PRAGMA_OACC_CLAUSE_DETACH;
 	  else if (!strcmp ("device", p))
 	    result = PRAGMA_OMP_CLAUSE_DEVICE;
 	  else if (!strcmp ("deviceptr", p))
@@ -32363,8 +32367,12 @@
 	    result = PRAGMA_OMP_CLAUSE_MERGEABLE;
 	  break;
 	case 'n':
-	  if (!strcmp ("nogroup", p))
+	  if (!strcmp ("no_create", p))
+	    result = PRAGMA_OACC_CLAUSE_NO_CREATE;
+	  else if (!strcmp ("nogroup", p))
 	    result = PRAGMA_OMP_CLAUSE_NOGROUP;
+	  else if (!strcmp ("nohost", p))
+	    result = PRAGMA_OACC_CLAUSE_NOHOST;
 	  else if (!strcmp ("nontemporal", p))
 	    result = PRAGMA_OMP_CLAUSE_NONTEMPORAL;
 	  else if (!strcmp ("notinbranch", p))
@@ -32507,11 +32515,16 @@
 
    COLON can be NULL if only closing parenthesis should end the list,
    or pointer to bool which will receive false if the list is terminated
-   by closing parenthesis or true if the list is terminated by colon.  */
+   by closing parenthesis or true if the list is terminated by colon.
+
+   The optional ALLOW_DEREF argument is true if list items can use the deref
+   (->) operator.  */
 
 static tree
 cp_parser_omp_var_list_no_open (cp_parser *parser, enum omp_clause_code kind,
-				tree list, bool *colon)
+				tree list, bool *colon,
+				enum c_omp_region_type ort = C_ORT_OMP,
+				bool allow_deref = false)
 {
   cp_token *token;
   bool saved_colon_corrects_to_scope_p = parser->colon_corrects_to_scope_p;
@@ -32592,15 +32605,20 @@
 	    case OMP_CLAUSE_MAP:
 	    case OMP_CLAUSE_FROM:
 	    case OMP_CLAUSE_TO:
-	      while (cp_lexer_next_token_is (parser->lexer, CPP_DOT))
+	      while (cp_lexer_next_token_is (parser->lexer, CPP_DOT)
+		     || (allow_deref
+			 && cp_lexer_next_token_is (parser->lexer, CPP_DEREF)))
 		{
+		  cpp_ttype ttype
+		    = cp_lexer_next_token_is (parser->lexer, CPP_DOT)
+		      ? CPP_DOT : CPP_DEREF;
 		  location_t loc
 		    = cp_lexer_peek_token (parser->lexer)->location;
 		  cp_id_kind idk = CP_ID_KIND_NONE;
 		  cp_lexer_consume_token (parser->lexer);
 		  decl = convert_from_reference (decl);
 		  decl
-		    = cp_parser_postfix_dot_deref_expression (parser, CPP_DOT,
+		    = cp_parser_postfix_dot_deref_expression (parser, ttype,
 							      decl, false,
 							      &idk, loc);
 		}
@@ -32609,7 +32627,8 @@
 	    case OMP_CLAUSE_REDUCTION:
 	    case OMP_CLAUSE_IN_REDUCTION:
 	    case OMP_CLAUSE_TASK_REDUCTION:
-	      while (cp_lexer_next_token_is (parser->lexer, CPP_OPEN_SQUARE))
+	      while ((ort != C_ORT_ACC || kind != OMP_CLAUSE_REDUCTION)
+		     && cp_lexer_next_token_is (parser->lexer, CPP_OPEN_SQUARE))
 		{
 		  tree low_bound = NULL_TREE, length = NULL_TREE;
 
@@ -32718,20 +32737,28 @@
    common case for omp clauses.  */
 
 static tree
-cp_parser_omp_var_list (cp_parser *parser, enum omp_clause_code kind, tree list)
+cp_parser_omp_var_list (cp_parser *parser, enum omp_clause_code kind, tree list,
+			enum c_omp_region_type ort = C_ORT_OMP,
+			bool allow_deref = false)
 {
   if (cp_parser_require (parser, CPP_OPEN_PAREN, RT_OPEN_PAREN))
-    return cp_parser_omp_var_list_no_open (parser, kind, list, NULL);
+    return cp_parser_omp_var_list_no_open (parser, kind, list, NULL, ort,
+					   allow_deref);
   return list;
 }
 
-/* OpenACC 2.0:
+/* OpenACC 2.5:
+   attach ( variable-list )
    copy ( variable-list )
    copyin ( variable-list )
    copyout ( variable-list )
    create ( variable-list )
    delete ( variable-list )
-   present ( variable-list ) */
+   detach ( variable-list )
+   present ( variable-list )
+
+   OpenACC 2.6:
+   no_create ( variable-list ) */
 
 static tree
 cp_parser_oacc_data_clause (cp_parser *parser, pragma_omp_clause c_kind,
@@ -32740,6 +32767,9 @@
   enum gomp_map_kind kind;
   switch (c_kind)
     {
+    case PRAGMA_OACC_CLAUSE_ATTACH:
+      kind = GOMP_MAP_ATTACH;
+      break;
     case PRAGMA_OACC_CLAUSE_COPY:
       kind = GOMP_MAP_TOFROM;
       break;
@@ -32755,6 +32785,9 @@
     case PRAGMA_OACC_CLAUSE_DELETE:
       kind = GOMP_MAP_RELEASE;
       break;
+    case PRAGMA_OACC_CLAUSE_DETACH:
+      kind = GOMP_MAP_DETACH;
+      break;
     case PRAGMA_OACC_CLAUSE_DEVICE:
       kind = GOMP_MAP_FORCE_TO;
       break;
@@ -32767,6 +32800,9 @@
     case PRAGMA_OACC_CLAUSE_LINK:
       kind = GOMP_MAP_LINK;
       break;
+    case PRAGMA_OACC_CLAUSE_NO_CREATE:
+      kind = GOMP_MAP_NO_ALLOC;
+      break;
     case PRAGMA_OACC_CLAUSE_PRESENT:
       kind = GOMP_MAP_FORCE_PRESENT;
       break;
@@ -32774,7 +32810,7 @@
       gcc_unreachable ();
     }
   tree nl, c;
-  nl = cp_parser_omp_var_list (parser, OMP_CLAUSE_MAP, list);
+  nl = cp_parser_omp_var_list (parser, OMP_CLAUSE_MAP, list, C_ORT_ACC, true);
 
   for (c = nl; c != list; c = OMP_CLAUSE_CHAIN (c))
     OMP_CLAUSE_SET_MAP_KIND (c, kind);
@@ -33857,7 +33893,7 @@
 
 static tree
 cp_parser_omp_clause_reduction (cp_parser *parser, enum omp_clause_code kind,
-				bool is_omp, tree list)
+				enum c_omp_region_type ort, tree list)
 {
   enum tree_code code = ERROR_MARK;
   tree nlist, c, id = NULL_TREE;
@@ -33867,7 +33903,7 @@
   if (!cp_parser_require (parser, CPP_OPEN_PAREN, RT_OPEN_PAREN))
     return list;
 
-  if (kind == OMP_CLAUSE_REDUCTION && is_omp)
+  if (kind == OMP_CLAUSE_REDUCTION && ort == C_ORT_OMP)
     {
       if (cp_lexer_next_token_is_keyword (parser->lexer, RID_DEFAULT)
 	  && cp_lexer_nth_token_is (parser->lexer, 2, CPP_COMMA))
@@ -33968,8 +34004,7 @@
   if (!cp_parser_require (parser, CPP_COLON, RT_COLON))
     goto resync_fail;
 
-  nlist = cp_parser_omp_var_list_no_open (parser, kind, list,
-					  NULL);
+  nlist = cp_parser_omp_var_list_no_open (parser, kind, list, NULL, ort);
   for (c = nlist; c != list; c = OMP_CLAUSE_CHAIN (c))
     {
       OMP_CLAUSE_REDUCTION_CODE (c) = code;
@@ -35112,6 +35147,10 @@
 						  clauses);
 	  c_name = "auto";
 	  break;
+	case PRAGMA_OACC_CLAUSE_ATTACH:
+	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "attach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_COLLAPSE:
 	  clauses = cp_parser_omp_clause_collapse (parser, clauses, here);
 	  c_name = "collapse";
@@ -35140,6 +35179,10 @@
 	  clauses = cp_parser_omp_clause_default (parser, clauses, here, true);
 	  c_name = "default";
 	  break;
+	case PRAGMA_OACC_CLAUSE_DETACH:
+	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "detach";
+	  break;
 	case PRAGMA_OACC_CLAUSE_DEVICE:
 	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "device";
@@ -35189,6 +35232,15 @@
 	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
 	  c_name = "link";
 	  break;
+	case PRAGMA_OACC_CLAUSE_NO_CREATE:
+	  clauses = cp_parser_oacc_data_clause (parser, c_kind, clauses);
+	  c_name = "no_create";
+	  break;
+	case PRAGMA_OACC_CLAUSE_NOHOST:
+	  clauses = cp_parser_oacc_simple_clause (here, OMP_CLAUSE_NOHOST,
+						  clauses);
+	  c_name = "nohost";
+	  break;
 	case PRAGMA_OACC_CLAUSE_NUM_GANGS:
 	  code = OMP_CLAUSE_NUM_GANGS;
 	  c_name = "num_gangs";
@@ -35213,7 +35265,7 @@
 	case PRAGMA_OACC_CLAUSE_REDUCTION:
 	  clauses
 	    = cp_parser_omp_clause_reduction (parser, OMP_CLAUSE_REDUCTION,
-					      false, clauses);
+					      C_ORT_ACC, clauses);
 	  c_name = "reduction";
 	  break;
 	case PRAGMA_OACC_CLAUSE_SEQ:
@@ -35368,7 +35420,7 @@
 	case PRAGMA_OMP_CLAUSE_IN_REDUCTION:
 	  clauses
 	    = cp_parser_omp_clause_reduction (parser, OMP_CLAUSE_IN_REDUCTION,
-					      true, clauses);
+					      C_ORT_OMP, clauses);
 	  c_name = "in_reduction";
 	  break;
 	case PRAGMA_OMP_CLAUSE_LASTPRIVATE:
@@ -35412,7 +35464,7 @@
 	case PRAGMA_OMP_CLAUSE_REDUCTION:
 	  clauses
 	    = cp_parser_omp_clause_reduction (parser, OMP_CLAUSE_REDUCTION,
-					      true, clauses);
+					      C_ORT_OMP, clauses);
 	  c_name = "reduction";
 	  break;
 	case PRAGMA_OMP_CLAUSE_SCHEDULE:
@@ -35429,7 +35481,7 @@
 	  clauses
 	    = cp_parser_omp_clause_reduction (parser,
 					      OMP_CLAUSE_TASK_REDUCTION,
-					      true, clauses);
+					      C_ORT_OMP, clauses);
 	  c_name = "task_reduction";
 	  break;
 	case PRAGMA_OMP_CLAUSE_UNTIED:
@@ -38789,12 +38841,15 @@
      structured-block  */
 
 #define OACC_DATA_CLAUSE_MASK						\
-	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DETACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT) )
 
 static tree
@@ -38819,7 +38874,9 @@
   structured-block  */
 
 #define OACC_HOST_DATA_CLAUSE_MASK					\
-  ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE) )
+  ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_USE_DEVICE)                \
+  | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)                        \
+  | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF_PRESENT) )
 
 static tree
 cp_parser_oacc_host_data (cp_parser *parser, cp_token *pragma_tok, bool *if_p)
@@ -38992,6 +39049,7 @@
 
 #define OACC_ENTER_DATA_CLAUSE_MASK					\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
@@ -39002,6 +39060,7 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DELETE) 		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DETACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FINALIZE) 		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
@@ -39101,10 +39160,15 @@
 
    # pragma acc parallel oacc-parallel-clause[optseq] new-line
      structured-block
+
+   OpenACC 2.6:
+
+   # pragma acc serial oacc-serial-clause[optseq] new-line
 */
 
 #define OACC_KERNELS_CLAUSE_MASK					\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
@@ -39112,6 +39176,7 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEFAULT)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_WORKERS)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT)		\
@@ -39120,6 +39185,7 @@
 
 #define OACC_PARALLEL_CLAUSE_MASK					\
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
@@ -39128,6 +39194,7 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FIRSTPRIVATE)       	\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_WORKERS)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT)		\
@@ -39136,9 +39203,30 @@
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH)       \
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
+#define OACC_SERIAL_CLAUSE_MASK						\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ATTACH)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPY)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYIN)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_COPYOUT)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_CREATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEFAULT)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NO_CREATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRIVATE)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_FIRSTPRIVATE)	\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_REDUCTION)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
+
+#define OACC_SERIAL_CLAUSE_DEVICE_TYPE_MASK				\
+	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)		\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
+
 static tree
-cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok,
-				 char *p_name, bool *if_p)
+cp_parser_oacc_compute (cp_parser *parser, cp_token *pragma_tok,
+			char *p_name, bool *if_p)
 {
   omp_clause_mask mask;
   enum tree_code code;
@@ -39154,6 +39242,11 @@
       mask = OACC_PARALLEL_CLAUSE_MASK;
       code = OACC_PARALLEL;
       break;
+    case PRAGMA_OACC_SERIAL:
+      strcat (p_name, " serial");
+      mask = OACC_SERIAL_CLAUSE_MASK;
+      code = OACC_SERIAL;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -40190,8 +40283,8 @@
 	( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_GANG)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WORKER)		\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR)		\
-	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_SEQ))
-
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_SEQ)			\
+	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NOHOST) )
 
 /* Parse the OpenACC routine pragma.  This has an optional '( name )'
    component, which must resolve to a declared namespace-scope
@@ -40252,6 +40345,9 @@
 	= cp_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
 				      "#pragma acc routine",
 				      cp_lexer_peek_token (parser->lexer));
+      /* The clauses are in reverse order; fix that to make later diagnostic
+	 emission easier.  */
+      data.clauses = nreverse (data.clauses);
 
       if (decl && is_overloaded_fn (decl)
 	  && (TREE_CODE (decl) != FUNCTION_DECL
@@ -40348,6 +40444,9 @@
   parser->oacc_routine->clauses
     = cp_parser_oacc_all_clauses (parser, OACC_ROUTINE_CLAUSE_MASK,
 				  "#pragma acc routine", pragma_tok);
+  /* The clauses are in reverse order; fix that to make later diagnostic
+     emission easier.  */
+  parser->oacc_routine->clauses = nreverse (parser->oacc_routine->clauses);
   cp_parser_pop_lexer (parser);
   /* Later, cp_finalize_oacc_routine will process the clauses, and then set
      fndecl_seen.  */
@@ -40382,6 +40481,9 @@
 	  return;
 	}
 
+      oacc_verify_routine_clauses (&parser->oacc_routine->clauses,
+				   parser->oacc_routine->loc);
+
       if (oacc_get_fn_attrib (fndecl))
 	{
 	  error_at (parser->oacc_routine->loc,
@@ -40408,7 +40510,7 @@
       /* Add an "omp declare target" attribute.  */
       DECL_ATTRIBUTES (fndecl)
 	= tree_cons (get_identifier ("omp declare target"),
-		     NULL_TREE, DECL_ATTRIBUTES (fndecl));
+		     parser->oacc_routine->clauses, DECL_ATTRIBUTES (fndecl));
 
       /* Don't unset parser->oacc_routine here: we may still need it to
 	 diagnose wrong usage.  But, remember that we've used this "#pragma acc
@@ -40448,9 +40550,9 @@
       break;
     case PRAGMA_OACC_KERNELS:
     case PRAGMA_OACC_PARALLEL:
+    case PRAGMA_OACC_SERIAL:
       strcpy (p_name, "#pragma acc");
-      stmt = cp_parser_oacc_kernels_parallel (parser, pragma_tok, p_name,
-					      if_p);
+      stmt = cp_parser_oacc_compute (parser, pragma_tok, p_name, if_p);
       break;
     case PRAGMA_OACC_LOOP:
       strcpy (p_name, "#pragma acc");
@@ -41105,8 +41207,9 @@
     case PRAGMA_OACC_DATA:
     case PRAGMA_OACC_HOST_DATA:
     case PRAGMA_OACC_KERNELS:
-    case PRAGMA_OACC_PARALLEL:
     case PRAGMA_OACC_LOOP:
+    case PRAGMA_OACC_PARALLEL:
+    case PRAGMA_OACC_SERIAL:
     case PRAGMA_OMP_ATOMIC:
     case PRAGMA_OMP_CRITICAL:
     case PRAGMA_OMP_DISTRIBUTE:
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 43d9660..476514d 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -16478,6 +16478,7 @@
 	case OMP_CLAUSE_IF_PRESENT:
 	case OMP_CLAUSE_FINALIZE:
 	  break;
+	case OMP_CLAUSE_NOHOST:
 	default:
 	  gcc_unreachable ();
 	}
@@ -17565,6 +17566,7 @@
 
     case OACC_KERNELS:
     case OACC_PARALLEL:
+    case OACC_SERIAL:
       tmp = tsubst_omp_clauses (OMP_CLAUSES (t), C_ORT_ACC, args, complain,
 				in_decl);
       stmt = begin_omp_parallel ();
diff --git a/gcc/cp/semantics.c b/gcc/cp/semantics.c
index 4bfb5d1..e2e7cd8 100644
--- a/gcc/cp/semantics.c
+++ b/gcc/cp/semantics.c
@@ -4643,7 +4643,7 @@
 static tree
 handle_omp_array_sections_1 (tree c, tree t, vec<tree> &types,
 			     bool &maybe_zero_len, unsigned int &first_non_one,
-			     enum c_omp_region_type ort)
+			     bool &non_contiguous, enum c_omp_region_type ort)
 {
   tree ret, low_bound, length, type;
   if (TREE_CODE (t) != TREE_LIST)
@@ -4655,7 +4655,6 @@
 	t = TREE_OPERAND (t, 0);
       ret = t;
       if (TREE_CODE (t) == COMPONENT_REF
-	  && ort == C_ORT_OMP
 	  && (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
 	      || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_TO
 	      || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FROM)
@@ -4679,6 +4678,8 @@
 		  return error_mark_node;
 		}
 	      t = TREE_OPERAND (t, 0);
+	      if (ort == C_ORT_ACC && TREE_CODE (t) == INDIRECT_REF)
+	        t = TREE_OPERAND (t, 0);
 	    }
 	  if (REFERENCE_REF_P (t))
 	    t = TREE_OPERAND (t, 0);
@@ -4728,7 +4729,8 @@
       && TREE_CODE (TREE_CHAIN (t)) == FIELD_DECL)
     TREE_CHAIN (t) = omp_privatize_field (TREE_CHAIN (t), false);
   ret = handle_omp_array_sections_1 (c, TREE_CHAIN (t), types,
-				     maybe_zero_len, first_non_one, ort);
+				     maybe_zero_len, first_non_one,
+				     non_contiguous, ort);
   if (ret == error_mark_node || ret == NULL_TREE)
     return ret;
 
@@ -4778,6 +4780,18 @@
   if (low_bound == NULL_TREE)
     low_bound = integer_zero_node;
 
+  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+      && (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH
+	  || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_DETACH))
+    {
+      if (length != integer_one_node)
+	{
+	  error_at (OMP_CLAUSE_LOCATION (c),
+		    "expected single pointer in %qs clause",
+		    c_omp_map_clause_name (c, ort == C_ORT_ACC));
+	  return error_mark_node;
+	}
+    }
   if (length != NULL_TREE)
     {
       if (!integer_nonzerop (length))
@@ -4906,6 +4920,21 @@
 		    }
 		}
 	    }
+
+	  /* For OpenACC, if the low_bound/length suggest this is a subarray,
+	     and is referenced through by a pointer, then mark this as
+	     non-contiguous.  */
+	  if (ort == C_ORT_ACC
+	      && types.length () > 0
+	      && (TREE_CODE (low_bound) != INTEGER_CST
+		  || integer_nonzerop (low_bound)
+		  || (length && (TREE_CODE (length) != INTEGER_CST
+				 || !tree_int_cst_equal (size, length)))))
+	    {
+	      tree x = types.last ();
+	      if (TREE_CODE (x) == POINTER_TYPE)
+		non_contiguous = true;
+	    }
 	}
       else if (length == NULL_TREE)
 	{
@@ -4949,13 +4978,16 @@
       /* If there is a pointer type anywhere but in the very first
 	 array-section-subscript, the array section can't be contiguous.  */
       if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_DEPEND
-	  && TREE_CODE (TREE_CHAIN (t)) == TREE_LIST)
+	  && TREE_CODE (TREE_CHAIN (t)) == TREE_LIST
+	  && ort != C_ORT_ACC)
 	{
 	  error_at (OMP_CLAUSE_LOCATION (c),
 		    "array section is not contiguous in %qs clause",
 		    omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
 	  return error_mark_node;
 	}
+      else if (TREE_CODE (TREE_CHAIN (t)) == TREE_LIST)
+	non_contiguous = true;
     }
   else
     {
@@ -4983,6 +5015,7 @@
 {
   bool maybe_zero_len = false;
   unsigned int first_non_one = 0;
+  bool non_contiguous = false;
   auto_vec<tree, 10> types;
   tree *tp = &OMP_CLAUSE_DECL (c);
   if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_DEPEND
@@ -4992,7 +5025,7 @@
     tp = &TREE_VALUE (*tp);
   tree first = handle_omp_array_sections_1 (c, *tp, types,
 					    maybe_zero_len, first_non_one,
-					    ort);
+					    non_contiguous, ort);
   if (first == error_mark_node)
     return true;
   if (first == NULL_TREE)
@@ -5026,6 +5059,7 @@
       unsigned int num = types.length (), i;
       tree t, side_effects = NULL_TREE, size = NULL_TREE;
       tree condition = NULL_TREE;
+      tree da_dims = NULL_TREE;
 
       if (int_size_in_bytes (TREE_TYPE (first)) <= 0)
 	maybe_zero_len = true;
@@ -5051,6 +5085,13 @@
 	    length = fold_convert (sizetype, length);
 	  if (low_bound == NULL_TREE)
 	    low_bound = integer_zero_node;
+
+	  if (non_contiguous)
+	    {
+	      da_dims = tree_cons (low_bound, length, da_dims);
+	      continue;
+	    }
+
 	  if (!maybe_zero_len && i > first_non_one)
 	    {
 	      if (integer_nonzerop (low_bound))
@@ -5142,6 +5183,14 @@
 	}
       if (!processing_template_decl)
 	{
+	  if (non_contiguous)
+	    {
+	      int kind = OMP_CLAUSE_MAP_KIND (c);
+	      OMP_CLAUSE_SET_MAP_KIND (c, kind | GOMP_MAP_DYNAMIC_ARRAY);
+	      OMP_CLAUSE_DECL (c) = t;
+	      OMP_CLAUSE_SIZE (c) = da_dims;
+	      return false;
+	    }
 	  if (side_effects)
 	    size = build2 (COMPOUND_EXPR, sizetype, side_effects, size);
 	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
@@ -5192,6 +5241,7 @@
 	    switch (OMP_CLAUSE_MAP_KIND (c))
 	      {
 	      case GOMP_MAP_ALLOC:
+	      case GOMP_MAP_NO_ALLOC:
 	      case GOMP_MAP_TO:
 	      case GOMP_MAP_FROM:
 	      case GOMP_MAP_TOFROM:
@@ -5214,12 +5264,18 @@
 	  if ((ort & C_ORT_OMP_DECLARE_SIMD) != C_ORT_OMP && ort != C_ORT_ACC)
 	    OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_POINTER);
 	  else if (TREE_CODE (t) == COMPONENT_REF)
-	    OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_ALWAYS_POINTER);
+	    {
+	      gomp_map_kind k = (ort == C_ORT_ACC) ? GOMP_MAP_ATTACH_DETACH
+						   : GOMP_MAP_ALWAYS_POINTER;
+	      OMP_CLAUSE_SET_MAP_KIND (c2, k);
+	    }
 	  else if (REFERENCE_REF_P (t)
 		   && TREE_CODE (TREE_OPERAND (t, 0)) == COMPONENT_REF)
 	    {
 	      t = TREE_OPERAND (t, 0);
-	      OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_ALWAYS_POINTER);
+	      gomp_map_kind k = (ort == C_ORT_ACC) ? GOMP_MAP_ATTACH_DETACH
+						   : GOMP_MAP_ALWAYS_POINTER;
+	      OMP_CLAUSE_SET_MAP_KIND (c2, k);
 	    }
 	  else
 	    OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_FIRSTPRIVATE_POINTER);
@@ -5658,7 +5714,8 @@
    Return true if there is some error and the clause should be removed.  */
 
 static bool
-finish_omp_reduction_clause (tree c, bool *need_default_ctor, bool *need_dtor)
+finish_omp_reduction_clause (tree c, enum c_omp_region_type ort,
+			     bool *need_default_ctor, bool *need_dtor)
 {
   tree t = OMP_CLAUSE_DECL (c);
   bool predefined = false;
@@ -5903,9 +5960,11 @@
     *need_dtor = true;
   else
     {
-      error_at (OMP_CLAUSE_LOCATION (c),
-		"user defined reduction not found for %qE",
-		omp_clause_printable_decl (t));
+      /* There are no user-defined reductions for OpenACC (as of 2.6).  */
+      if (ort & C_ORT_OMP)
+	error_at (OMP_CLAUSE_LOCATION (c),
+		  "user defined reduction not found for %qE",
+		  omp_clause_printable_decl (t));
       return true;
     }
   if (TREE_CODE (OMP_CLAUSE_DECL (c)) == MEM_REF)
@@ -6136,6 +6195,41 @@
   return ret;
 }
 
+/* Ensure that pointers are used in OpenACC attach and detach clauses.
+   Return true if an error has been detected.  */
+
+static bool
+cp_oacc_check_attachments (tree c)
+{
+  if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP)
+    return false;
+
+  /* OpenACC attach / detach clauses must be pointers.  */
+  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH
+      || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_DETACH)
+    {
+      tree t = OMP_CLAUSE_DECL (c);
+      tree type;
+
+      while (TREE_CODE (t) == TREE_LIST)
+	t = TREE_CHAIN (t);
+
+      type = TREE_TYPE (t);
+
+      if (TREE_CODE (type) == REFERENCE_TYPE)
+	type = TREE_TYPE (type);
+
+      if (TREE_CODE (type) != POINTER_TYPE)
+	{
+	  error_at (OMP_CLAUSE_LOCATION (c), "expected pointer in %qs clause",
+		    c_omp_map_clause_name (c, true));
+	  return true;
+	}
+    }
+
+  return false;
+}
+
 /* For all elements of CLAUSES, validate them vs OpenMP constraints.
    Remove any elements from the list that are invalid.  */
 
@@ -6153,6 +6247,7 @@
   tree last_iterators = NULL_TREE;
   bool last_iterators_remove = false;
   bool reduction_seen = false;
+  bool oacc_gang_seen = false;
 
   bitmap_obstack_initialize (NULL);
   bitmap_initialize (&generic_head, &bitmap_default_obstack);
@@ -6167,10 +6262,15 @@
 
   if (ort & C_ORT_ACC)
     for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
-      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
-	{
+      switch (OMP_CLAUSE_CODE (c))
+        {
+	case OMP_CLAUSE_ASYNC:
 	  oacc_async = true;
 	  break;
+	case OMP_CLAUSE_GANG:
+	  oacc_gang_seen = true;
+	  break;
+	default:;
 	}
 
   for (pc = &clauses, c = clauses; c ; c = *pc)
@@ -6187,6 +6287,13 @@
 	  field_ok = ((ort & C_ORT_OMP_DECLARE_SIMD) == C_ORT_OMP);
 	  goto check_dup_generic;
 	case OMP_CLAUSE_REDUCTION:
+	  if (oacc_gang_seen && oacc_get_fn_attrib (current_function_decl))
+	    {
+	      error_at (OMP_CLAUSE_LOCATION (c),
+			"gang reduction on an orphan loop");
+	      remove = true;
+	      break;
+	    }
 	  reduction_seen = true;
 	  /* FALLTHRU */
 	case OMP_CLAUSE_IN_REDUCTION:
@@ -6376,7 +6483,7 @@
 	    t = OMP_CLAUSE_DECL (c);
 	check_dup_generic_t:
 	  if (t == current_class_ptr
-	      && (ort != C_ORT_OMP_DECLARE_SIMD
+	      && ((ort != C_ORT_OMP_DECLARE_SIMD && ort != C_ORT_ACC)
 		  || (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_LINEAR
 		      && OMP_CLAUSE_CODE (c) != OMP_CLAUSE_UNIFORM)))
 	    {
@@ -6440,8 +6547,7 @@
 	handle_field_decl:
 	  if (!remove
 	      && TREE_CODE (t) == FIELD_DECL
-	      && t == OMP_CLAUSE_DECL (c)
-	      && ort != C_ORT_ACC)
+	      && t == OMP_CLAUSE_DECL (c))
 	    {
 	      OMP_CLAUSE_DECL (c)
 		= omp_privatize_field (t, (OMP_CLAUSE_CODE (c)
@@ -6508,7 +6614,7 @@
 	    omp_note_field_privatization (t, OMP_CLAUSE_DECL (c));
 	  else
 	    t = OMP_CLAUSE_DECL (c);
-	  if (t == current_class_ptr)
+	  if (ort != C_ORT_ACC && t == current_class_ptr)
 	    {
 	      error_at (OMP_CLAUSE_LOCATION (c),
 			"%<this%> allowed in OpenMP only in %<declare simd%>"
@@ -6995,7 +7101,7 @@
 	    }
 	  if (t == error_mark_node)
 	    remove = true;
-	  else if (t == current_class_ptr)
+	  else if (ort != C_ORT_ACC && t == current_class_ptr)
 	    {
 	      error_at (OMP_CLAUSE_LOCATION (c),
 			"%<this%> allowed in OpenMP only in %<declare simd%>"
@@ -7089,6 +7195,7 @@
 				"array section does not have mappable type "
 				"in %qs clause",
 				omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+		      cp_omp_emit_unmappable_type_notes (TREE_TYPE (t));
 		      remove = true;
 		    }
 		  while (TREE_CODE (t) == ARRAY_REF)
@@ -7125,6 +7232,8 @@
 			}
 		    }
 		}
+	      if (cp_oacc_check_attachments (c))
+		remove = true;
 	      break;
 	    }
 	  if (t == error_mark_node)
@@ -7132,14 +7241,25 @@
 	      remove = true;
 	      break;
 	    }
+	  /* OpenACC attach / detach clauses must be pointers.  */
+	  if (cp_oacc_check_attachments (c))
+	    {
+	      remove = true;
+	      break;
+	    }
 	  if (REFERENCE_REF_P (t)
 	      && TREE_CODE (TREE_OPERAND (t, 0)) == COMPONENT_REF)
 	    {
 	      t = TREE_OPERAND (t, 0);
 	      OMP_CLAUSE_DECL (c) = t;
 	    }
+	  if (ort == C_ORT_ACC
+	      && TREE_CODE (t) == COMPONENT_REF
+	      && TREE_CODE (TREE_OPERAND (t, 0)) == INDIRECT_REF)
+	    t = TREE_OPERAND (TREE_OPERAND (t, 0), 0);
 	  if (TREE_CODE (t) == COMPONENT_REF
-	      && (ort & C_ORT_OMP_DECLARE_SIMD) == C_ORT_OMP
+	      && ((ort & C_ORT_OMP_DECLARE_SIMD) == C_ORT_OMP
+		  || ort == C_ORT_ACC)
 	      && OMP_CLAUSE_CODE (c) != OMP_CLAUSE__CACHE_)
 	    {
 	      if (type_dependent_expression_p (t))
@@ -7157,6 +7277,7 @@
 		  error_at (OMP_CLAUSE_LOCATION (c),
 			    "%qE does not have a mappable type in %qs clause",
 			    t, omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+		  cp_omp_emit_unmappable_type_notes (TREE_TYPE (t));
 		  remove = true;
 		}
 	      while (TREE_CODE (t) == COMPONENT_REF)
@@ -7188,7 +7309,8 @@
 		break;
 	      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
 		  && (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER
-		      || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_POINTER))
+		      || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_POINTER
+		      || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH_DETACH))
 		break;
 	      if (DECL_P (t))
 		error_at (OMP_CLAUSE_LOCATION (c),
@@ -7235,6 +7357,7 @@
 	      error_at (OMP_CLAUSE_LOCATION (c),
 			"%qD does not have a mappable type in %qs clause", t,
 			omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+	      cp_omp_emit_unmappable_type_notes (TREE_TYPE (t));
 	      remove = true;
 	    }
 	  else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
@@ -7269,7 +7392,9 @@
 	      else
 		bitmap_set_bit (&generic_head, DECL_UID (t));
 	    }
-	  else if (bitmap_bit_p (&map_head, DECL_UID (t)))
+	  else if (bitmap_bit_p (&map_head, DECL_UID (t))
+		   && (ort != C_ORT_ACC
+		       || !bitmap_bit_p (&map_field_head, DECL_UID (t))))
 	    {
 	      if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP)
 		error_at (OMP_CLAUSE_LOCATION (c),
@@ -7324,7 +7449,12 @@
 		  tree c2 = build_omp_clause (OMP_CLAUSE_LOCATION (c),
 					      OMP_CLAUSE_MAP);
 		  if (TREE_CODE (t) == COMPONENT_REF)
-		    OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_ALWAYS_POINTER);
+		    {
+		      gomp_map_kind k
+			= (ort == C_ORT_ACC) ? GOMP_MAP_ATTACH_DETACH
+					     : GOMP_MAP_ALWAYS_POINTER;
+		      OMP_CLAUSE_SET_MAP_KIND (c2, k);
+		    }
 		  else
 		    OMP_CLAUSE_SET_MAP_KIND (c2,
 					     GOMP_MAP_FIRSTPRIVATE_REFERENCE);
@@ -7383,6 +7513,7 @@
 	      error_at (OMP_CLAUSE_LOCATION (c),
 			"%qD does not have a mappable type in %qs clause", t,
 			omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+	      cp_omp_emit_unmappable_type_notes (TREE_TYPE (t));
 	      remove = true;
 	    }
 	  if (remove)
@@ -7548,6 +7679,7 @@
 	case OMP_CLAUSE_SEQ:
 	case OMP_CLAUSE_IF_PRESENT:
 	case OMP_CLAUSE_FINALIZE:
+	case OMP_CLAUSE_NOHOST:
 	  break;
 
 	case OMP_CLAUSE_TILE:
@@ -7752,7 +7884,7 @@
 	case OMP_CLAUSE_REDUCTION:
 	case OMP_CLAUSE_IN_REDUCTION:
 	case OMP_CLAUSE_TASK_REDUCTION:
-	  if (finish_omp_reduction_clause (c, &need_default_ctor,
+	  if (finish_omp_reduction_clause (c, ort, &need_default_ctor,
 					   &need_dtor))
 	    remove = true;
 	  else
diff --git a/gcc/d/d-spec.cc b/gcc/d/d-spec.cc
index 9eba690..e423852 100644
--- a/gcc/d/d-spec.cc
+++ b/gcc/d/d-spec.cc
@@ -505,7 +505,7 @@
 lang_specific_pre_link (void)
 {
   if ((phobos_library != PHOBOS_NOLINK && need_phobos) || need_spec)
-    do_spec ("%:include(libgphobos.spec)");
+    do_spec ("%:include(libgphobos.spec)", 0);
 
   return 0;
 }
diff --git a/gcc/doc/ChangeLog.omp b/gcc/doc/ChangeLog.omp
new file mode 100644
index 0000000..1f5bfdd
--- /dev/null
+++ b/gcc/doc/ChangeLog.omp
@@ -0,0 +1,4 @@
+2019-01-09  Julian Brown  <julian@codesourcery.com>
+
+	* invoke.texi: Update mention of OpenACC version to 2.6.
+
diff --git a/gcc/doc/generic.texi b/gcc/doc/generic.texi
index 15f6cfe..e9804d8 100644
--- a/gcc/doc/generic.texi
+++ b/gcc/doc/generic.texi
@@ -2384,6 +2384,7 @@
 @tindex OACC_KERNELS
 @tindex OACC_LOOP
 @tindex OACC_PARALLEL
+@tindex OACC_SERIAL
 @tindex OACC_UPDATE
 
 All the statements starting with @code{OACC_} represent directives and
@@ -2428,6 +2429,10 @@
 
 Represents @code{#pragma acc parallel [clause1 @dots{} clauseN]}.
 
+@item OACC_SERIAL
+
+Represents @code{#pragma acc serial [clause1 @dots{} clauseN]}.
+
 @item OACC_UPDATE
 
 Represents @code{#pragma acc update [clause1 @dots{} clauseN]}.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 0ab6c9c..b3c486a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -198,7 +198,7 @@
 -aux-info @var{filename}  -fallow-parameterless-variadic-functions @gol
 -fno-asm  -fno-builtin  -fno-builtin-@var{function}  -fgimple@gol
 -fhosted  -ffreestanding @gol
--fopenacc  -fopenacc-dim=@var{geom} @gol
+-fopenacc  -fopenacc-dim=@var{geom}  -fopenacc-kernels=@var{style} @gol
 -fopenmp  -fopenmp-simd @gol
 -fms-extensions  -fplan9-extensions  -fsso-struct=@var{endianness} @gol
 -fallow-single-precision  -fcond-mismatch  -flax-vector-conversions @gol
@@ -2173,7 +2173,7 @@
 Enable handling of OpenACC directives @code{#pragma acc} in C/C++ and
 @code{!$acc} in Fortran.  When @option{-fopenacc} is specified, the
 compiler generates accelerated code according to the OpenACC Application
-Programming Interface v2.0 @w{@uref{https://www.openacc.org}}.  This option
+Programming Interface v2.6 @w{@uref{https://www.openacc.org}}.  This option
 implies @option{-pthread}, and thus is only supported on targets that
 have support for @option{-pthread}.
 
@@ -2182,8 +2182,23 @@
 @cindex OpenACC accelerator programming
 Specify default compute dimensions for parallel offload regions that do
 not explicitly specify.  The @var{geom} value is a triple of
-':'-separated sizes, in order 'gang', 'worker' and, 'vector'.  A size
-can be omitted, to use a target-specific default value.
+':'-separated sizes, in order 'gang', 'worker' and, 'vector'.  If a size
+is to be deferred until execution '-' can be used, alternatively a size
+can be omitted to use a target-specific default value.  When deferring
+to runtime, the environment variable @var{GOMP_OPENACC_DIM} can be set.
+It has the same format as the option value, except that '-' is not
+permitted.
+
+@item -fopenacc-kernels=@var{style}
+@opindex fopenacc-kernels
+@cindex OpenACC accelerator programming
+Configure OpenACC 'kernels' constructs handling.
+With @option{-fopenacc-kernels=split}, OpenACC 'kernels' constructs
+are split into a sequence of compute constructs, each then handled
+individually.
+This is the default.
+With @option{-fopenacc-kernels=parloops}, the whole OpenACC
+'kernels' constructs is handled by the @samp{parloops} pass.
 
 @item -fopenmp
 @opindex fopenmp
@@ -14105,6 +14120,10 @@
 This flag does not have a negative form, because it specifies a
 three-way choice.
 
+Note that this flag may conflict with the @option{-ffixed-form} as
+well as @option{-ffixed-line-length-none} and
+@option{-ffixed-line-length-<n>} options of the Fortran front end.
+
 @item -fcall-used-@var{reg}
 @opindex fcall-used
 Treat the register named @var{reg} as an allocable register that is
@@ -16208,6 +16227,9 @@
 @item gfx900
 Compile for GCN5 Vega 10 devices (gfx900).
 
+@item gfx906
+Compile for GCN5 Vega 20 devices (gfx906).
+
 @end table
 
 @item -mstack-size=@var{bytes}
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 8c8978b..e44f805 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6154,6 +6154,34 @@
 constant of type @var{type}.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_GOACC_EXPAND_ACCEL_VAR (tree @var{var})
+This hook, if defined, is used by accelerator target back-ends to expand
+specially handled kinds of VAR_DECL expressions.  A particular use is to
+place variables with specific attributes inside special accelarator
+memories.  A return value of NULL indicates that the target does not
+handle this VAR_DECL, and normal RTL expanding is resumed.
+@end deftypefn
+
+@deftypefn {Target Hook} void TARGET_GOACC_ADJUST_PRIVATE_DECL (tree @var{var}, @var{int})
+Tweak variable declaration for a private variable at the specified
+parallelism level.
+@end deftypefn
+
+@deftypevr {Target Hook} bool TARGET_GOACC_WORKER_PARTITIONING
+Use gimple transformation for worker neutering/broadcasting.
+@end deftypevr
+
+@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_PROPAGATION_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name})
+Create a record used to propagate local-variable state from an active
+worker to other workers.  A possible implementation might adjust the type
+of REC to place the new variable in shared GPU memory.
+@end deftypefn
+
+@deftypefn {Target Hook} bool TARGET_GOACC_EXPLODE_ARGS (void)
+Define this hook to TRUE if arguments to offload regions should be
+exploded, i.e. passed as true arguments rather than in an argument array.
+@end deftypefn
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index fe1194e..74a1b03 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4208,6 +4208,16 @@
 
 @hook TARGET_PREFERRED_ELSE_VALUE
 
+@hook TARGET_GOACC_EXPAND_ACCEL_VAR
+
+@hook TARGET_GOACC_ADJUST_PRIVATE_DECL
+
+@hook TARGET_GOACC_WORKER_PARTITIONING
+
+@hook TARGET_GOACC_CREATE_PROPAGATION_RECORD
+
+@hook TARGET_GOACC_EXPLODE_ARGS
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
diff --git a/gcc/expr.c b/gcc/expr.c
index fa15b7e..50e90e0 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -9963,8 +9963,19 @@
       exp = SSA_NAME_VAR (ssa_name);
       goto expand_decl_rtl;
 
-    case PARM_DECL:
     case VAR_DECL:
+      /* Allow accel compiler to handle specific cases of variables,
+	 specifically those tagged with the "oacc gangprivate" attribute,
+	 which may be intended to be placed in special memory in GPUs.  */
+      if (flag_openacc && targetm.goacc.expand_accel_var)
+	{
+	  temp = targetm.goacc.expand_accel_var (exp);
+	  if (temp)
+	    return temp;
+	}
+      /* ... fall through ...  */
+
+    case PARM_DECL:
       /* If a static var's type was incomplete when the decl was written,
 	 but the type is complete now, lay out the decl now.  */
       if (DECL_SIZE (exp) == 0
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index a210328..ce32607 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -354,4 +354,11 @@
   CF_FULL = CF_BRANCH | CF_RETURN,
   CF_SET = 1 << 2
 };
+
+/* OpenACC 'kernels' constructs handling.  */
+enum openacc_kernels
+{
+  OPENACC_KERNELS_SPLIT,
+  OPENACC_KERNELS_PARLOOPS
+};
 #endif /* ! GCC_FLAG_TYPES_H */
diff --git a/gcc/fortran/ChangeLog.omp b/gcc/fortran/ChangeLog.omp
new file mode 100644
index 0000000..c619eaa
--- /dev/null
+++ b/gcc/fortran/ChangeLog.omp
@@ -0,0 +1,274 @@
+2019-10-09  Tobias Burnus  <tobias@codesourcery.com>
+
+	* hooks.c (hook_tree_tree_null): New.
+	* hooks.h (hook_tree_tree_null): Declare.
+	* langhooks-def.h (LANG_HOOKS_OMP_ARRAY_DATA): Define.
+	(LANG_HOOKS_FOR_TYPES_INITIALIZER): Use it.
+	* langhooks.h (lang_hooks_for_types): Add omp_array_data.
+	* omp-general.c (omp_is_optional_argument): Handle value+optional.
+	* omp-low.c (omp_context): Add array_data_map + present_map.
+	(install_var_field): Handle array descriptors.
+	(delete_omp_context): Free new maps.
+	(scan_sharing_clauses): Handle array descriptors.
+	(lower_omp_target): Ditto. Fix optional-arg present check.
+
+2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	* parse.c (parse_executable): Add missing ST_OMP_TARGET_SIMD.
+
+2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	* match.h (gfc_match_omp_eos_error): Renamed from gfc_match_omp_eos.
+	* openmp.c (gfc_match_omp_eos): Make static.
+	(gfc_match_omp_eos_error): New.
+	* parse.c (matchs, matchdo, matchds): Do as done for 'matcho' -
+	if error occurred after OpenMP/OpenACC directive matched, do not
+	try other directives.
+	(decode_oacc_directive, decode_omp_directive): Call new function
+	instead.
+
+2019-10-02  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-10-02  Tobias Burnus  <tobias@codesourcery.com>
+
+	* openmp.c (gfc_match_omp_clauses): Show a clause-parsing
+	error if none was rised before.
+	* parse.c (matcha, matcho): If error occurred after
+	OpenMP/OpenACC directive matched, do not try other directives.
+
+2019-09-20  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-09-20  Tobias Burnus  <tobias@codesourcery.com>
+
+	* openmp.c (gfc_resolve_oacc_declare): Reject all
+	non variables but accept function result variables.
+	* trans-openmp.c (gfc_trans_omp_clauses): Handle
+	function-result variables for remaing cases.
+
+2019-09-17  Tobias Burnus  <tobias@codesourcery.com>
+
+	* trans-expr.c (gfc_auto_dereference_var): Use passed loc argument.
+
+2019-09-06  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	2019-07-05  Andrew Stubbs  <ams@codesourcery.com>
+
+	* openmp.c (resolve_omp_clauses): Add custom error messages for
+	parameters in map clauses.
+
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* trans-openmp.c (gfc_omp_finish_clause): Change clauses mapping
+	assumed-size arrays to use the GOMP_MAP_FORCE_PRESENT map type.
+
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* openmp.c (resolve_oacc_data_clauses): Allow polymorphic allocatable
+	variables.
+	* trans-expr.c (gfc_conv_component_ref,
+	conv_parent_component_reference): Make global.
+	(gfc_auto_dereference_var): New function, broken out of...
+	(gfc_conv_variable): ...here. Call outlined function instead.
+	* trans-openmp.c (gfc_trans_omp_array_section): New function, broken out
+	of...
+	(gfc_trans_omp_clauses): ...here. Separate out OpenACC derived
+	type/polymorphic class pointer handling. Call above outlined function.
+	* trans.h (gfc_conv_component_ref, conv_parent_component_references,
+	gfc_auto_dereference_var): Add prototypes.
+
+2019-05-19  Julian Brown  <julian@codesourcery.com>
+
+	* trans-openmp.c (gfc_omp_finish_clause): Guard addition of clauses for
+	pointers with DECL_P.
+
+2019-01-09  Julian Brown  <julian@codesourcery.com>
+
+	* cpp.c (cpp_define_builtins): Update _OPENACC define to 201711.
+	* gfortran.texi: Update mentions of OpenACC version to 2.6.
+	* intrinsic.texi: Likewise.
+
+2019-01-23  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* lang.opt (fopenacc-kernels): Default to "split".
+
+2019-01-30  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* lang.opt (fopenacc-kernels): New flag.
+
+2018-12-19  Julian Brown  <julian@codesourcery.com>
+            Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* gfortran.h (gfc_omp_map_op): Add OMP_MAP_NO_ALLOC.
+	* openmp.c (omp_mask2): Add OMP_CLAUSE_NO_CREATE.
+	(gfc_match_omp_clauses): Support no_create.
+	(OACC_PARALLEL_CLAUSES, OACC_KERNELS_CLAUSES)
+	(OACC_SERIAL_CLAUSES, OACC_DATA_CLAUSES): Add
+	OMP_CLAUSE_NO_CREATE.
+	* trans-openmp.c (gfc_trans_omp_clauses_1): Support
+	OMP_MAP_NO_ALLOC.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* gfortran.h (gfc_statement): Add ST_OACC_SERIAL_LOOP,
+	ST_OACC_END_SERIAL_LOOP, ST_OACC_SERIAL and ST_OACC_END_SERIAL
+	enumeration constants.
+	(gfc_exec_op): Add EXEC_OACC_SERIAL_LOOP and EXEC_OACC_SERIAL
+	enumeration constants.
+	* match.h (gfc_match_oacc_serial): New prototype.
+	(gfc_match_oacc_serial_loop): Likewise.
+	* dump-parse-tree.c (show_omp_node, show_code_node): Handle
+	EXEC_OACC_SERIAL_LOOP and EXEC_OACC_SERIAL.
+	* match.c (match_exit_cycle): Handle EXEC_OACC_SERIAL_LOOP.
+	* openmp.c (OACC_SERIAL_CLAUSES): New macro.
+	(OACC_SERIAL_CLAUSE_DEVICE_TYPE_MASK): Likewise.
+	(gfc_match_oacc_serial_loop): New function.
+	(gfc_match_oacc_serial): Likewise.
+	(oacc_is_loop): Handle EXEC_OACC_SERIAL_LOOP.
+	(resolve_omp_clauses): Handle EXEC_OACC_SERIAL.
+	(oacc_is_serial): New function.
+	(oacc_code_to_statement): Handle EXEC_OACC_SERIAL and
+	EXEC_OACC_SERIAL_LOOP.
+	(gfc_resolve_oacc_directive): Likewise.
+	* parse.c (decode_oacc_directive) <'s'>: Add case for "serial"
+	and "serial loop".
+	(next_statement): Handle ST_OACC_SERIAL_LOOP and ST_OACC_SERIAL.
+	(gfc_ascii_statement): Likewise.  Handle ST_OACC_END_SERIAL_LOOP
+	and ST_OACC_END_SERIAL.
+	(parse_oacc_structured_block): Handle ST_OACC_SERIAL.
+	(parse_oacc_loop): Handle ST_OACC_SERIAL_LOOP and
+	ST_OACC_END_SERIAL_LOOP.
+	(parse_executable): Handle ST_OACC_SERIAL_LOOP and
+	ST_OACC_SERIAL.
+	(is_oacc): Handle EXEC_OACC_SERIAL_LOOP and EXEC_OACC_SERIAL.
+	* resolve.c (gfc_resolve_blocks, gfc_resolve_code): Likewise.
+	* st.c (gfc_free_statement): Likewise.
+	* trans-openmp.c (gfc_trans_oacc_construct): Handle
+	EXEC_OACC_SERIAL.
+	(gfc_trans_oacc_combined_directive): Handle
+	EXEC_OACC_SERIAL_LOOP.
+	(gfc_trans_oacc_directive): Handle EXEC_OACC_SERIAL_LOOP and
+	EXEC_OACC_SERIAL.
+	* trans.c (trans_code): Likewise.
+
+2017-12-21  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* types.def: (BF_FN_VOID_INT_INT_OMPFN_SIZE_PTR_PTR_PTR_VAR):
+	Define.
+
+2018-12-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* openmp.c (OACC_HOST_DATA_CLAUSES): Add OMP_CLAUSE_IF and
+	OMP_CLAUSE_IF_PRESENT.
+
+2019-01-30  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* trans-openmp.c (gfc_build_conditional_assign): New.
+	(gfc_build_conditional_assign_expr): New.
+	(gfc_omp_finish_clause): Add conditionals to set the clause
+	declaration to null and size to zero if the declaration is a
+	non-present optional argument.
+	(gfc_trans_omp_clauses_1): Likewise.
+
+2018-10-04  Cesar Philippidis  <cesar@codesourcery.com>
+            Julian Brown  <julian@codesourcery.com>
+
+	* gfortran.h (enum gfc_omp_map_op): Add OMP_MAP_DECLARE_ALLOCATE,
+	OMP_MAP_DECLARE_DEALLOCATE.
+	(gfc_omp_clauses): Add update_allocatable.
+	* trans-array.c (gfc_array_allocate): Call
+	gfc_trans_oacc_declare_allocate for decls that have oacc_declare_create
+	attribute set.
+	* trans-decl.c (add_attributes_to_decl): Enable lowering of OpenACC
+	declare create, declare copyin and declare deviceptr clauses.
+	(find_module_oacc_declare_clauses): Relax oacc_declare_create to
+	OMP_MAP_ALLOC, and oacc_declare_copyin to OMP_MAP_TO, in order to
+	match OpenACC 2.5 semantics.
+	(finish_oacc_declare): Reset module_oacc_clauses before scanning each
+	namespace.
+	* trans-openmp.c (gfc_trans_omp_clauses): Use GOMP_MAP_ALWAYS_POINTER
+	(for update directive) or GOMP_MAP_FIRSTPRIVATE_POINTER (otherwise) for
+	allocatable scalar decls.  Handle OMP_MAP_DECLARE_{ALLOCATE,DEALLOCATE}
+	clauses.
+	(gfc_trans_oacc_executable_directive): Use GOMP_MAP_ALWAYS_POINTER
+	for allocatable scalar data clauses inside acc update directives.
+	(gfc_trans_oacc_declare_allocate): New function.
+	* trans-stmt.c (gfc_trans_allocate): Call
+	gfc_trans_oacc_declare_allocate for decls with oacc_declare_create
+	attribute set.
+	(gfc_trans_deallocate): Likewise.
+	* trans.h (gfc_trans_oacc_declare_allocate): Declare.
+
+2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
+            Nathan Sidwell  <nathan@acm.org>
+            Julian Brown  <julian@codesourcery.com>
+
+        * openmp.c (resolve_oacc_loop_blocks): Emit an error on orphan OpenACC
+        gang reductions.
+        * trans-openmp.c (gfc_omp_clause_copy_ctor): Permit reductions.
+
+2018-06-29  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+
+	* openmp.c (gfc_match_omp_map_clause): Re-write handling of the
+	deviceptr clause.  Add new common_blocks argument.  Propagate it to
+	gfc_match_omp_variable_list.
+	(gfc_match_omp_clauses): Update calls to gfc_match_omp_map_clauses.
+	(resolve_positive_int_expr): Promote the warning to an error.
+	(check_array_not_assumed): Remove pointer check.
+	(resolve_oacc_nested_loops): Error on do concurrent loops.
+	* trans-openmp.c (gfc_omp_finish_clause): Don't create pointer data
+	mappings for deviceptr clauses.
+	(gfc_trans_omp_clauses): Likewise.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* gfortran.h (gfc_omp_clauses): Add nohost members.
+	* openmp.c (omp_mask2): Add OMP_CLAUSE_NOHOST.
+	(gfc_match_omp_clauses): Handle OMP_CLAUSE_NOHOST.
+	(gfc_match_oacc_routine): Set oacc_function_nohost when appropriate.
+	* gfortran.h (symbol_attribute): Add oacc_function_nohost member.
+	* trans-openmp.c (gfc_add_omp_offload_attributes): Use it to decide
+	whether to generate an OMP_CLAUSE_NOHOST clause.
+	(gfc_trans_omp_clauses_1): Unreachable code to generate an
+	OMP_CLAUSE_NOHOST clause.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* gfortran.h (gfc_omp_map_op): Add OMP_MAP_ATTACH, OMP_MAP_DETACH.
+	* openmp.c (gfc_match_omp_variable_list): Add allow_derived parameter.
+	Parse derived-type member accesses if true.
+	(omp_mask2): Add OMP_CLAUSE_ATTACH and OMP_CLAUSE_DETACH.
+	(gfc_match_omp_map_clause): Add allow_derived parameter.  Pass to
+	gfc_match_omp_variable_list.
+	(gfc_match_omp_clauses): Support attach and detach.  Support derived
+	types for appropriate OpenACC directives.
+	(OACC_PARALLEL_CLAUSES, OACC_KERNELS_CLAUSES, OACC_DATA_CLAUSES)
+	(OACC_ENTER_DATA_CLAUSES): Add OMP_CLAUSE_ATTACH.
+	(OACC_EXIT_DATA_CLAUSES): Add OMP_CLAUSE_DETACH.
+	(check_symbol_not_pointer): Don't disallow pointer objects of derived
+	type.
+	(resolve_oacc_data_clauses): Don't disallow allocatable derived types.
+	(resolve_omp_clauses): Perform duplicate checking only for non-derived
+	type component accesses (plain variables and arrays or array sections).
+	Support component refs.
+	* trans-openmp.c (gfc_omp_privatize_by_reference): Support component
+	refs.
+	(gfc_trans_omp_clauses): Support component refs, attach and detach
+	clauses.
+
+2015-08-20  Joseph Myers  <joseph@codesourcery.com>
+
+	PR libgomp/81886
+	* gfortranspec.c (lang_specific_pre_link): Update call to do_spec.
+
diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c
index d3c4487..70f22d9 100644
--- a/gcc/fortran/cpp.c
+++ b/gcc/fortran/cpp.c
@@ -166,7 +166,7 @@
   cpp_define (pfile, "_LANGUAGE_FORTRAN=1");
 
   if (flag_openacc)
-    cpp_define (pfile, "_OPENACC=201306");
+    cpp_define (pfile, "_OPENACC=201711");
 
   if (flag_openmp)
     cpp_define (pfile, "_OPENMP=201511");
diff --git a/gcc/fortran/dump-parse-tree.c b/gcc/fortran/dump-parse-tree.c
index ebd7311..6b4163f 100644
--- a/gcc/fortran/dump-parse-tree.c
+++ b/gcc/fortran/dump-parse-tree.c
@@ -1618,6 +1618,8 @@
     case EXEC_OACC_PARALLEL: name = "PARALLEL"; is_oacc = true; break;
     case EXEC_OACC_KERNELS_LOOP: name = "KERNELS LOOP"; is_oacc = true; break;
     case EXEC_OACC_KERNELS: name = "KERNELS"; is_oacc = true; break;
+    case EXEC_OACC_SERIAL_LOOP: name = "SERIAL LOOP"; is_oacc = true; break;
+    case EXEC_OACC_SERIAL: name = "SERIAL"; is_oacc = true; break;
     case EXEC_OACC_DATA: name = "DATA"; is_oacc = true; break;
     case EXEC_OACC_HOST_DATA: name = "HOST_DATA"; is_oacc = true; break;
     case EXEC_OACC_LOOP: name = "LOOP"; is_oacc = true; break;
@@ -1693,6 +1695,8 @@
     case EXEC_OACC_PARALLEL:
     case EXEC_OACC_KERNELS_LOOP:
     case EXEC_OACC_KERNELS:
+    case EXEC_OACC_SERIAL_LOOP:
+    case EXEC_OACC_SERIAL:
     case EXEC_OACC_DATA:
     case EXEC_OACC_HOST_DATA:
     case EXEC_OACC_LOOP:
@@ -2878,6 +2882,8 @@
     case EXEC_OACC_PARALLEL:
     case EXEC_OACC_KERNELS_LOOP:
     case EXEC_OACC_KERNELS:
+    case EXEC_OACC_SERIAL_LOOP:
+    case EXEC_OACC_SERIAL:
     case EXEC_OACC_DATA:
     case EXEC_OACC_HOST_DATA:
     case EXEC_OACC_LOOP:
diff --git a/gcc/fortran/f95-lang.c b/gcc/fortran/f95-lang.c
index 3e3d304..a1cbd1b 100644
--- a/gcc/fortran/f95-lang.c
+++ b/gcc/fortran/f95-lang.c
@@ -112,6 +112,7 @@
 #undef LANG_HOOKS_MARK_ADDRESSABLE
 #undef LANG_HOOKS_TYPE_FOR_MODE
 #undef LANG_HOOKS_TYPE_FOR_SIZE
+#undef LANG_HOOKS_OMP_ARRAY_DATA
 #undef LANG_HOOKS_INIT_TS
 #undef LANG_HOOKS_OMP_PRIVATIZE_BY_REFERENCE
 #undef LANG_HOOKS_OMP_PREDETERMINED_SHARING
@@ -145,6 +146,7 @@
 #define LANG_HOOKS_TYPE_FOR_MODE	gfc_type_for_mode
 #define LANG_HOOKS_TYPE_FOR_SIZE	gfc_type_for_size
 #define LANG_HOOKS_INIT_TS		gfc_init_ts
+#define LANG_HOOKS_OMP_ARRAY_DATA		gfc_omp_array_data
 #define LANG_HOOKS_OMP_PRIVATIZE_BY_REFERENCE	gfc_omp_privatize_by_reference
 #define LANG_HOOKS_OMP_PREDETERMINED_SHARING	gfc_omp_predetermined_sharing
 #define LANG_HOOKS_OMP_REPORT_DECL		gfc_omp_report_decl
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index d7071ae..eda5c17 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -223,7 +223,8 @@
   ST_OACC_END_DATA, ST_OACC_HOST_DATA, ST_OACC_END_HOST_DATA, ST_OACC_LOOP,
   ST_OACC_END_LOOP, ST_OACC_DECLARE, ST_OACC_UPDATE, ST_OACC_WAIT,
   ST_OACC_CACHE, ST_OACC_KERNELS_LOOP, ST_OACC_END_KERNELS_LOOP,
-  ST_OACC_ENTER_DATA, ST_OACC_EXIT_DATA, ST_OACC_ROUTINE,
+  ST_OACC_SERIAL_LOOP, ST_OACC_END_SERIAL_LOOP, ST_OACC_SERIAL,
+  ST_OACC_END_SERIAL, ST_OACC_ENTER_DATA, ST_OACC_EXIT_DATA, ST_OACC_ROUTINE,
   ST_OACC_ATOMIC, ST_OACC_END_ATOMIC,
   ST_OMP_ATOMIC, ST_OMP_BARRIER, ST_OMP_CRITICAL, ST_OMP_END_ATOMIC,
   ST_OMP_END_CRITICAL, ST_OMP_END_DO, ST_OMP_END_MASTER, ST_OMP_END_ORDERED,
@@ -916,6 +917,7 @@
 
   /* OpenACC 'routine' directive's level of parallelism.  */
   ENUM_BITFIELD (oacc_routine_lop) oacc_routine_lop:3;
+  unsigned oacc_function_nohost:1;
 
   /* Attributes set by compiler extensions (!GCC$ ATTRIBUTES).  */
   unsigned ext_attr:EXT_ATTR_NUM;
@@ -1187,10 +1189,13 @@
 enum gfc_omp_map_op
 {
   OMP_MAP_ALLOC,
+  OMP_MAP_NO_ALLOC,
+  OMP_MAP_ATTACH,
   OMP_MAP_TO,
   OMP_MAP_FROM,
   OMP_MAP_TOFROM,
   OMP_MAP_DELETE,
+  OMP_MAP_DETACH,
   OMP_MAP_FORCE_ALLOC,
   OMP_MAP_FORCE_TO,
   OMP_MAP_FORCE_FROM,
@@ -1202,7 +1207,9 @@
   OMP_MAP_RELEASE,
   OMP_MAP_ALWAYS_TO,
   OMP_MAP_ALWAYS_FROM,
-  OMP_MAP_ALWAYS_TOFROM
+  OMP_MAP_ALWAYS_TOFROM,
+  OMP_MAP_DECLARE_ALLOCATE,
+  OMP_MAP_DECLARE_DEALLOCATE
 };
 
 enum gfc_omp_linear_op
@@ -1358,7 +1365,7 @@
   gfc_expr_list *tile_list;
   unsigned async:1, gang:1, worker:1, vector:1, seq:1, independent:1;
   unsigned par_auto:1, gang_static:1;
-  unsigned if_present:1, finalize:1;
+  unsigned if_present:1, finalize:1, nohost:1, update_allocatable:1;
   locus loc;
 
 }
@@ -2550,11 +2557,11 @@
   EXEC_BACKSPACE, EXEC_ENDFILE, EXEC_INQUIRE, EXEC_REWIND, EXEC_FLUSH,
   EXEC_FORM_TEAM, EXEC_CHANGE_TEAM, EXEC_END_TEAM, EXEC_SYNC_TEAM,
   EXEC_LOCK, EXEC_UNLOCK, EXEC_EVENT_POST, EXEC_EVENT_WAIT, EXEC_FAIL_IMAGE,
-  EXEC_OACC_KERNELS_LOOP, EXEC_OACC_PARALLEL_LOOP, EXEC_OACC_ROUTINE,
-  EXEC_OACC_PARALLEL, EXEC_OACC_KERNELS, EXEC_OACC_DATA, EXEC_OACC_HOST_DATA,
-  EXEC_OACC_LOOP, EXEC_OACC_UPDATE, EXEC_OACC_WAIT, EXEC_OACC_CACHE,
-  EXEC_OACC_ENTER_DATA, EXEC_OACC_EXIT_DATA, EXEC_OACC_ATOMIC,
-  EXEC_OACC_DECLARE,
+  EXEC_OACC_KERNELS_LOOP, EXEC_OACC_PARALLEL_LOOP, EXEC_OACC_SERIAL_LOOP,
+  EXEC_OACC_ROUTINE, EXEC_OACC_PARALLEL, EXEC_OACC_KERNELS, EXEC_OACC_SERIAL,
+  EXEC_OACC_DATA, EXEC_OACC_HOST_DATA, EXEC_OACC_LOOP, EXEC_OACC_UPDATE,
+  EXEC_OACC_WAIT, EXEC_OACC_CACHE, EXEC_OACC_ENTER_DATA, EXEC_OACC_EXIT_DATA,
+  EXEC_OACC_ATOMIC, EXEC_OACC_DECLARE,
   EXEC_OMP_CRITICAL, EXEC_OMP_DO, EXEC_OMP_FLUSH, EXEC_OMP_MASTER,
   EXEC_OMP_ORDERED, EXEC_OMP_PARALLEL, EXEC_OMP_PARALLEL_DO,
   EXEC_OMP_PARALLEL_SECTIONS, EXEC_OMP_PARALLEL_WORKSHARE,
diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi
index 733403a..62bd5d0 100644
--- a/gcc/fortran/gfortran.texi
+++ b/gcc/fortran/gfortran.texi
@@ -546,10 +546,8 @@
 Additionally, the GNU Fortran compilers supports the OpenMP specification
 (version 4.0 and most of the features of the 4.5 version,
 @url{http://openmp.org/@/wp/@/openmp-specifications/}).
-There also is initial support for the OpenACC specification (targeting
-version 2.0, @uref{http://www.openacc.org/}).
-Note that this is an experimental feature, incomplete, and subject to
-change in future versions of GCC.  See
+There also is support for the OpenACC specification (targeting
+version 2.6, @uref{http://www.openacc.org/}).  See
 @uref{https://gcc.gnu.org/wiki/OpenACC} for more information.
 
 @node Varying Length Character Strings
@@ -2204,7 +2202,7 @@
 
 GNU Fortran strives to be compatible to the
 @uref{http://www.openacc.org/, OpenACC Application Programming
-Interface v2.0}.
+Interface v2.6}.
 
 To enable the processing of the OpenACC directive @code{!$acc} in
 free-form source code; the @code{c$acc}, @code{*$acc} and @code{!$acc}
diff --git a/gcc/fortran/gfortranspec.c b/gcc/fortran/gfortranspec.c
index 33e6e57..a8e87d8 100644
--- a/gcc/fortran/gfortranspec.c
+++ b/gcc/fortran/gfortranspec.c
@@ -441,7 +441,7 @@
 lang_specific_pre_link (void)
 {
   if (library)
-    do_spec ("%:include(libgfortran.spec)");
+    do_spec ("%:include(libgfortran.spec)", 0);
 
   return 0;
 }
diff --git a/gcc/fortran/intrinsic.texi b/gcc/fortran/intrinsic.texi
index b39c576..a53ee6e 100644
--- a/gcc/fortran/intrinsic.texi
+++ b/gcc/fortran/intrinsic.texi
@@ -15081,7 +15081,7 @@
 @section OpenACC Module @code{OPENACC}
 @table @asis
 @item @emph{Standard}:
-OpenACC Application Programming Interface v2.0
+OpenACC Application Programming Interface v2.6
 @end table
 
 
@@ -15095,9 +15095,9 @@
 
 For details refer to the actual
 @uref{http://www.openacc.org/,
-OpenACC Application Programming Interface v2.0}.
+OpenACC Application Programming Interface v2.6}.
 
 @code{OPENACC} provides the scalar default-integer
 named constant @code{openacc_version} with a value of the form
 @var{yyyymm}, where @code{yyyy} is the year and @var{mm} the month
-of the OpenACC version; for OpenACC v2.0 the value is @code{201306}.
+of the OpenACC version; for OpenACC v2.6 the value is @code{201711}.
diff --git a/gcc/fortran/lang.opt b/gcc/fortran/lang.opt
index 90b09f9..a381a19 100644
--- a/gcc/fortran/lang.opt
+++ b/gcc/fortran/lang.opt
@@ -658,6 +658,10 @@
 Fortran LTO Joined Var(flag_openacc_dims)
 ; Documented in C
 
+fopenacc-kernels=
+Fortran RejectNegative Joined Enum(openacc_kernels) Var(flag_openacc_kernels) Init(OPENACC_KERNELS_SPLIT)
+; Documented in C
+
 fopenmp
 Fortran LTO
 ; Documented in C
diff --git a/gcc/fortran/match.c b/gcc/fortran/match.c
index 088b69f..97fab1d 100644
--- a/gcc/fortran/match.c
+++ b/gcc/fortran/match.c
@@ -2837,7 +2837,8 @@
       && o != NULL
       && o->state == COMP_OMP_STRUCTURED_BLOCK
       && (o->head->op == EXEC_OACC_LOOP
-	  || o->head->op == EXEC_OACC_PARALLEL_LOOP))
+	  || o->head->op == EXEC_OACC_PARALLEL_LOOP
+	  || o->head->op == EXEC_OACC_SERIAL_LOOP))
     {
       int collapse = 1;
       gcc_assert (o->head->next != NULL
diff --git a/gcc/fortran/match.h b/gcc/fortran/match.h
index 29854ee..7bc392b 100644
--- a/gcc/fortran/match.h
+++ b/gcc/fortran/match.h
@@ -145,11 +145,13 @@
 match gfc_match_oacc_parallel (void);
 match gfc_match_oacc_parallel_loop (void);
 match gfc_match_oacc_enter_data (void);
+match gfc_match_oacc_serial (void);
+match gfc_match_oacc_serial_loop (void);
 match gfc_match_oacc_exit_data (void);
 match gfc_match_oacc_routine (void);
 
 /* OpenMP directive matchers.  */
-match gfc_match_omp_eos (void);
+match gfc_match_omp_eos_error (void);
 match gfc_match_omp_atomic (void);
 match gfc_match_omp_barrier (void);
 match gfc_match_omp_cancel (void);
diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 83b1c44..6060941 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -31,7 +31,7 @@
 /* Match an end of OpenMP directive.  End of OpenMP directive is optional
    whitespace, followed by '\n' or comment '!'.  */
 
-match
+static match
 gfc_match_omp_eos (void)
 {
   locus old_loc;
@@ -57,6 +57,17 @@
   return MATCH_NO;
 }
 
+match
+gfc_match_omp_eos_error (void)
+{
+  if (gfc_match_omp_eos() == MATCH_YES)
+    return MATCH_YES;
+
+  gfc_error ("Unexpected junk at %C");
+  return MATCH_ERROR;
+}
+
+
 /* Free an omp_clauses structure.  */
 
 void
@@ -222,7 +233,8 @@
 gfc_match_omp_variable_list (const char *str, gfc_omp_namelist **list,
 			     bool allow_common, bool *end_colon = NULL,
 			     gfc_omp_namelist ***headp = NULL,
-			     bool allow_sections = false)
+			     bool allow_sections = false,
+			     bool allow_derived = false)
 {
   gfc_omp_namelist *head, *tail, *p;
   locus old_loc, cur_loc;
@@ -248,7 +260,8 @@
 	case MATCH_YES:
 	  gfc_expr *expr;
 	  expr = NULL;
-	  if (allow_sections && gfc_peek_ascii_char () == '(')
+	  if ((allow_sections && gfc_peek_ascii_char () == '(')
+	      || (allow_derived && gfc_peek_ascii_char () == '%'))
 	    {
 	      gfc_current_locus = cur_loc;
 	      m = gfc_match_variable (&expr, 0);
@@ -785,7 +798,7 @@
   OMP_MASK1_LAST
 };
 
-/* OpenACC 2.0 specific clauses. */
+/* OpenACC 2.0+ specific clauses. */
 enum omp_mask2
 {
   OMP_CLAUSE_ASYNC,
@@ -795,6 +808,7 @@
   OMP_CLAUSE_COPY,
   OMP_CLAUSE_COPYOUT,
   OMP_CLAUSE_CREATE,
+  OMP_CLAUSE_NO_CREATE,
   OMP_CLAUSE_PRESENT,
   OMP_CLAUSE_DEVICEPTR,
   OMP_CLAUSE_GANG,
@@ -811,6 +825,9 @@
   OMP_CLAUSE_TILE,
   OMP_CLAUSE_IF_PRESENT,
   OMP_CLAUSE_FINALIZE,
+  OMP_CLAUSE_ATTACH,
+  OMP_CLAUSE_DETACH,
+  OMP_CLAUSE_NOHOST,
   /* This must come last.  */
   OMP_MASK2_LAST
 };
@@ -914,10 +931,12 @@
    mapping.  */
 
 static bool
-gfc_match_omp_map_clause (gfc_omp_namelist **list, gfc_omp_map_op map_op)
+gfc_match_omp_map_clause (gfc_omp_namelist **list, gfc_omp_map_op map_op,
+			  bool common_blocks, bool allow_derived = false)
 {
   gfc_omp_namelist **head = NULL;
-  if (gfc_match_omp_variable_list ("", list, false, NULL, &head, true)
+  if (gfc_match_omp_variable_list ("", list, common_blocks, NULL, &head, true,
+				   allow_derived)
       == MATCH_YES)
     {
       gfc_omp_namelist *n;
@@ -939,6 +958,14 @@
 {
   gfc_omp_clauses *c = gfc_get_omp_clauses ();
   locus old_loc;
+  /* Determine whether we're dealing with an OpenACC directive that permits
+     derived type member accesses.  This in particular disallows
+     "!$acc declare" from using such accesses, because it's not clear if/how
+     that should work.  */
+  bool allow_derived = (openacc
+			&& ((mask & OMP_CLAUSE_ATTACH)
+			    || (mask & OMP_CLAUSE_DETACH)
+			    || (mask & OMP_CLAUSE_HOST_SELF)));
 
   gcc_checking_assert (OMP_MASK1_LAST <= 64 && OMP_MASK2_LAST <= 64);
   *cp = NULL;
@@ -1012,6 +1039,12 @@
 	      needs_space = true;
 	      continue;
 	    }
+	  if ((mask & OMP_CLAUSE_ATTACH)
+	      && gfc_match ("attach ( ") == MATCH_YES
+	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
+					   OMP_MAP_ATTACH, false,
+					   allow_derived))
+	    continue;
 	  break;
 	case 'c':
 	  if ((mask & OMP_CLAUSE_COLLAPSE)
@@ -1039,7 +1072,8 @@
 	  if ((mask & OMP_CLAUSE_COPY)
 	      && gfc_match ("copy ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_TOFROM))
+					   OMP_MAP_TOFROM, openacc,
+					   allow_derived))
 	    continue;
 	  if (mask & OMP_CLAUSE_COPYIN)
 	    {
@@ -1047,7 +1081,8 @@
 		{
 		  if (gfc_match ("copyin ( ") == MATCH_YES
 		      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-						   OMP_MAP_TO))
+						   OMP_MAP_TO, true,
+						   allow_derived))
 		    continue;
 		}
 	      else if (gfc_match_omp_variable_list ("copyin (",
@@ -1058,7 +1093,7 @@
 	  if ((mask & OMP_CLAUSE_COPYOUT)
 	      && gfc_match ("copyout ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FROM))
+					   OMP_MAP_FROM, true, allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_COPYPRIVATE)
 	      && gfc_match_omp_variable_list ("copyprivate (",
@@ -1068,7 +1103,7 @@
 	  if ((mask & OMP_CLAUSE_CREATE)
 	      && gfc_match ("create ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_ALLOC))
+					   OMP_MAP_ALLOC, true, allow_derived))
 	    continue;
 	  break;
 	case 'd':
@@ -1104,7 +1139,8 @@
 	  if ((mask & OMP_CLAUSE_DELETE)
 	      && gfc_match ("delete ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_RELEASE))
+					   OMP_MAP_RELEASE, true,
+					   allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_DEPEND)
 	      && gfc_match ("depend ( ") == MATCH_YES)
@@ -1147,6 +1183,12 @@
 	      else
 		gfc_current_locus = old_loc;
 	    }
+	  if ((mask & OMP_CLAUSE_DETACH)
+	      && gfc_match ("detach ( ") == MATCH_YES
+	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
+					   OMP_MAP_DETACH, false,
+					   allow_derived))
+	    continue;
 	  if ((mask & OMP_CLAUSE_DEVICE)
 	      && !openacc
 	      && c->device == NULL
@@ -1156,12 +1198,14 @@
 	      && openacc
 	      && gfc_match ("device ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FORCE_TO))
+					   OMP_MAP_FORCE_TO, false,
+					   allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_DEVICEPTR)
 	      && gfc_match ("deviceptr ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FORCE_DEVICEPTR))
+					   OMP_MAP_FORCE_DEVICEPTR, false,
+					   allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_DEVICE_RESIDENT)
 	      && gfc_match_omp_variable_list
@@ -1239,7 +1283,8 @@
 	  if ((mask & OMP_CLAUSE_HOST_SELF)
 	      && gfc_match ("host ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FORCE_FROM))
+					   OMP_MAP_FORCE_FROM, true,
+					   allow_derived))
 	    continue;
 	  break;
 	case 'i':
@@ -1432,6 +1477,12 @@
 	    }
 	  break;
 	case 'n':
+	  if ((mask & OMP_CLAUSE_NO_CREATE)
+	      && gfc_match ("no_create ( ") == MATCH_YES
+	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
+					   OMP_MAP_NO_ALLOC, true,
+					   allow_derived))
+	    continue;
 	  if ((mask & OMP_CLAUSE_NOGROUP)
 	      && !c->nogroup
 	      && gfc_match ("nogroup") == MATCH_YES)
@@ -1439,6 +1490,13 @@
 	      c->nogroup = needs_space = true;
 	      continue;
 	    }
+	  if ((mask & OMP_CLAUSE_NOHOST)
+	      && !c->nohost
+	      && gfc_match ("nohost") == MATCH_YES)
+	    {
+	      c->nohost = true;
+	      continue;
+	    }
 	  if ((mask & OMP_CLAUSE_NOTINBRANCH)
 	      && !c->notinbranch
 	      && !c->inbranch
@@ -1511,47 +1569,50 @@
 	  if ((mask & OMP_CLAUSE_COPY)
 	      && gfc_match ("pcopy ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_TOFROM))
+					   OMP_MAP_TOFROM, true,
+					   allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_COPYIN)
 	      && gfc_match ("pcopyin ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_TO))
+					   OMP_MAP_TO, true, allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_COPYOUT)
 	      && gfc_match ("pcopyout ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FROM))
+					   OMP_MAP_FROM, true, allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_CREATE)
 	      && gfc_match ("pcreate ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_ALLOC))
+					   OMP_MAP_ALLOC, true, allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_PRESENT)
 	      && gfc_match ("present ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FORCE_PRESENT))
+					   OMP_MAP_FORCE_PRESENT, false,
+					   allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_COPY)
 	      && gfc_match ("present_or_copy ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_TOFROM))
+					   OMP_MAP_TOFROM, true,
+					   allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_COPYIN)
 	      && gfc_match ("present_or_copyin ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_TO))
+					   OMP_MAP_TO, true, allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_COPYOUT)
 	      && gfc_match ("present_or_copyout ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FROM))
+					   OMP_MAP_FROM, true, allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_CREATE)
 	      && gfc_match ("present_or_create ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_ALLOC))
+					   OMP_MAP_ALLOC, true, allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_PRIORITY)
 	      && c->priority == NULL
@@ -1669,8 +1730,8 @@
 
 	      if (gfc_match_omp_variable_list (" :",
 					       &c->lists[OMP_LIST_REDUCTION],
-					       false, NULL, &head,
-					       openacc) == MATCH_YES)
+					       false, NULL, &head, openacc,
+					       allow_derived) == MATCH_YES)
 		{
 		  gfc_omp_namelist *n;
 		  if (rop == OMP_REDUCTION_NONE)
@@ -1769,7 +1830,8 @@
 	  if ((mask & OMP_CLAUSE_HOST_SELF)
 	      && gfc_match ("self ( ") == MATCH_YES
 	      && gfc_match_omp_map_clause (&c->lists[OMP_LIST_MAP],
-					   OMP_MAP_FORCE_FROM))
+					   OMP_MAP_FORCE_FROM, true,
+					   allow_derived))
 	    continue;
 	  if ((mask & OMP_CLAUSE_SEQ)
 	      && !c->seq
@@ -1922,6 +1984,8 @@
 
   if (gfc_match_omp_eos () != MATCH_YES)
     {
+      if (!gfc_error_flag_test ())
+	gfc_error ("Failed to match clause at %C");
       gfc_free_omp_clauses (c);
       return MATCH_ERROR;
     }
@@ -1935,19 +1999,28 @@
   (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_ASYNC | OMP_CLAUSE_NUM_GANGS	      \
    | OMP_CLAUSE_NUM_WORKERS | OMP_CLAUSE_VECTOR_LENGTH | OMP_CLAUSE_REDUCTION \
    | OMP_CLAUSE_COPY | OMP_CLAUSE_COPYIN | OMP_CLAUSE_COPYOUT		      \
-   | OMP_CLAUSE_CREATE | OMP_CLAUSE_PRESENT | OMP_CLAUSE_DEVICEPTR	      \
-   | OMP_CLAUSE_PRIVATE | OMP_CLAUSE_FIRSTPRIVATE | OMP_CLAUSE_DEFAULT	      \
-   | OMP_CLAUSE_WAIT)
+   | OMP_CLAUSE_CREATE | OMP_CLAUSE_NO_CREATE | OMP_CLAUSE_PRESENT	      \
+   | OMP_CLAUSE_DEVICEPTR | OMP_CLAUSE_PRIVATE | OMP_CLAUSE_FIRSTPRIVATE      \
+   | OMP_CLAUSE_DEFAULT | OMP_CLAUSE_WAIT | OMP_CLAUSE_ATTACH)
 #define OACC_KERNELS_CLAUSES \
   (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_ASYNC | OMP_CLAUSE_NUM_GANGS	      \
    | OMP_CLAUSE_NUM_WORKERS | OMP_CLAUSE_VECTOR_LENGTH | OMP_CLAUSE_DEVICEPTR \
    | OMP_CLAUSE_COPY | OMP_CLAUSE_COPYIN | OMP_CLAUSE_COPYOUT		      \
-   | OMP_CLAUSE_CREATE | OMP_CLAUSE_PRESENT | OMP_CLAUSE_DEFAULT	      \
-   | OMP_CLAUSE_WAIT)
+   | OMP_CLAUSE_CREATE | OMP_CLAUSE_NO_CREATE | OMP_CLAUSE_PRESENT	      \
+   | OMP_CLAUSE_DEFAULT | OMP_CLAUSE_WAIT | OMP_CLAUSE_ATTACH)
+#define OACC_SERIAL_CLAUSES \
+  (omp_mask (OMP_CLAUSE_ASYNC) | OMP_CLAUSE_WAIT			      \
+   | OMP_CLAUSE_IF							      \
+   | OMP_CLAUSE_REDUCTION						      \
+   | OMP_CLAUSE_COPY | OMP_CLAUSE_COPYIN | OMP_CLAUSE_COPYOUT		      \
+   | OMP_CLAUSE_CREATE | OMP_CLAUSE_NO_CREATE | OMP_CLAUSE_PRESENT	      \
+   | OMP_CLAUSE_DEVICEPTR						      \
+   | OMP_CLAUSE_PRIVATE | OMP_CLAUSE_FIRSTPRIVATE			      \
+   | OMP_CLAUSE_DEFAULT | OMP_CLAUSE_ATTACH)
 #define OACC_DATA_CLAUSES \
   (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_DEVICEPTR  | OMP_CLAUSE_COPY	      \
    | OMP_CLAUSE_COPYIN | OMP_CLAUSE_COPYOUT | OMP_CLAUSE_CREATE		      \
-   | OMP_CLAUSE_PRESENT)
+   | OMP_CLAUSE_NO_CREATE | OMP_CLAUSE_PRESENT | OMP_CLAUSE_ATTACH)
 #define OACC_LOOP_CLAUSES \
   (omp_mask (OMP_CLAUSE_COLLAPSE) | OMP_CLAUSE_GANG | OMP_CLAUSE_WORKER	      \
    | OMP_CLAUSE_VECTOR | OMP_CLAUSE_SEQ | OMP_CLAUSE_INDEPENDENT	      \
@@ -1957,7 +2030,10 @@
   (OACC_LOOP_CLAUSES | OACC_PARALLEL_CLAUSES)
 #define OACC_KERNELS_LOOP_CLAUSES \
   (OACC_LOOP_CLAUSES | OACC_KERNELS_CLAUSES)
-#define OACC_HOST_DATA_CLAUSES omp_mask (OMP_CLAUSE_USE_DEVICE)
+#define OACC_HOST_DATA_CLAUSES \
+  (omp_mask (OMP_CLAUSE_USE_DEVICE)					      \
+   | OMP_CLAUSE_IF							      \
+   | OMP_CLAUSE_IF_PRESENT)
 #define OACC_DECLARE_CLAUSES \
   (omp_mask (OMP_CLAUSE_COPY) | OMP_CLAUSE_COPYIN | OMP_CLAUSE_COPYOUT	      \
    | OMP_CLAUSE_CREATE | OMP_CLAUSE_DEVICEPTR | OMP_CLAUSE_DEVICE_RESIDENT    \
@@ -1968,15 +2044,21 @@
    | OMP_CLAUSE_DEVICE | OMP_CLAUSE_WAIT | OMP_CLAUSE_IF_PRESENT)
 #define OACC_ENTER_DATA_CLAUSES \
   (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_ASYNC | OMP_CLAUSE_WAIT	      \
-   | OMP_CLAUSE_COPYIN | OMP_CLAUSE_CREATE)
+   | OMP_CLAUSE_COPYIN | OMP_CLAUSE_CREATE | OMP_CLAUSE_ATTACH)
 #define OACC_EXIT_DATA_CLAUSES \
   (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_ASYNC | OMP_CLAUSE_WAIT	      \
-   | OMP_CLAUSE_COPYOUT | OMP_CLAUSE_DELETE | OMP_CLAUSE_FINALIZE)
+   | OMP_CLAUSE_COPYOUT | OMP_CLAUSE_DELETE | OMP_CLAUSE_FINALIZE	      \
+   | OMP_CLAUSE_DETACH)
 #define OACC_WAIT_CLAUSES \
   omp_mask (OMP_CLAUSE_ASYNC)
 #define OACC_ROUTINE_CLAUSES \
   (omp_mask (OMP_CLAUSE_GANG) | OMP_CLAUSE_WORKER | OMP_CLAUSE_VECTOR	      \
-   | OMP_CLAUSE_SEQ)
+   | OMP_CLAUSE_SEQ							      \
+   | OMP_CLAUSE_NOHOST)
+
+#define OACC_SERIAL_CLAUSE_DEVICE_TYPE_MASK \
+  (omp_mask (OMP_CLAUSE_ASYNC) | OMP_CLAUSE_WAIT			      \
+   | OMP_CLAUSE_DEVICE_TYPE)
 
 
 static match
@@ -2019,6 +2101,21 @@
 
 
 match
+gfc_match_oacc_serial_loop (void)
+{
+  return match_acc (EXEC_OACC_SERIAL_LOOP,
+		    OACC_SERIAL_CLAUSES | OACC_LOOP_CLAUSES);
+}
+
+
+match
+gfc_match_oacc_serial (void)
+{
+  return match_acc (EXEC_OACC_SERIAL, OACC_SERIAL_CLAUSES);
+}
+
+
+match
 gfc_match_oacc_data (void)
 {
   return match_acc (EXEC_OACC_DATA, OACC_DATA_CLAUSES);
@@ -2418,6 +2515,8 @@
 				       &old_loc))
 	goto cleanup;
       gfc_current_ns->proc_name->attr.oacc_routine_lop = lop;
+      gfc_current_ns->proc_name->attr.oacc_function_nohost
+	= c ? c->nohost : false;
     }
   else
     /* Something has gone wrong, possibly a syntax error.  */
@@ -3763,6 +3862,7 @@
 {
   return code->op == EXEC_OACC_PARALLEL_LOOP
 	 || code->op == EXEC_OACC_KERNELS_LOOP
+	 || code->op == EXEC_OACC_SERIAL_LOOP
 	 || code->op == EXEC_OACC_LOOP;
 }
 
@@ -3783,8 +3883,8 @@
   if (expr->expr_type == EXPR_CONSTANT
       && expr->ts.type == BT_INTEGER
       && mpz_sgn (expr->value.integer) <= 0)
-    gfc_warning (0, "INTEGER expression of %s clause at %L must be positive",
-		 clause, &expr->where);
+    gfc_error ("INTEGER expression of %s clause at %L must be positive",
+	       clause, &expr->where);
 }
 
 static void
@@ -3804,9 +3904,6 @@
 static void
 check_symbol_not_pointer (gfc_symbol *sym, locus loc, const char *name)
 {
-  if (sym->ts.type == BT_DERIVED && sym->attr.pointer)
-    gfc_error ("POINTER object %qs of derived type in %s clause at %L",
-	       sym->name, name, &loc);
   if (sym->ts.type == BT_DERIVED && sym->attr.cray_pointer)
     gfc_error ("Cray pointer object %qs of derived type in %s clause at %L",
 	       sym->name, name, &loc);
@@ -3842,24 +3939,11 @@
   if (sym->as && sym->as->type == AS_ASSUMED_RANK)
     gfc_error ("Assumed rank array %qs in %s clause at %L",
 	       sym->name, name, &loc);
-  if (sym->as && sym->as->type == AS_DEFERRED && sym->attr.pointer
-      && !sym->attr.contiguous)
-    gfc_error ("Noncontiguous deferred shape array %qs in %s clause at %L",
-	       sym->name, name, &loc);
 }
 
 static void
 resolve_oacc_data_clauses (gfc_symbol *sym, locus loc, const char *name)
 {
-  if (sym->ts.type == BT_DERIVED && sym->attr.allocatable)
-    gfc_error ("ALLOCATABLE object %qs of derived type in %s clause at %L",
-	       sym->name, name, &loc);
-  if ((sym->ts.type == BT_ASSUMED && sym->attr.allocatable)
-      || (sym->ts.type == BT_CLASS && CLASS_DATA (sym)
-	  && CLASS_DATA (sym)->attr.allocatable))
-    gfc_error ("ALLOCATABLE object %qs of polymorphic type "
-	       "in %s clause at %L", sym->name, name, &loc);
-  check_symbol_not_pointer (sym, loc, name);
   check_array_not_assumed (sym, loc, name);
 }
 
@@ -4208,8 +4292,21 @@
 		  continue;
 	      }
 	  }
-	gfc_error ("Object %qs is not a variable at %L", n->sym->name,
-		   &n->where);
+	if (list == OMP_LIST_MAP
+	    && n->sym->attr.flavor == FL_PARAMETER)
+	  {
+	    if (openacc)
+	      gfc_error ("Object %qs is not a variable at %L; parameters"
+			 " cannot be and need not be copied", n->sym->name,
+			 &n->where);
+	    else
+	      gfc_error ("Object %qs is not a variable at %L; parameters"
+			 " cannot be and need not be mapped", n->sym->name,
+			 &n->where);
+	  }
+	else
+	  gfc_error ("Object %qs is not a variable at %L", n->sym->name,
+		     &n->where);
       }
 
   for (list = 0; list < OMP_LIST_NUM; list++)
@@ -4223,11 +4320,26 @@
 	&& (list != OMP_LIST_REDUCTION || !openacc))
       for (n = omp_clauses->lists[list]; n; n = n->next)
 	{
-	  if (n->sym->mark)
-	    gfc_error ("Symbol %qs present on multiple clauses at %L",
-		       n->sym->name, &n->where);
-	  else
-	    n->sym->mark = 1;
+	  bool array_only_p = true;
+	  /* Disallow duplicate bare variable references and multiple
+	     subarrays of the same array here, but allow multiple components of
+	     the same (e.g. derived-type) variable.  For the latter, duplicate
+	     components are detected elsewhere.  */
+	  if (openacc && n->expr && n->expr->expr_type == EXPR_VARIABLE)
+	    for (gfc_ref *ref = n->expr->ref; ref; ref = ref->next)
+	      if (ref->type != REF_ARRAY)
+	        {
+		  array_only_p = false;
+		  break;
+		}
+	  if (array_only_p)
+	    {
+	      if (n->sym->mark)
+		gfc_error ("Symbol %qs present on multiple clauses at %L",
+			   n->sym->name, &n->where);
+	      else
+		n->sym->mark = 1;
+	    }
 	}
 
   gcc_assert (OMP_LIST_LASTPRIVATE == OMP_LIST_FIRSTPRIVATE + 1);
@@ -4418,23 +4530,43 @@
 				 "are allowed on ORDERED directive at %L",
 				 &n->where);
 		  }
+		gfc_ref *array_ref = NULL;
+		bool resolved = false;
 		if (n->expr)
 		  {
-		    if (!gfc_resolve_expr (n->expr)
+		    array_ref = n->expr->ref;
+		    resolved = gfc_resolve_expr (n->expr);
+
+		    /* Look through component refs to find last array
+		       reference.  */
+		    if (openacc)
+		      while (resolved
+			     && array_ref
+			     && (array_ref->type == REF_COMPONENT
+				 || (array_ref->type == REF_ARRAY
+				     && array_ref->next
+			             && (array_ref->next->type
+					 == REF_COMPONENT))))
+			array_ref = array_ref->next;
+		  }
+		if (array_ref
+		    || (n->expr
+			&& (!resolved || n->expr->expr_type != EXPR_VARIABLE)))
+		  {
+		    if (!resolved
 			|| n->expr->expr_type != EXPR_VARIABLE
-			|| n->expr->ref == NULL
-			|| n->expr->ref->next
-			|| n->expr->ref->type != REF_ARRAY)
+			|| array_ref->next
+			|| array_ref->type != REF_ARRAY)
 		      gfc_error ("%qs in %s clause at %L is not a proper "
 				 "array section", n->sym->name, name,
 				 &n->where);
-		    else if (n->expr->ref->u.ar.codimen)
+		    else if (array_ref->u.ar.codimen)
 		      gfc_error ("Coarrays not supported in %s clause at %L",
 				 name, &n->where);
 		    else
 		      {
 			int i;
-			gfc_array_ref *ar = &n->expr->ref->u.ar;
+			gfc_array_ref *ar = &array_ref->u.ar;
 			for (i = 0; i < ar->dimen; i++)
 			  if (ar->stride[i])
 			    {
@@ -4579,7 +4711,9 @@
 				 n->sym->name, name, &n->where);
 		  }
 		if (code
-		    && (oacc_is_loop (code) || code->op == EXEC_OACC_PARALLEL))
+		    && (oacc_is_loop (code)
+			|| code->op == EXEC_OACC_PARALLEL
+			|| code->op == EXEC_OACC_SERIAL))
 		  check_array_not_assumed (n->sym, n->where, name);
 		else if (n->sym->as && n->sym->as->type == AS_ASSUMED_SIZE)
 		  gfc_error ("Assumed size array %qs in %s clause at %L",
@@ -5730,6 +5864,12 @@
   return code->op == EXEC_OACC_PARALLEL || code->op == EXEC_OACC_PARALLEL_LOOP;
 }
 
+static bool
+oacc_is_serial (gfc_code *code)
+{
+  return code->op == EXEC_OACC_SERIAL || code->op == EXEC_OACC_SERIAL_LOOP;
+}
+
 static gfc_statement
 omp_code_to_statement (gfc_code *code)
 {
@@ -5846,6 +5986,8 @@
       return ST_OACC_PARALLEL;
     case EXEC_OACC_KERNELS:
       return ST_OACC_KERNELS;
+    case EXEC_OACC_SERIAL:
+      return ST_OACC_SERIAL;
     case EXEC_OACC_DATA:
       return ST_OACC_DATA;
     case EXEC_OACC_HOST_DATA:
@@ -5854,6 +5996,8 @@
       return ST_OACC_PARALLEL_LOOP;
     case EXEC_OACC_KERNELS_LOOP:
       return ST_OACC_KERNELS_LOOP;
+    case EXEC_OACC_SERIAL_LOOP:
+      return ST_OACC_SERIAL_LOOP;
     case EXEC_OACC_LOOP:
       return ST_OACC_LOOP;
     case EXEC_OACC_ATOMIC:
@@ -6009,6 +6153,18 @@
   if (!oacc_is_loop (code))
     return;
 
+  if (code->op == EXEC_OACC_LOOP
+      && code->ext.omp_clauses->lists[OMP_LIST_REDUCTION]
+      && code->ext.omp_clauses->gang)
+    {
+      fortran_omp_context *c;
+      for (c = omp_current_ctx; c; c = c->previous)
+	if (!oacc_is_loop (c->code))
+	  break;
+      if (c == NULL || !oacc_is_parallel (c->code))
+	gfc_error ("gang reduction on an orphan loop at %L", &code->loc);
+    }
+
   if (code->ext.omp_clauses->tile_list && code->ext.omp_clauses->gang
       && code->ext.omp_clauses->worker && code->ext.omp_clauses->vector)
     gfc_error ("Tiled loop cannot be parallelized across gangs, workers and "
@@ -6124,18 +6280,14 @@
 	for (n = oc->clauses->lists[list]; n; n = n->next)
 	  {
 	    n->sym->mark = 0;
-	    if (n->sym->attr.function || n->sym->attr.subroutine)
+	    if (n->sym->attr.flavor != FL_VARIABLE
+		&& (n->sym->attr.flavor != FL_PROCEDURE
+		    || n->sym->result != n->sym))
 	      {
 		gfc_error ("Object %qs is not a variable at %L",
 			   n->sym->name, &oc->loc);
 		continue;
 	      }
-	    if (n->sym->attr.flavor == FL_PARAMETER)
-	      {
-		gfc_error ("PARAMETER object %qs is not allowed at %L",
-			   n->sym->name, &oc->loc);
-		continue;
-	      }
 
 	    if (n->expr && n->expr->ref->type == REF_ARRAY)
 	      {
@@ -6209,6 +6361,7 @@
     {
     case EXEC_OACC_PARALLEL:
     case EXEC_OACC_KERNELS:
+    case EXEC_OACC_SERIAL:
     case EXEC_OACC_DATA:
     case EXEC_OACC_HOST_DATA:
     case EXEC_OACC_UPDATE:
@@ -6220,6 +6373,7 @@
       break;
     case EXEC_OACC_PARALLEL_LOOP:
     case EXEC_OACC_KERNELS_LOOP:
+    case EXEC_OACC_SERIAL_LOOP:
     case EXEC_OACC_LOOP:
       resolve_oacc_loop (code);
       break;
diff --git a/gcc/fortran/parse.c b/gcc/fortran/parse.c
index 9e662a6..3f4d182 100644
--- a/gcc/fortran/parse.c
+++ b/gcc/fortran/parse.c
@@ -607,13 +607,18 @@
 
 /* Like match and if spec_only, goto do_spec_only without actually
    matching.  */
+/* If the directive matched but the clauses failed, do not start
+   matching the next directive in the same switch statement. */
 #define matcha(keyword, subr, st)				\
     do {							\
+      match m2;							\
       if (spec_only && gfc_match (keyword) == MATCH_YES)	\
 	goto do_spec_only;					\
-      else if (match_word (keyword, subr, &old_locus)		\
+      else if ((m2 = match_word (keyword, subr, &old_locus))	\
 	       == MATCH_YES)					\
 	return st;						\
+      else if (m2 == MATCH_ERROR)				\
+	goto error_handling;					\
       else							\
 	undo_new_statement ();				  	\
     } while (0)
@@ -667,15 +672,15 @@
       match ("declare", gfc_match_oacc_declare, ST_OACC_DECLARE);
       break;
     case 'e':
-      matcha ("end atomic", gfc_match_omp_eos, ST_OACC_END_ATOMIC);
-      matcha ("end data", gfc_match_omp_eos, ST_OACC_END_DATA);
-      matcha ("end host_data", gfc_match_omp_eos, ST_OACC_END_HOST_DATA);
-      matcha ("end kernels loop", gfc_match_omp_eos, ST_OACC_END_KERNELS_LOOP);
-      matcha ("end kernels", gfc_match_omp_eos, ST_OACC_END_KERNELS);
-      matcha ("end loop", gfc_match_omp_eos, ST_OACC_END_LOOP);
-      matcha ("end parallel loop", gfc_match_omp_eos,
+      matcha ("end atomic", gfc_match_omp_eos_error, ST_OACC_END_ATOMIC);
+      matcha ("end data", gfc_match_omp_eos_error, ST_OACC_END_DATA);
+      matcha ("end host_data", gfc_match_omp_eos_error, ST_OACC_END_HOST_DATA);
+      matcha ("end kernels loop", gfc_match_omp_eos_error, ST_OACC_END_KERNELS_LOOP);
+      matcha ("end kernels", gfc_match_omp_eos_error, ST_OACC_END_KERNELS);
+      matcha ("end loop", gfc_match_omp_eos_error, ST_OACC_END_LOOP);
+      matcha ("end parallel loop", gfc_match_omp_eos_error,
 	      ST_OACC_END_PARALLEL_LOOP);
-      matcha ("end parallel", gfc_match_omp_eos, ST_OACC_END_PARALLEL);
+      matcha ("end parallel", gfc_match_omp_eos_error, ST_OACC_END_PARALLEL);
       matcha ("enter data", gfc_match_oacc_enter_data, ST_OACC_ENTER_DATA);
       matcha ("exit data", gfc_match_oacc_exit_data, ST_OACC_EXIT_DATA);
       break;
@@ -698,6 +703,10 @@
     case 'r':
       match ("routine", gfc_match_oacc_routine, ST_OACC_ROUTINE);
       break;
+    case 's':
+      matcha ("serial loop", gfc_match_oacc_serial_loop, ST_OACC_SERIAL_LOOP);
+      matcha ("serial", gfc_match_oacc_serial, ST_OACC_SERIAL);
+      break;
     case 'u':
       matcha ("update", gfc_match_oacc_update, ST_OACC_UPDATE);
       break;
@@ -709,6 +718,7 @@
   /* Directive not found or stored an error message.
      Check and give up.  */
 
+ error_handling:
   if (gfc_error_check () == 0)
     gfc_error_now ("Unclassifiable OpenACC directive at %C");
 
@@ -730,32 +740,40 @@
    and if spec_only, goto do_spec_only without actually matching.  */
 #define matchs(keyword, subr, st)				\
     do {							\
+      match m2;							\
       if (spec_only && gfc_match (keyword) == MATCH_YES)	\
 	goto do_spec_only;					\
-      if (match_word_omp_simd (keyword, subr, &old_locus,	\
-			       &simd_matched) == MATCH_YES)	\
+      if ((m2 = match_word_omp_simd (keyword, subr, &old_locus,	\
+			       &simd_matched)) == MATCH_YES)	\
 	{							\
 	  ret = st;						\
 	  goto finish;						\
 	}							\
+      else if (m2 == MATCH_ERROR)				\
+	goto error_handling;					\
       else							\
 	undo_new_statement ();				  	\
     } while (0)
 
 /* Like match, but don't match anything if not -fopenmp
    and if spec_only, goto do_spec_only without actually matching.  */
+/* If the directive matched but the clauses failed, do not start
+   matching the next directive in the same switch statement. */
 #define matcho(keyword, subr, st)				\
     do {							\
+      match m2;							\
       if (!flag_openmp)						\
 	;							\
       else if (spec_only && gfc_match (keyword) == MATCH_YES)	\
 	goto do_spec_only;					\
-      else if (match_word (keyword, subr, &old_locus)		\
+      else if ((m2 = match_word (keyword, subr, &old_locus))	\
 	       == MATCH_YES)					\
 	{							\
 	  ret = st;						\
 	  goto finish;						\
 	}							\
+      else if (m2 == MATCH_ERROR)				\
+	goto error_handling;					\
       else							\
 	undo_new_statement ();				  	\
     } while (0)
@@ -763,12 +781,15 @@
 /* Like match, but set a flag simd_matched if keyword matched.  */
 #define matchds(keyword, subr, st)				\
     do {							\
-      if (match_word_omp_simd (keyword, subr, &old_locus,	\
-			       &simd_matched) == MATCH_YES)	\
+      match m2;							\
+      if ((m2 = match_word_omp_simd (keyword, subr, &old_locus,	\
+			       &simd_matched)) == MATCH_YES)	\
 	{							\
 	  ret = st;						\
 	  goto finish;						\
 	}							\
+      else if (m2 == MATCH_ERROR)				\
+	goto error_handling;					\
       else							\
 	undo_new_statement ();				  	\
     } while (0)
@@ -776,14 +797,17 @@
 /* Like match, but don't match anything if not -fopenmp.  */
 #define matchdo(keyword, subr, st)				\
     do {							\
+      match m2;							\
       if (!flag_openmp)						\
 	;							\
-      else if (match_word (keyword, subr, &old_locus)		\
+      else if ((m2 = match_word (keyword, subr, &old_locus))	\
 	       == MATCH_YES)					\
 	{							\
 	  ret = st;						\
 	  goto finish;						\
 	}							\
+      else if (m2 == MATCH_ERROR)				\
+	goto error_handling;					\
       else							\
 	undo_new_statement ();				  	\
     } while (0)
@@ -876,63 +900,63 @@
       matcho ("do", gfc_match_omp_do, ST_OMP_DO);
       break;
     case 'e':
-      matcho ("end atomic", gfc_match_omp_eos, ST_OMP_END_ATOMIC);
+      matcho ("end atomic", gfc_match_omp_eos_error, ST_OMP_END_ATOMIC);
       matcho ("end critical", gfc_match_omp_end_critical, ST_OMP_END_CRITICAL);
-      matchs ("end distribute parallel do simd", gfc_match_omp_eos,
+      matchs ("end distribute parallel do simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_DISTRIBUTE_PARALLEL_DO_SIMD);
-      matcho ("end distribute parallel do", gfc_match_omp_eos,
+      matcho ("end distribute parallel do", gfc_match_omp_eos_error,
 	      ST_OMP_END_DISTRIBUTE_PARALLEL_DO);
-      matchs ("end distribute simd", gfc_match_omp_eos,
+      matchs ("end distribute simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_DISTRIBUTE_SIMD);
-      matcho ("end distribute", gfc_match_omp_eos, ST_OMP_END_DISTRIBUTE);
+      matcho ("end distribute", gfc_match_omp_eos_error, ST_OMP_END_DISTRIBUTE);
       matchs ("end do simd", gfc_match_omp_end_nowait, ST_OMP_END_DO_SIMD);
       matcho ("end do", gfc_match_omp_end_nowait, ST_OMP_END_DO);
-      matchs ("end simd", gfc_match_omp_eos, ST_OMP_END_SIMD);
-      matcho ("end master", gfc_match_omp_eos, ST_OMP_END_MASTER);
-      matchs ("end ordered", gfc_match_omp_eos, ST_OMP_END_ORDERED);
-      matchs ("end parallel do simd", gfc_match_omp_eos,
+      matchs ("end simd", gfc_match_omp_eos_error, ST_OMP_END_SIMD);
+      matcho ("end master", gfc_match_omp_eos_error, ST_OMP_END_MASTER);
+      matchs ("end ordered", gfc_match_omp_eos_error, ST_OMP_END_ORDERED);
+      matchs ("end parallel do simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_PARALLEL_DO_SIMD);
-      matcho ("end parallel do", gfc_match_omp_eos, ST_OMP_END_PARALLEL_DO);
-      matcho ("end parallel sections", gfc_match_omp_eos,
+      matcho ("end parallel do", gfc_match_omp_eos_error, ST_OMP_END_PARALLEL_DO);
+      matcho ("end parallel sections", gfc_match_omp_eos_error,
 	      ST_OMP_END_PARALLEL_SECTIONS);
-      matcho ("end parallel workshare", gfc_match_omp_eos,
+      matcho ("end parallel workshare", gfc_match_omp_eos_error,
 	      ST_OMP_END_PARALLEL_WORKSHARE);
-      matcho ("end parallel", gfc_match_omp_eos, ST_OMP_END_PARALLEL);
+      matcho ("end parallel", gfc_match_omp_eos_error, ST_OMP_END_PARALLEL);
       matcho ("end sections", gfc_match_omp_end_nowait, ST_OMP_END_SECTIONS);
       matcho ("end single", gfc_match_omp_end_single, ST_OMP_END_SINGLE);
-      matcho ("end target data", gfc_match_omp_eos, ST_OMP_END_TARGET_DATA);
-      matchs ("end target parallel do simd", gfc_match_omp_eos,
+      matcho ("end target data", gfc_match_omp_eos_error, ST_OMP_END_TARGET_DATA);
+      matchs ("end target parallel do simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_TARGET_PARALLEL_DO_SIMD);
-      matcho ("end target parallel do", gfc_match_omp_eos,
+      matcho ("end target parallel do", gfc_match_omp_eos_error,
 	      ST_OMP_END_TARGET_PARALLEL_DO);
-      matcho ("end target parallel", gfc_match_omp_eos,
+      matcho ("end target parallel", gfc_match_omp_eos_error,
 	      ST_OMP_END_TARGET_PARALLEL);
-      matchs ("end target simd", gfc_match_omp_eos, ST_OMP_END_TARGET_SIMD);
+      matchs ("end target simd", gfc_match_omp_eos_error, ST_OMP_END_TARGET_SIMD);
       matchs ("end target teams distribute parallel do simd",
-	      gfc_match_omp_eos,
+	      gfc_match_omp_eos_error,
 	      ST_OMP_END_TARGET_TEAMS_DISTRIBUTE_PARALLEL_DO_SIMD);
-      matcho ("end target teams distribute parallel do", gfc_match_omp_eos,
+      matcho ("end target teams distribute parallel do", gfc_match_omp_eos_error,
 	      ST_OMP_END_TARGET_TEAMS_DISTRIBUTE_PARALLEL_DO);
-      matchs ("end target teams distribute simd", gfc_match_omp_eos,
+      matchs ("end target teams distribute simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_TARGET_TEAMS_DISTRIBUTE_SIMD);
-      matcho ("end target teams distribute", gfc_match_omp_eos,
+      matcho ("end target teams distribute", gfc_match_omp_eos_error,
 	      ST_OMP_END_TARGET_TEAMS_DISTRIBUTE);
-      matcho ("end target teams", gfc_match_omp_eos, ST_OMP_END_TARGET_TEAMS);
-      matcho ("end target", gfc_match_omp_eos, ST_OMP_END_TARGET);
-      matcho ("end taskgroup", gfc_match_omp_eos, ST_OMP_END_TASKGROUP);
-      matchs ("end taskloop simd", gfc_match_omp_eos,
+      matcho ("end target teams", gfc_match_omp_eos_error, ST_OMP_END_TARGET_TEAMS);
+      matcho ("end target", gfc_match_omp_eos_error, ST_OMP_END_TARGET);
+      matcho ("end taskgroup", gfc_match_omp_eos_error, ST_OMP_END_TASKGROUP);
+      matchs ("end taskloop simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_TASKLOOP_SIMD);
-      matcho ("end taskloop", gfc_match_omp_eos, ST_OMP_END_TASKLOOP);
-      matcho ("end task", gfc_match_omp_eos, ST_OMP_END_TASK);
-      matchs ("end teams distribute parallel do simd", gfc_match_omp_eos,
+      matcho ("end taskloop", gfc_match_omp_eos_error, ST_OMP_END_TASKLOOP);
+      matcho ("end task", gfc_match_omp_eos_error, ST_OMP_END_TASK);
+      matchs ("end teams distribute parallel do simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_TEAMS_DISTRIBUTE_PARALLEL_DO_SIMD);
-      matcho ("end teams distribute parallel do", gfc_match_omp_eos,
+      matcho ("end teams distribute parallel do", gfc_match_omp_eos_error,
 	      ST_OMP_END_TEAMS_DISTRIBUTE_PARALLEL_DO);
-      matchs ("end teams distribute simd", gfc_match_omp_eos,
+      matchs ("end teams distribute simd", gfc_match_omp_eos_error,
 	      ST_OMP_END_TEAMS_DISTRIBUTE_SIMD);
-      matcho ("end teams distribute", gfc_match_omp_eos,
+      matcho ("end teams distribute", gfc_match_omp_eos_error,
 	      ST_OMP_END_TEAMS_DISTRIBUTE);
-      matcho ("end teams", gfc_match_omp_eos, ST_OMP_END_TEAMS);
+      matcho ("end teams", gfc_match_omp_eos_error, ST_OMP_END_TEAMS);
       matcho ("end workshare", gfc_match_omp_end_nowait,
 	      ST_OMP_END_WORKSHARE);
       break;
@@ -966,7 +990,7 @@
       break;
     case 's':
       matcho ("sections", gfc_match_omp_sections, ST_OMP_SECTIONS);
-      matcho ("section", gfc_match_omp_eos, ST_OMP_SECTION);
+      matcho ("section", gfc_match_omp_eos_error, ST_OMP_SECTION);
       matcho ("single", gfc_match_omp_single, ST_OMP_SINGLE);
       break;
     case 't':
@@ -1028,6 +1052,7 @@
      not -fopenmp and simd_matched is false, i.e. if a directive other
      than one marked with match has been seen.  */
 
+ error_handling:
   if (flag_openmp || simd_matched)
     {
       if (!gfc_error_check ())
@@ -1560,7 +1585,8 @@
   case ST_CRITICAL: \
   case ST_OACC_PARALLEL_LOOP: case ST_OACC_PARALLEL: case ST_OACC_KERNELS: \
   case ST_OACC_DATA: case ST_OACC_HOST_DATA: case ST_OACC_LOOP: \
-  case ST_OACC_KERNELS_LOOP: case ST_OACC_ATOMIC
+  case ST_OACC_KERNELS_LOOP: case ST_OACC_SERIAL_LOOP: case ST_OACC_SERIAL: \
+  case ST_OACC_ATOMIC
 
 /* Declaration statements */
 
@@ -2128,6 +2154,18 @@
     case ST_OACC_END_KERNELS_LOOP:
       p = "!$ACC END KERNELS LOOP";
       break;
+    case ST_OACC_SERIAL_LOOP:
+      p = "!$ACC SERIAL LOOP";
+      break;
+    case ST_OACC_END_SERIAL_LOOP:
+      p = "!$ACC END SERIAL LOOP";
+      break;
+    case ST_OACC_SERIAL:
+      p = "!$ACC SERIAL";
+      break;
+    case ST_OACC_END_SERIAL:
+      p = "!$ACC END SERIAL";
+      break;
     case ST_OACC_DATA:
       p = "!$ACC DATA";
       break;
@@ -4961,6 +4999,9 @@
     case ST_OACC_KERNELS:
       acc_end_st = ST_OACC_END_KERNELS;
       break;
+    case ST_OACC_SERIAL:
+      acc_end_st = ST_OACC_END_SERIAL;
+      break;
     case ST_OACC_DATA:
       acc_end_st = ST_OACC_END_DATA;
       break;
@@ -5045,6 +5086,7 @@
     gfc_warning (0, "Redundant !$ACC END LOOP at %C");
   if ((acc_st == ST_OACC_PARALLEL_LOOP && st == ST_OACC_END_PARALLEL_LOOP) ||
       (acc_st == ST_OACC_KERNELS_LOOP && st == ST_OACC_END_KERNELS_LOOP) ||
+      (acc_st == ST_OACC_SERIAL_LOOP && st == ST_OACC_END_SERIAL_LOOP) ||
       (acc_st == ST_OACC_LOOP && st == ST_OACC_END_LOOP))
     {
       gcc_assert (new_st.op == EXEC_NOP);
@@ -5383,6 +5425,7 @@
 
 	case ST_OACC_PARALLEL_LOOP:
 	case ST_OACC_KERNELS_LOOP:
+	case ST_OACC_SERIAL_LOOP:
 	case ST_OACC_LOOP:
 	  st = parse_oacc_loop (st);
 	  if (st == ST_IMPLIED_ENDDO)
@@ -5391,6 +5434,7 @@
 
 	case ST_OACC_PARALLEL:
 	case ST_OACC_KERNELS:
+	case ST_OACC_SERIAL:
 	case ST_OACC_DATA:
 	case ST_OACC_HOST_DATA:
 	  parse_oacc_structured_block (st);
@@ -5429,6 +5473,7 @@
 	case ST_OMP_SIMD:
 	case ST_OMP_TARGET_PARALLEL_DO:
 	case ST_OMP_TARGET_PARALLEL_DO_SIMD:
+	case ST_OMP_TARGET_SIMD:
 	case ST_OMP_TARGET_TEAMS_DISTRIBUTE:
 	case ST_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_DO:
 	case ST_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_DO_SIMD:
@@ -6432,6 +6477,8 @@
     case EXEC_OACC_PARALLEL:
     case EXEC_OACC_KERNELS_LOOP:
     case EXEC_OACC_KERNELS:
+    case EXEC_OACC_SERIAL_LOOP:
+    case EXEC_OACC_SERIAL:
     case EXEC_OACC_DATA:
     case EXEC_OACC_HOST_DATA:
     case EXEC_OACC_LOOP:
diff --git a/gcc/fortran/resolve.c b/gcc/fortran/resolve.c
index a4ad26e..203adc7 100644
--- a/gcc/fortran/resolve.c
+++ b/gcc/fortran/resolve.c
@@ -10361,6 +10361,8 @@
 	case EXEC_OACC_PARALLEL:
 	case EXEC_OACC_KERNELS_LOOP:
 	case EXEC_OACC_KERNELS:
+	case EXEC_OACC_SERIAL_LOOP:
+	case EXEC_OACC_SERIAL:
 	case EXEC_OACC_DATA:
 	case EXEC_OACC_HOST_DATA:
 	case EXEC_OACC_LOOP:
@@ -11321,6 +11323,8 @@
 	    case EXEC_OACC_PARALLEL:
 	    case EXEC_OACC_KERNELS_LOOP:
 	    case EXEC_OACC_KERNELS:
+	    case EXEC_OACC_SERIAL_LOOP:
+	    case EXEC_OACC_SERIAL:
 	    case EXEC_OACC_DATA:
 	    case EXEC_OACC_HOST_DATA:
 	    case EXEC_OACC_LOOP:
@@ -11730,6 +11734,8 @@
 	case EXEC_OACC_PARALLEL:
 	case EXEC_OACC_KERNELS_LOOP:
 	case EXEC_OACC_KERNELS:
+	case EXEC_OACC_SERIAL_LOOP:
+	case EXEC_OACC_SERIAL:
 	case EXEC_OACC_DATA:
 	case EXEC_OACC_HOST_DATA:
 	case EXEC_OACC_LOOP:
diff --git a/gcc/fortran/st.c b/gcc/fortran/st.c
index ade2fce..69641e3 100644
--- a/gcc/fortran/st.c
+++ b/gcc/fortran/st.c
@@ -201,6 +201,8 @@
     case EXEC_OACC_PARALLEL:
     case EXEC_OACC_KERNELS_LOOP:
     case EXEC_OACC_KERNELS:
+    case EXEC_OACC_SERIAL_LOOP:
+    case EXEC_OACC_SERIAL:
     case EXEC_OACC_DATA:
     case EXEC_OACC_HOST_DATA:
     case EXEC_OACC_LOOP:
diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
index 2e5eb4f..e13420b 100644
--- a/gcc/fortran/trans-array.c
+++ b/gcc/fortran/trans-array.c
@@ -142,6 +142,9 @@
   tree field, type, t;
 
   type = TREE_TYPE (desc);
+  if (TREE_CODE (type) == REFERENCE_TYPE)
+    type = TREE_TYPE (type);
+
   gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
 
   field = TYPE_FIELDS (type);
@@ -5796,6 +5799,7 @@
   gfc_ref *ref, *prev_ref = NULL, *coref;
   bool allocatable, coarray, dimension, alloc_w_e3_arr_spec = false,
       non_ulimate_coarray_ptr_comp;
+  bool oacc_declare = false;
 
   ref = expr->ref;
 
@@ -5810,6 +5814,7 @@
       allocatable = expr->symtree->n.sym->attr.allocatable;
       dimension = expr->symtree->n.sym->attr.dimension;
       non_ulimate_coarray_ptr_comp = false;
+      oacc_declare = expr->symtree->n.sym->attr.oacc_declare_create;
     }
   else
     {
@@ -5994,6 +5999,8 @@
       gfc_conv_descriptor_offset_set (&set_descriptor_block, se->expr, offset);
       tmp = fold_convert (gfc_array_index_type, element_size);
       gfc_conv_descriptor_span_set (&set_descriptor_block, se->expr, tmp);
+      if (oacc_declare)
+        gfc_trans_oacc_declare_allocate (&set_descriptor_block, expr, true);
     }
 
   set_descriptor = gfc_finish_block (&set_descriptor_block);
diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c
index 71f751d..a8ccf6a 100644
--- a/gcc/fortran/trans-decl.c
+++ b/gcc/fortran/trans-decl.c
@@ -1425,9 +1425,16 @@
   if (sym_attr.omp_declare_target_link)
     list = tree_cons (get_identifier ("omp declare target link"),
 		      NULL_TREE, list);
-  else if (sym_attr.omp_declare_target)
-    list = tree_cons (get_identifier ("omp declare target"),
-		      NULL_TREE, list);
+  else if (sym_attr.omp_declare_target
+	   || sym_attr.oacc_declare_create
+	   || sym_attr.oacc_declare_copyin
+	   || sym_attr.oacc_declare_deviceptr)
+    {
+      tree c = NULL_TREE;
+      if (sym_attr.oacc_function_nohost)
+	c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_NOHOST);
+      list = tree_cons (get_identifier ("omp declare target"), c, list);
+    }
 
   if (sym_attr.oacc_routine_lop != OACC_ROUTINE_LOP_NONE)
     {
@@ -6523,10 +6530,10 @@
       gfc_omp_map_op map_op;
 
       if (sym->attr.oacc_declare_create)
-	map_op = OMP_MAP_FORCE_ALLOC;
+	map_op = OMP_MAP_ALLOC;
 
       if (sym->attr.oacc_declare_copyin)
-	map_op = OMP_MAP_FORCE_TO;
+	map_op = OMP_MAP_TO;
 
       if (sym->attr.oacc_declare_deviceptr)
 	map_op = OMP_MAP_FORCE_DEVICEPTR;
@@ -6555,6 +6562,8 @@
   gfc_omp_clauses *omp_clauses = NULL;
   gfc_omp_namelist *n, *p;
 
+  module_oacc_clauses = NULL;
+
   gfc_traverse_ns (ns, find_module_oacc_declare_clauses);
 
   if (module_oacc_clauses && sym->attr.flavor == FL_PROGRAM)
@@ -6566,7 +6575,6 @@
       new_oc->clauses = module_oacc_clauses;
 
       ns->oacc_declare = new_oc;
-      module_oacc_clauses = NULL;
     }
 
   if (!ns->oacc_declare)
diff --git a/gcc/fortran/trans-expr.c b/gcc/fortran/trans-expr.c
index fe10d52..2be4fbc 100644
--- a/gcc/fortran/trans-expr.c
+++ b/gcc/fortran/trans-expr.c
@@ -2403,7 +2403,7 @@
 
 /* Convert a derived type component reference.  */
 
-static void
+void
 gfc_conv_component_ref (gfc_se * se, gfc_ref * ref)
 {
   gfc_component *c;
@@ -2493,7 +2493,7 @@
 
 /* This function deals with component references to components of the
    parent type for derived type extensions.  */
-static void
+void
 conv_parent_component_references (gfc_se * se, gfc_ref * ref)
 {
   gfc_component *c;
@@ -2559,6 +2559,95 @@
   se->expr = res;
 }
 
+/* Transparently dereference VAR if it is a pointer, reference, etc.
+   according to Fortran semantics.  */
+
+tree
+gfc_auto_dereference_var (location_t loc, gfc_symbol *sym, tree var,
+			  bool descriptor_only_p, bool is_classarray)
+{
+  /* Characters are entirely different from other types, they are treated
+     separately.  */
+  if (sym->ts.type == BT_CHARACTER)
+    {
+      /* Dereference character pointer dummy arguments
+	 or results.  */
+      if ((sym->attr.pointer || sym->attr.allocatable)
+	  && (sym->attr.dummy
+	      || sym->attr.function
+	      || sym->attr.result))
+	var = build_fold_indirect_ref_loc (loc, var);
+    }
+  else if (!sym->attr.value)
+    {
+      /* Dereference temporaries for class array dummy arguments.  */
+      if (sym->attr.dummy && is_classarray
+	  && GFC_ARRAY_TYPE_P (TREE_TYPE (var)))
+	{
+	  if (!descriptor_only_p)
+	    var = GFC_DECL_SAVED_DESCRIPTOR (var);
+
+	  var = build_fold_indirect_ref_loc (loc, var);
+	}
+
+      /* Dereference non-character scalar dummy arguments.  */
+      if (sym->attr.dummy && !sym->attr.dimension
+	  && !(sym->attr.codimension && sym->attr.allocatable)
+	  && (sym->ts.type != BT_CLASS
+	      || (!CLASS_DATA (sym)->attr.dimension
+		  && !(CLASS_DATA (sym)->attr.codimension
+		       && CLASS_DATA (sym)->attr.allocatable))))
+	var = build_fold_indirect_ref_loc (loc, var);
+
+      /* Dereference scalar hidden result.  */
+      if (flag_f2c && sym->ts.type == BT_COMPLEX
+	  && (sym->attr.function || sym->attr.result)
+	  && !sym->attr.dimension && !sym->attr.pointer
+	  && !sym->attr.always_explicit)
+	var = build_fold_indirect_ref_loc (loc, var);
+
+      /* Dereference non-character, non-class pointer variables.
+	 These must be dummies, results, or scalars.  */
+      if (!is_classarray
+	  && (sym->attr.pointer || sym->attr.allocatable
+	      || gfc_is_associate_pointer (sym)
+	      || (sym->as && sym->as->type == AS_ASSUMED_RANK))
+	  && (sym->attr.dummy
+	      || sym->attr.function
+	      || sym->attr.result
+	      || (!sym->attr.dimension
+		  && (!sym->attr.codimension || !sym->attr.allocatable))))
+	var = build_fold_indirect_ref_loc (loc, var);
+      /* Now treat the class array pointer variables accordingly.  */
+      else if (sym->ts.type == BT_CLASS
+	       && sym->attr.dummy
+	       && (CLASS_DATA (sym)->attr.dimension
+		   || CLASS_DATA (sym)->attr.codimension)
+	       && ((CLASS_DATA (sym)->as
+		    && CLASS_DATA (sym)->as->type == AS_ASSUMED_RANK)
+		   || CLASS_DATA (sym)->attr.allocatable
+		   || CLASS_DATA (sym)->attr.class_pointer))
+	var = build_fold_indirect_ref_loc (loc, var);
+      /* And the case where a non-dummy, non-result, non-function,
+	 non-allotable and non-pointer classarray is present.  This case was
+	 previously covered by the first if, but with introducing the
+	 condition !is_classarray there, that case has to be covered
+	 explicitly.  */
+      else if (sym->ts.type == BT_CLASS
+	       && !sym->attr.dummy
+	       && !sym->attr.function
+	       && !sym->attr.result
+	       && (CLASS_DATA (sym)->attr.dimension
+		   || CLASS_DATA (sym)->attr.codimension)
+	       && (sym->assoc
+		   || !CLASS_DATA (sym)->attr.allocatable)
+	       && !CLASS_DATA (sym)->attr.class_pointer)
+	var = build_fold_indirect_ref_loc (loc, var);
+    }
+
+  return var;
+}
+
 /* Return the contents of a variable. Also handles reference/pointer
    variables (all Fortran pointer references are implicit).  */
 
@@ -2665,94 +2754,9 @@
 	  return;
 	}
 
-
-      /* Dereference the expression, where needed. Since characters
-	 are entirely different from other types, they are treated
-	 separately.  */
-      if (sym->ts.type == BT_CHARACTER)
-	{
-	  /* Dereference character pointer dummy arguments
-	     or results.  */
-	  if ((sym->attr.pointer || sym->attr.allocatable)
-	      && (sym->attr.dummy
-		  || sym->attr.function
-		  || sym->attr.result))
-	    se->expr = build_fold_indirect_ref_loc (input_location,
-						se->expr);
-
-	}
-      else if (!sym->attr.value)
-	{
-	  /* Dereference temporaries for class array dummy arguments.  */
-	  if (sym->attr.dummy && is_classarray
-	      && GFC_ARRAY_TYPE_P (TREE_TYPE (se->expr)))
-	    {
-	      if (!se->descriptor_only)
-		se->expr = GFC_DECL_SAVED_DESCRIPTOR (se->expr);
-
-	      se->expr = build_fold_indirect_ref_loc (input_location,
-						      se->expr);
-	    }
-
-	  /* Dereference non-character scalar dummy arguments.  */
-	  if (sym->attr.dummy && !sym->attr.dimension
-	      && !(sym->attr.codimension && sym->attr.allocatable)
-	      && (sym->ts.type != BT_CLASS
-		  || (!CLASS_DATA (sym)->attr.dimension
-		      && !(CLASS_DATA (sym)->attr.codimension
-			   && CLASS_DATA (sym)->attr.allocatable))))
-	    se->expr = build_fold_indirect_ref_loc (input_location,
-						se->expr);
-
-          /* Dereference scalar hidden result.  */
-	  if (flag_f2c && sym->ts.type == BT_COMPLEX
-	      && (sym->attr.function || sym->attr.result)
-	      && !sym->attr.dimension && !sym->attr.pointer
-	      && !sym->attr.always_explicit)
-	    se->expr = build_fold_indirect_ref_loc (input_location,
-						se->expr);
-
-	  /* Dereference non-character, non-class pointer variables.
-	     These must be dummies, results, or scalars.  */
-	  if (!is_classarray
-	      && (sym->attr.pointer || sym->attr.allocatable
-		  || gfc_is_associate_pointer (sym)
-		  || (sym->as && sym->as->type == AS_ASSUMED_RANK))
-	      && (sym->attr.dummy
-		  || sym->attr.function
-		  || sym->attr.result
-		  || (!sym->attr.dimension
-		      && (!sym->attr.codimension || !sym->attr.allocatable))))
-	    se->expr = build_fold_indirect_ref_loc (input_location,
-						se->expr);
-	  /* Now treat the class array pointer variables accordingly.  */
-	  else if (sym->ts.type == BT_CLASS
-		   && sym->attr.dummy
-		   && (CLASS_DATA (sym)->attr.dimension
-		       || CLASS_DATA (sym)->attr.codimension)
-		   && ((CLASS_DATA (sym)->as
-			&& CLASS_DATA (sym)->as->type == AS_ASSUMED_RANK)
-		       || CLASS_DATA (sym)->attr.allocatable
-		       || CLASS_DATA (sym)->attr.class_pointer))
-	    se->expr = build_fold_indirect_ref_loc (input_location,
-						se->expr);
-	  /* And the case where a non-dummy, non-result, non-function,
-	     non-allotable and non-pointer classarray is present.  This case was
-	     previously covered by the first if, but with introducing the
-	     condition !is_classarray there, that case has to be covered
-	     explicitly.  */
-	  else if (sym->ts.type == BT_CLASS
-		   && !sym->attr.dummy
-		   && !sym->attr.function
-		   && !sym->attr.result
-		   && (CLASS_DATA (sym)->attr.dimension
-		       || CLASS_DATA (sym)->attr.codimension)
-		   && (sym->assoc
-		       || !CLASS_DATA (sym)->attr.allocatable)
-		   && !CLASS_DATA (sym)->attr.class_pointer)
-	    se->expr = build_fold_indirect_ref_loc (input_location,
-						se->expr);
-	}
+      /* Dereference the expression, where needed.  */
+      se->expr = gfc_auto_dereference_var (input_location, sym, se->expr,
+					   se->descriptor_only, is_classarray);
 
       ref = expr->ref;
     }
diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index 52a2cc8..2e7aec9 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -47,6 +47,25 @@
 
 int ompws_flags;
 
+tree
+gfc_omp_array_data (tree decl)
+{
+  tree type = TREE_TYPE (decl);
+
+  if (TREE_CODE (type) == REFERENCE_TYPE || POINTER_TYPE_P (type))
+    type = TREE_TYPE (type);
+
+  if (!GFC_DESCRIPTOR_TYPE_P (type))
+    return NULL_TREE;
+
+  if (POINTER_TYPE_P (TREE_TYPE (decl)))
+    decl = build_fold_indirect_ref (decl);
+
+  decl = gfc_conv_descriptor_data_get (decl);
+  STRIP_NOPS (decl);
+  return decl;
+}
+
 /* True if OpenMP should privatize what this DECL points to rather
    than the DECL itself.  */
 
@@ -61,6 +80,9 @@
 
   if (TREE_CODE (type) == POINTER_TYPE)
     {
+      while (TREE_CODE (decl) == COMPONENT_REF)
+        decl = TREE_OPERAND (decl, 1);
+
       /* Array POINTER/ALLOCATABLE have aggregate types, all user variables
 	 that have POINTER_TYPE type and aren't scalar pointers, scalar
 	 allocatables, Cray pointees or C pointers are supposed to be
@@ -578,7 +600,8 @@
   stmtblock_t block, cond_block;
 
   gcc_assert (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_FIRSTPRIVATE
-	      || OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_LINEAR);
+	      || OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_LINEAR
+	      || OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_REDUCTION);
 
   if ((! GFC_DESCRIPTOR_TYPE_P (type)
        || GFC_TYPE_ARRAY_AKIND (type) != GFC_ARRAY_ALLOCATABLE)
@@ -1067,6 +1090,62 @@
   return tem;
 }
 
+/* Build a conditional expression in BLOCK.  If COND_VAL is not
+   null, then the block THEN_B is executed, otherwise ELSE_VAL
+   is assigned to VAL.  */
+
+static void
+gfc_build_conditional_assign (stmtblock_t *block,
+			      tree val,
+			      tree cond_val,
+			      tree then_b,
+			      tree else_val)
+{
+  stmtblock_t cond_block;
+  tree cond, else_b;
+  tree val_ty = TREE_TYPE (val);
+
+  gfc_init_block (&cond_block);
+  gfc_add_modify (&cond_block, val, fold_convert (val_ty, else_val));
+  else_b = gfc_finish_block (&cond_block);
+  cond = fold_convert (pvoid_type_node, cond_val);
+  cond = fold_build2_loc (input_location, NE_EXPR,
+			  logical_type_node,
+			  cond, null_pointer_node);
+  gfc_add_expr_to_block (block,
+			 build3_loc (input_location,
+				     COND_EXPR,
+				     void_type_node,
+				     cond, then_b,
+				     else_b));
+}
+
+/* Build a conditional expression in BLOCK, returning a temporary
+   variable containing the result.  If COND_VAL is not null, then
+   THEN_VAL will be assigned to the variable, otherwise ELSE_VAL
+   is assigned.
+ */
+
+static tree
+gfc_build_conditional_assign_expr (stmtblock_t *block,
+				   tree cond_val,
+				   tree then_val,
+				   tree else_val)
+{
+  tree val;
+  tree val_ty = TREE_TYPE (then_val);
+  stmtblock_t cond_block;
+
+  val = create_tmp_var (val_ty);
+
+  gfc_init_block (&cond_block);
+  gfc_add_modify (&cond_block, val, then_val);
+  tree then_b = gfc_finish_block (&cond_block);
+
+  gfc_build_conditional_assign (block, val, cond_val, then_b, else_val);
+
+  return val;
+}
 
 void
 gfc_omp_finish_clause (tree c, gimple_seq *pre_p)
@@ -1076,22 +1155,32 @@
 
   tree decl = OMP_CLAUSE_DECL (c);
 
-  /* Assumed-size arrays can't be mapped implicitly, they have to be
-     mapped explicitly using array sections.  */
-  if (TREE_CODE (decl) == PARM_DECL
+  /* Assumed-size arrays can't be mapped implicitly, they have to be mapped
+     explicitly using array sections.  For OpenACC this restriction is lifted
+     if the array has already been mapped:
+
+       - Using a lexically-enclosing data region: in that case we see the
+         GOMP_MAP_FORCE_PRESENT mapping kind here.
+
+       - Using a non-lexical data mapping ("acc enter data").
+
+     In the latter case we change the mapping type to GOMP_MAP_FORCE_PRESENT.
+     This raises an error for OpenMP in the caller
+     (gimplify.c:gimplify_adjust_omp_clauses_1).  OpenACC will raise a runtime
+     error if the implicitly-referenced assumed-size array is not mapped.  */
+  if (OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_FORCE_PRESENT
+      && TREE_CODE (decl) == PARM_DECL
       && GFC_ARRAY_TYPE_P (TREE_TYPE (decl))
       && GFC_TYPE_ARRAY_AKIND (TREE_TYPE (decl)) == GFC_ARRAY_UNKNOWN
       && GFC_TYPE_ARRAY_UBOUND (TREE_TYPE (decl),
 				GFC_TYPE_ARRAY_RANK (TREE_TYPE (decl)) - 1)
 	 == NULL)
-    {
-      error_at (OMP_CLAUSE_LOCATION (c),
-		"implicit mapping of assumed size array %qD", decl);
-      return;
-    }
+    OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_PRESENT);
 
   tree c2 = NULL_TREE, c3 = NULL_TREE, c4 = NULL_TREE;
-  if (POINTER_TYPE_P (TREE_TYPE (decl)))
+  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FORCE_DEVICEPTR)
+    return;
+  if (DECL_P (decl) && POINTER_TYPE_P (TREE_TYPE (decl)))
     {
       if (!gfc_omp_privatize_by_reference (decl)
 	  && !GFC_DECL_GET_SCALAR_POINTER (decl)
@@ -1124,17 +1213,46 @@
       stmtblock_t block;
       gfc_start_block (&block);
       tree type = TREE_TYPE (decl);
-      tree ptr = gfc_conv_descriptor_data_get (decl);
+      bool optional_arg_p =
+	        TREE_CODE (decl) == INDIRECT_REF
+	      && TREE_CODE (TREE_OPERAND (decl, 0)) == PARM_DECL
+	      && DECL_BY_REFERENCE (TREE_OPERAND (decl, 0))
+	      && TREE_CODE (TREE_TYPE (TREE_OPERAND (decl, 0))) == POINTER_TYPE;
+      tree ptr;
+
+      if (optional_arg_p)
+	ptr = gfc_build_conditional_assign_expr (
+		&block,
+		TREE_OPERAND (decl, 0),
+		gfc_conv_descriptor_data_get (decl),
+		null_pointer_node);
+      else
+	ptr = gfc_conv_descriptor_data_get (decl);
       ptr = fold_convert (build_pointer_type (char_type_node), ptr);
       ptr = build_fold_indirect_ref (ptr);
       OMP_CLAUSE_DECL (c) = ptr;
       c2 = build_omp_clause (input_location, OMP_CLAUSE_MAP);
       OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_TO_PSET);
-      OMP_CLAUSE_DECL (c2) = decl;
+      if (optional_arg_p)
+	{
+	  ptr = create_tmp_var (TREE_TYPE (TREE_OPERAND (decl, 0)));
+	  gfc_add_modify (&block, ptr, TREE_OPERAND (decl, 0));
+
+	  OMP_CLAUSE_DECL (c2) = build_fold_indirect_ref (ptr);
+	}
+      else
+	OMP_CLAUSE_DECL (c2) = decl;
       OMP_CLAUSE_SIZE (c2) = TYPE_SIZE_UNIT (type);
       c3 = build_omp_clause (OMP_CLAUSE_LOCATION (c), OMP_CLAUSE_MAP);
       OMP_CLAUSE_SET_MAP_KIND (c3, GOMP_MAP_POINTER);
-      OMP_CLAUSE_DECL (c3) = gfc_conv_descriptor_data_get (decl);
+      if (optional_arg_p)
+	OMP_CLAUSE_DECL (c3) = gfc_build_conditional_assign_expr (
+		&block,
+		TREE_OPERAND (decl, 0),
+		gfc_conv_descriptor_data_get (decl),
+		null_pointer_node);
+      else
+	OMP_CLAUSE_DECL (c3) = gfc_conv_descriptor_data_get (decl);
       OMP_CLAUSE_SIZE (c3) = size_int (0);
       tree size = create_tmp_var (gfc_array_index_type);
       tree elemsz = TYPE_SIZE_UNIT (gfc_get_element_type (type));
@@ -1165,6 +1283,27 @@
 						     void_type_node, cond,
 						     then_b, else_b));
 	}
+      else if (optional_arg_p)
+	{
+	  stmtblock_t cond_block;
+	  tree then_b;
+
+	  gfc_init_block (&cond_block);
+	  gfc_add_modify (&cond_block, size,
+			  gfc_full_array_size (&cond_block, decl,
+					       GFC_TYPE_ARRAY_RANK (type)));
+	  gfc_add_modify (&cond_block, size,
+			  fold_build2 (MULT_EXPR, gfc_array_index_type,
+				       size, elemsz));
+	  then_b = gfc_finish_block (&cond_block);
+
+	  gfc_build_conditional_assign (
+		  &block,
+		  size,
+		  TREE_OPERAND (decl, 0),
+		  then_b,
+		  build_int_cst (gfc_array_index_type, 0));
+	}
       else
 	{
 	  gfc_add_modify (&block, size,
@@ -1312,7 +1451,6 @@
     }
 }
 
-
 static inline tree
 gfc_trans_add_clause (tree node, tree tail)
 {
@@ -1822,6 +1960,92 @@
 
 static vec<tree, va_heap, vl_embed> *doacross_steps;
 
+
+/* Translate an array section or array element.  */
+
+static void
+gfc_trans_omp_array_section (stmtblock_t *block, gfc_omp_namelist *n,
+			     tree decl, bool element, gomp_map_kind ptr_kind,
+			     tree node, tree &node2, tree &node3, tree &node4)
+{
+  gfc_se se;
+  tree ptr, ptr2;
+
+  gfc_init_se (&se, NULL);
+
+  if (element)
+    {
+      gfc_conv_expr_reference (&se, n->expr);
+      gfc_add_block_to_block (block, &se.pre);
+      ptr = se.expr;
+      OMP_CLAUSE_SIZE (node)
+	= TYPE_SIZE_UNIT (TREE_TYPE (ptr));
+    }
+  else
+    {
+      gfc_conv_expr_descriptor (&se, n->expr);
+      ptr = gfc_conv_array_data (se.expr);
+      tree type = TREE_TYPE (se.expr);
+      gfc_add_block_to_block (block, &se.pre);
+      OMP_CLAUSE_SIZE (node) = gfc_full_array_size (block, se.expr,
+						    GFC_TYPE_ARRAY_RANK (type));
+      tree elemsz = TYPE_SIZE_UNIT (gfc_get_element_type (type));
+      elemsz = fold_convert (gfc_array_index_type, elemsz);
+      OMP_CLAUSE_SIZE (node) = fold_build2 (MULT_EXPR, gfc_array_index_type,
+					    OMP_CLAUSE_SIZE (node), elemsz);
+    }
+  gfc_add_block_to_block (block, &se.post);
+  ptr = fold_convert (build_pointer_type (char_type_node), ptr);
+  OMP_CLAUSE_DECL (node) = build_fold_indirect_ref (ptr);
+
+  if (POINTER_TYPE_P (TREE_TYPE (decl))
+      && GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (TREE_TYPE (decl)))
+      && ptr_kind == GOMP_MAP_POINTER)
+    {
+      node4 = build_omp_clause (input_location,
+				OMP_CLAUSE_MAP);
+      OMP_CLAUSE_SET_MAP_KIND (node4, GOMP_MAP_POINTER);
+      OMP_CLAUSE_DECL (node4) = decl;
+      OMP_CLAUSE_SIZE (node4) = size_int (0);
+      decl = build_fold_indirect_ref (decl);
+    }
+  ptr = fold_convert (sizetype, ptr);
+  if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (decl)))
+    {
+      tree type = TREE_TYPE (decl);
+      ptr2 = gfc_conv_descriptor_data_get (decl);
+      node2 = build_omp_clause (input_location,
+				OMP_CLAUSE_MAP);
+      OMP_CLAUSE_SET_MAP_KIND (node2, GOMP_MAP_TO_PSET);
+      OMP_CLAUSE_DECL (node2) = decl;
+      OMP_CLAUSE_SIZE (node2) = TYPE_SIZE_UNIT (type);
+      node3 = build_omp_clause (input_location,
+				OMP_CLAUSE_MAP);
+      OMP_CLAUSE_SET_MAP_KIND (node3, ptr_kind);
+      OMP_CLAUSE_DECL (node3)
+	= gfc_conv_descriptor_data_get (decl);
+      if (ptr_kind == GOMP_MAP_ATTACH_DETACH)
+        STRIP_NOPS (OMP_CLAUSE_DECL (node3));
+    }
+  else
+    {
+      if (TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE)
+	ptr2 = build_fold_addr_expr (decl);
+      else
+	{
+	  gcc_assert (POINTER_TYPE_P (TREE_TYPE (decl)));
+	  ptr2 = decl;
+	}
+      node3 = build_omp_clause (input_location,
+				OMP_CLAUSE_MAP);
+      OMP_CLAUSE_SET_MAP_KIND (node3, ptr_kind);
+      OMP_CLAUSE_DECL (node3) = decl;
+    }
+  ptr2 = fold_convert (sizetype, ptr2);
+  OMP_CLAUSE_SIZE (node3)
+    = fold_build2 (MINUS_EXPR, sizetype, ptr, ptr2);
+}
+
 static tree
 gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses,
 		       locus where, bool declare_simd = false)
@@ -2075,7 +2299,7 @@
 	      tree node = build_omp_clause (input_location, OMP_CLAUSE_DEPEND);
 	      if (n->expr == NULL || n->expr->ref->u.ar.type == AR_FULL)
 		{
-		  tree decl = gfc_get_symbol_decl (n->sym);
+		  tree decl = gfc_trans_omp_variable (n->sym, false);
 		  if (gfc_omp_privatize_by_reference (decl))
 		    decl = build_fold_indirect_ref (decl);
 		  if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (decl)))
@@ -2136,23 +2360,68 @@
 	      tree node2 = NULL_TREE;
 	      tree node3 = NULL_TREE;
 	      tree node4 = NULL_TREE;
-	      tree decl = gfc_get_symbol_decl (n->sym);
+	      tree decl = gfc_trans_omp_variable (n->sym, false);
 	      if (DECL_P (decl))
 		TREE_ADDRESSABLE (decl) = 1;
+
 	      if (n->expr == NULL || n->expr->ref->u.ar.type == AR_FULL)
 		{
 		  if (POINTER_TYPE_P (TREE_TYPE (decl))
-		      && (gfc_omp_privatize_by_reference (decl)
-			  || GFC_DECL_GET_SCALAR_POINTER (decl)
-			  || GFC_DECL_GET_SCALAR_ALLOCATABLE (decl)
-			  || GFC_DECL_CRAY_POINTEE (decl)
-			  || GFC_DESCRIPTOR_TYPE_P
-					(TREE_TYPE (TREE_TYPE (decl)))))
+		      && n->u.map_op == OMP_MAP_FORCE_DEVICEPTR)
+		    {
+		      OMP_CLAUSE_DECL (node) = decl;
+		      goto finalize_map_clause;
+		    }
+		  else if (n->sym->ts.type == BT_CLASS)
+		    {
+		      tree type = TREE_TYPE (decl);
+		      if (n->sym->attr.optional)
+		        sorry ("optional class parameter");
+		      if (POINTER_TYPE_P (type))
+		        {
+			  node4 = build_omp_clause (input_location,
+						    OMP_CLAUSE_MAP);
+			  OMP_CLAUSE_SET_MAP_KIND (node4, GOMP_MAP_POINTER);
+			  OMP_CLAUSE_DECL (node4) = decl;
+			  OMP_CLAUSE_SIZE (node4) = size_int (0);
+			  decl = build_fold_indirect_ref (decl);
+			}
+		      tree ptr = gfc_class_data_get (decl);
+		      ptr = build_fold_indirect_ref (ptr);
+		      OMP_CLAUSE_DECL (node) = ptr;
+		      OMP_CLAUSE_SIZE (node) = gfc_class_vtab_size_get (decl);
+		      node2 = build_omp_clause (input_location, OMP_CLAUSE_MAP);
+		      OMP_CLAUSE_SET_MAP_KIND (node2, GOMP_MAP_TO_PSET);
+		      OMP_CLAUSE_DECL (node2) = decl;
+		      OMP_CLAUSE_SIZE (node2) = TYPE_SIZE_UNIT (type);
+		      node3 = build_omp_clause (input_location, OMP_CLAUSE_MAP);
+		      OMP_CLAUSE_SET_MAP_KIND (node3, GOMP_MAP_ATTACH_DETACH);
+		      OMP_CLAUSE_DECL (node3) = gfc_class_data_get (decl);
+		      OMP_CLAUSE_SIZE (node3) = size_int (0);
+		      goto finalize_map_clause;
+		    }
+		  else if (POINTER_TYPE_P (TREE_TYPE (decl))
+			   && (gfc_omp_privatize_by_reference (decl)
+			       || GFC_DECL_GET_SCALAR_POINTER (decl)
+			       || GFC_DECL_GET_SCALAR_ALLOCATABLE (decl)
+			       || GFC_DECL_CRAY_POINTEE (decl)
+			       || GFC_DESCRIPTOR_TYPE_P
+					     (TREE_TYPE (TREE_TYPE (decl)))
+			       || n->sym->ts.type == BT_DERIVED))
 		    {
 		      tree orig_decl = decl;
+		      enum gomp_map_kind gmk = GOMP_MAP_POINTER;
+		      if (GFC_DECL_GET_SCALAR_ALLOCATABLE (decl)
+			  && n->sym->attr.oacc_declare_create)
+			{
+			  if (clauses->update_allocatable)
+			    gmk = GOMP_MAP_ALWAYS_POINTER;
+			  else
+			    gmk = GOMP_MAP_FIRSTPRIVATE_POINTER;
+			}
 		      node4 = build_omp_clause (input_location,
 						OMP_CLAUSE_MAP);
-		      OMP_CLAUSE_SET_MAP_KIND (node4, GOMP_MAP_POINTER);
+		      OMP_CLAUSE_SET_MAP_KIND (node4, gmk);
 		      OMP_CLAUSE_DECL (node4) = decl;
 		      OMP_CLAUSE_SIZE (node4) = size_int (0);
 		      decl = build_fold_indirect_ref (decl);
@@ -2168,10 +2437,22 @@
 			  decl = build_fold_indirect_ref (decl);
 			}
 		    }
-		  if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (decl)))
+		  if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (decl))
+		      && n->u.map_op != OMP_MAP_ATTACH
+		      && n->u.map_op != OMP_MAP_DETACH)
 		    {
 		      tree type = TREE_TYPE (decl);
-		      tree ptr = gfc_conv_descriptor_data_get (decl);
+		      tree ptr;
+
+		      if (n->sym->attr.optional)
+			ptr = gfc_build_conditional_assign_expr (
+				block,
+				TREE_OPERAND (decl, 0),
+				gfc_conv_descriptor_data_get (decl),
+				null_pointer_node);
+		      else
+			ptr = gfc_conv_descriptor_data_get (decl);
+
 		      ptr = fold_convert (build_pointer_type (char_type_node),
 					  ptr);
 		      ptr = build_fold_indirect_ref (ptr);
@@ -2190,34 +2471,30 @@
 
 		      /* We have to check for n->sym->attr.dimension because
 			 of scalar coarrays.  */
-		      if (n->sym->attr.pointer && n->sym->attr.dimension)
+		      if ((n->sym->attr.pointer || n->sym->attr.optional)
+			  && n->sym->attr.dimension)
 			{
 			  stmtblock_t cond_block;
 			  tree size
 			    = gfc_create_var (gfc_array_index_type, NULL);
-			  tree tem, then_b, else_b, zero, cond;
+			  tree cond = n->sym->attr.optional
+			      ? TREE_OPERAND (decl, 0)
+			      : gfc_conv_descriptor_data_get (decl);
 
 			  gfc_init_block (&cond_block);
-			  tem
-			    = gfc_full_array_size (&cond_block, decl,
-						   GFC_TYPE_ARRAY_RANK (type));
-			  gfc_add_modify (&cond_block, size, tem);
-			  then_b = gfc_finish_block (&cond_block);
-			  gfc_init_block (&cond_block);
-			  zero = build_int_cst (gfc_array_index_type, 0);
-			  gfc_add_modify (&cond_block, size, zero);
-			  else_b = gfc_finish_block (&cond_block);
-			  tem = gfc_conv_descriptor_data_get (decl);
-			  tem = fold_convert (pvoid_type_node, tem);
-			  cond = fold_build2_loc (input_location, NE_EXPR,
-						  logical_type_node,
-						  tem, null_pointer_node);
-			  gfc_add_expr_to_block (block,
-						 build3_loc (input_location,
-							     COND_EXPR,
-							     void_type_node,
-							     cond, then_b,
-							     else_b));
+			  gfc_add_modify (&cond_block, size,
+					  gfc_full_array_size (
+					      &cond_block, decl,
+					      GFC_TYPE_ARRAY_RANK (type)));
+			  tree then_b = gfc_finish_block (&cond_block);
+
+			  gfc_build_conditional_assign (
+				  block,
+				  size,
+				  cond,
+				  then_b,
+				  build_int_cst (gfc_array_index_type, 0));
+
 			  OMP_CLAUSE_SIZE (node) = size;
 			}
 		      else if (n->sym->attr.dimension)
@@ -2237,88 +2514,172 @@
 		  else
 		    OMP_CLAUSE_DECL (node) = decl;
 		}
-	      else
+	      else if (n->expr
+		       && n->expr->expr_type == EXPR_VARIABLE
+		       && n->expr->ref->type == REF_COMPONENT)
 		{
-		  tree ptr, ptr2;
-		  gfc_init_se (&se, NULL);
-		  if (n->expr->ref->u.ar.type == AR_ELEMENT)
-		    {
-		      gfc_conv_expr_reference (&se, n->expr);
-		      gfc_add_block_to_block (block, &se.pre);
-		      ptr = se.expr;
-		      OMP_CLAUSE_SIZE (node)
-			= TYPE_SIZE_UNIT (TREE_TYPE (ptr));
-		    }
-		  else
-		    {
-		      gfc_conv_expr_descriptor (&se, n->expr);
-		      ptr = gfc_conv_array_data (se.expr);
-		      tree type = TREE_TYPE (se.expr);
-		      gfc_add_block_to_block (block, &se.pre);
-		      OMP_CLAUSE_SIZE (node)
-			= gfc_full_array_size (block, se.expr,
-					       GFC_TYPE_ARRAY_RANK (type));
-		      tree elemsz
-			= TYPE_SIZE_UNIT (gfc_get_element_type (type));
-		      elemsz = fold_convert (gfc_array_index_type, elemsz);
-		      OMP_CLAUSE_SIZE (node)
-			= fold_build2 (MULT_EXPR, gfc_array_index_type,
-				       OMP_CLAUSE_SIZE (node), elemsz);
-		    }
-		  gfc_add_block_to_block (block, &se.post);
-		  ptr = fold_convert (build_pointer_type (char_type_node),
-				      ptr);
-		  OMP_CLAUSE_DECL (node) = build_fold_indirect_ref (ptr);
+		  gfc_ref *lastcomp;
 
-		  if (POINTER_TYPE_P (TREE_TYPE (decl))
-		      && GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
-		    {
-		      node4 = build_omp_clause (input_location,
-						OMP_CLAUSE_MAP);
-		      OMP_CLAUSE_SET_MAP_KIND (node4, GOMP_MAP_POINTER);
-		      OMP_CLAUSE_DECL (node4) = decl;
-		      OMP_CLAUSE_SIZE (node4) = size_int (0);
-		      decl = build_fold_indirect_ref (decl);
-		    }
-		  ptr = fold_convert (sizetype, ptr);
-		  if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (decl)))
-		    {
-		      tree type = TREE_TYPE (decl);
-		      ptr2 = gfc_conv_descriptor_data_get (decl);
-		      node2 = build_omp_clause (input_location,
-						OMP_CLAUSE_MAP);
-		      OMP_CLAUSE_SET_MAP_KIND (node2, GOMP_MAP_TO_PSET);
-		      OMP_CLAUSE_DECL (node2) = decl;
-		      OMP_CLAUSE_SIZE (node2) = TYPE_SIZE_UNIT (type);
-		      node3 = build_omp_clause (input_location,
-						OMP_CLAUSE_MAP);
-		      OMP_CLAUSE_SET_MAP_KIND (node3, GOMP_MAP_POINTER);
-		      OMP_CLAUSE_DECL (node3)
-			= gfc_conv_descriptor_data_get (decl);
-		    }
+		  for (gfc_ref *ref = n->expr->ref; ref; ref = ref->next)
+		    if (ref->type == REF_COMPONENT)
+		      lastcomp = ref;
+
+		  symbol_attribute sym_attr;
+
+		  if (lastcomp->u.c.component->ts.type == BT_CLASS)
+		    sym_attr = CLASS_DATA (lastcomp->u.c.component)->attr;
 		  else
+		    sym_attr = lastcomp->u.c.component->attr;
+
+		  gfc_init_se (&se, NULL);
+
+		  if (!sym_attr.dimension
+		      && lastcomp->u.c.component->ts.type != BT_CLASS
+		      && lastcomp->u.c.component->ts.type != BT_DERIVED)
 		    {
-		      if (TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE)
-			ptr2 = build_fold_addr_expr (decl);
-		      else
-			{
-			  gcc_assert (POINTER_TYPE_P (TREE_TYPE (decl)));
-			  ptr2 = decl;
-			}
-		      node3 = build_omp_clause (input_location,
-						OMP_CLAUSE_MAP);
-		      OMP_CLAUSE_SET_MAP_KIND (node3, GOMP_MAP_POINTER);
-		      OMP_CLAUSE_DECL (node3) = decl;
+		      /* Last component is a scalar.  */
+		      gfc_conv_expr (&se, n->expr);
+		      gfc_add_block_to_block (block, &se.pre);
+		      OMP_CLAUSE_DECL (node) = se.expr;
+		      gfc_add_block_to_block (block, &se.post);
+		      goto finalize_map_clause;
 		    }
-		  ptr2 = fold_convert (sizetype, ptr2);
-		  OMP_CLAUSE_SIZE (node3)
-		    = fold_build2 (MINUS_EXPR, sizetype, ptr, ptr2);
+
+		  se.expr
+		    = gfc_auto_dereference_var (input_location, n->sym,
+						decl);
+
+		  for (gfc_ref *ref = n->expr->ref;
+		       ref && ref != lastcomp->next;
+		       ref = ref->next)
+		    {
+		      if (ref->type == REF_COMPONENT)
+			{
+			  if (ref->u.c.sym->attr.extension)
+		            conv_parent_component_references (&se, ref);
+
+			  gfc_conv_component_ref (&se, ref);
+			}
+		      else
+			sorry ("unhandled derived-type component");
+		    }
+
+		  tree inner = se.expr;
+
+		  /* Last component is a derived type or class pointer.  */
+		  if (lastcomp->u.c.component->ts.type == BT_DERIVED
+		      || lastcomp->u.c.component->ts.type == BT_CLASS)
+		    {
+		      if (sym_attr.allocatable || sym_attr.pointer)
+		        {
+			  tree data, size;
+
+			  if (lastcomp->u.c.component->ts.type == BT_CLASS)
+			    {
+			      data = gfc_class_data_get (inner);
+			      size = gfc_class_vtab_size_get (inner);
+			    }
+			  else  /* BT_DERIVED.  */
+			    {
+			      data = inner;
+			      size = TYPE_SIZE_UNIT (TREE_TYPE (inner));
+			    }
+
+			  OMP_CLAUSE_DECL (node)
+			    = build_fold_indirect_ref (data);
+			  OMP_CLAUSE_SIZE (node) = size;
+			  node2 = build_omp_clause (input_location,
+						    OMP_CLAUSE_MAP);
+			  OMP_CLAUSE_SET_MAP_KIND (node2,
+						   GOMP_MAP_ATTACH_DETACH);
+			  OMP_CLAUSE_DECL (node2) = data;
+			  OMP_CLAUSE_SIZE (node2) = size_int (0);
+			}
+		      else
+		        {
+		          OMP_CLAUSE_DECL (node) = decl;
+			  OMP_CLAUSE_SIZE (node)
+			    = TYPE_SIZE_UNIT (TREE_TYPE (decl));
+			}
+		    }
+		  else if (lastcomp->next
+			   && lastcomp->next->type == REF_ARRAY
+			   && lastcomp->next->u.ar.type == AR_FULL)
+		    {
+		      /* Just pass the (auto-dereferenced) decl through for
+			 bare attach and detach clauses.  */
+		      if (n->u.map_op == OMP_MAP_ATTACH
+			  || n->u.map_op == OMP_MAP_DETACH)
+			{
+			  OMP_CLAUSE_DECL (node) = inner;
+			  OMP_CLAUSE_SIZE (node) = size_zero_node;
+			  goto finalize_map_clause;
+			}
+
+		      if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (inner)))
+		        {
+			  tree type = TREE_TYPE (inner);
+			  tree ptr = gfc_conv_descriptor_data_get (inner);
+			  ptr = build_fold_indirect_ref (ptr);
+			  OMP_CLAUSE_DECL (node) = ptr;
+			  node2 = build_omp_clause (input_location,
+						    OMP_CLAUSE_MAP);
+			  OMP_CLAUSE_SET_MAP_KIND (node2, GOMP_MAP_TO_PSET);
+			  OMP_CLAUSE_DECL (node2) = inner;
+			  OMP_CLAUSE_SIZE (node2) = TYPE_SIZE_UNIT (type);
+			  node3 = build_omp_clause (input_location,
+						    OMP_CLAUSE_MAP);
+			  OMP_CLAUSE_SET_MAP_KIND (node3,
+						   GOMP_MAP_ATTACH_DETACH);
+			  OMP_CLAUSE_DECL (node3)
+			    = gfc_conv_descriptor_data_get (inner);
+		          STRIP_NOPS (OMP_CLAUSE_DECL (node3));
+			  OMP_CLAUSE_SIZE (node3) = size_int (0);
+			  int rank = GFC_TYPE_ARRAY_RANK (type);
+			  OMP_CLAUSE_SIZE (node)
+			    = gfc_full_array_size (block, inner, rank);
+			  tree elemsz
+			    = TYPE_SIZE_UNIT (gfc_get_element_type (type));
+			  elemsz = fold_convert (gfc_array_index_type, elemsz);
+			  OMP_CLAUSE_SIZE (node)
+			    = fold_build2 (MULT_EXPR, gfc_array_index_type,
+					   OMP_CLAUSE_SIZE (node), elemsz);
+			}
+		      else
+		        OMP_CLAUSE_DECL (node) = inner;
+		    }
+		  else  /* An array element or section.  */
+		    {
+		      bool element
+			= (lastcomp->next
+			   && lastcomp->next->type == REF_ARRAY
+			   && lastcomp->next->u.ar.type == AR_ELEMENT);
+
+		      gfc_trans_omp_array_section (block, n, inner, element,
+						   GOMP_MAP_ATTACH_DETACH,
+						   node, node2, node3, node4);
+		    }
 		}
+	      else  /* An array element or array section.  */
+	        {
+		  bool element = n->expr->ref->u.ar.type == AR_ELEMENT;
+		  gfc_trans_omp_array_section (block, n, decl, element,
+					       GOMP_MAP_POINTER, node, node2,
+					       node3, node4);
+		}
+
+	      finalize_map_clause:
 	      switch (n->u.map_op)
 		{
 		case OMP_MAP_ALLOC:
 		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_ALLOC);
 		  break;
+		case OMP_MAP_NO_ALLOC:
+		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_NO_ALLOC);
+		  break;
+		case OMP_MAP_ATTACH:
+		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_ATTACH);
+		  break;
 		case OMP_MAP_TO:
 		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_TO);
 		  break;
@@ -2343,6 +2704,9 @@
 		case OMP_MAP_DELETE:
 		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_DELETE);
 		  break;
+		case OMP_MAP_DETACH:
+		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_DETACH);
+		  break;
 		case OMP_MAP_FORCE_ALLOC:
 		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_FORCE_ALLOC);
 		  break;
@@ -2361,6 +2725,12 @@
 		case OMP_MAP_FORCE_DEVICEPTR:
 		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_FORCE_DEVICEPTR);
 		  break;
+		case OMP_MAP_DECLARE_ALLOCATE:
+		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_DECLARE_ALLOCATE);
+		  break;
+		case OMP_MAP_DECLARE_DEALLOCATE:
+		  OMP_CLAUSE_SET_MAP_KIND (node, GOMP_MAP_DECLARE_DEALLOCATE);
+		  break;
 		default:
 		  gcc_unreachable ();
 		}
@@ -2398,7 +2768,7 @@
 	      tree node = build_omp_clause (input_location, clause_code);
 	      if (n->expr == NULL || n->expr->ref->u.ar.type == AR_FULL)
 		{
-		  tree decl = gfc_get_symbol_decl (n->sym);
+		  tree decl = gfc_trans_omp_variable (n->sym, false);
 		  if (gfc_omp_privatize_by_reference (decl))
 		    decl = build_fold_indirect_ref (decl);
 		  else if (DECL_P (decl))
@@ -3044,6 +3414,13 @@
 	  OMP_CLAUSE_GANG_STATIC_EXPR (c) = arg;
 	}
     }
+  if (clauses->nohost)
+    {
+      c = build_omp_clause (where.lb->location, OMP_CLAUSE_NOHOST);
+      omp_clauses = gfc_trans_add_clause (c, omp_clauses);
+      //TODO
+      gcc_unreachable();
+    }
 
   return nreverse (omp_clauses);
 }
@@ -3073,7 +3450,7 @@
 }
 
 /* Trans OpenACC directives. */
-/* parallel, kernels, data and host_data. */
+/* parallel, serial, kernels, data and host_data. */
 static tree
 gfc_trans_oacc_construct (gfc_code *code)
 {
@@ -3089,6 +3466,9 @@
       case EXEC_OACC_KERNELS:
 	construct_code = OACC_KERNELS;
 	break;
+      case EXEC_OACC_SERIAL:
+	construct_code = OACC_SERIAL;
+	break;
       case EXEC_OACC_DATA:
 	construct_code = OACC_DATA;
 	break;
@@ -3115,12 +3495,14 @@
 {
   stmtblock_t block;
   tree stmt, oacc_clauses;
+  gfc_omp_clauses *clauses = code->ext.omp_clauses;
   enum tree_code construct_code;
 
   switch (code->op)
     {
       case EXEC_OACC_UPDATE:
 	construct_code = OACC_UPDATE;
+	clauses->update_allocatable = 1;
 	break;
       case EXEC_OACC_ENTER_DATA:
 	construct_code = OACC_ENTER_DATA;
@@ -3136,8 +3518,7 @@
     }
 
   gfc_start_block (&block);
-  oacc_clauses = gfc_trans_omp_clauses (&block, code->ext.omp_clauses,
-					code->loc);
+  oacc_clauses = gfc_trans_omp_clauses (&block, clauses, code->loc);
   stmt = build1_loc (input_location, construct_code, void_type_node, 
 		     oacc_clauses);
   gfc_add_expr_to_block (&block, stmt);
@@ -3895,7 +4276,8 @@
   return gfc_finish_block (&block);
 }
 
-/* parallel loop and kernels loop. */
+/* Combined OpenACC parallel loop, kernels loop and serial loop. */
+
 static tree
 gfc_trans_oacc_combined_directive (gfc_code *code)
 {
@@ -3913,6 +4295,9 @@
       case EXEC_OACC_KERNELS_LOOP:
 	construct_code = OACC_KERNELS;
 	break;
+      case EXEC_OACC_SERIAL_LOOP:
+	construct_code = OACC_SERIAL;
+	break;
       default:
 	gcc_unreachable ();
     }
@@ -5139,6 +5524,41 @@
   return gfc_finish_block (&block);
 }
 
+/* Create an OpenACC enter or exit data construct for an OpenACC declared
+   variable that has been allocated or deallocated.  */
+
+tree
+gfc_trans_oacc_declare_allocate (stmtblock_t *block, gfc_expr *expr,
+				 bool allocate)
+{
+  gfc_omp_clauses *clauses = gfc_get_omp_clauses ();
+  gfc_omp_namelist *p = gfc_get_omp_namelist ();
+  tree oacc_clauses, stmt;
+  enum tree_code construct_code;
+
+  p->sym = expr->symtree->n.sym;
+  p->where = expr->where;
+
+  if (allocate)
+    {
+      p->u.map_op = OMP_MAP_DECLARE_ALLOCATE;
+      construct_code = OACC_ENTER_DATA;
+    }
+  else
+    {
+      p->u.map_op = OMP_MAP_DECLARE_DEALLOCATE;
+      construct_code = OACC_EXIT_DATA;
+    }
+  clauses->lists[OMP_LIST_MAP] = p;
+
+  oacc_clauses = gfc_trans_omp_clauses (block, clauses, expr->where);
+  stmt = build1_loc (input_location, construct_code, void_type_node,
+		     oacc_clauses);
+  gfc_add_expr_to_block (block, stmt);
+
+  return stmt;
+}
+
 tree
 gfc_trans_oacc_directive (gfc_code *code)
 {
@@ -5146,9 +5566,11 @@
     {
     case EXEC_OACC_PARALLEL_LOOP:
     case EXEC_OACC_KERNELS_LOOP:
+    case EXEC_OACC_SERIAL_LOOP:
       return gfc_trans_oacc_combined_directive (code);
     case EXEC_OACC_PARALLEL:
     case EXEC_OACC_KERNELS:
+    case EXEC_OACC_SERIAL:
     case EXEC_OACC_DATA:
     case EXEC_OACC_HOST_DATA:
       return gfc_trans_oacc_construct (code);
diff --git a/gcc/fortran/trans-stmt.c b/gcc/fortran/trans-stmt.c
index b839d6c..0314937 100644
--- a/gcc/fortran/trans-stmt.c
+++ b/gcc/fortran/trans-stmt.c
@@ -6476,6 +6476,10 @@
 				      label_finish, expr, 0);
 	  else
 	    gfc_allocate_using_malloc (&se.pre, se.expr, memsz, stat);
+
+	  /* Allocate memory for OpenACC declared variables.  */
+	  if (expr->symtree->n.sym->attr.oacc_declare_create)
+	    gfc_trans_oacc_declare_allocate (&se.pre, expr, true);
 	}
       else
 	{
@@ -6948,6 +6952,10 @@
 
 	  if (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (se.expr)))
 	    {
+	      if (!is_coarray
+		  && expr->symtree->n.sym->attr.oacc_declare_create)
+		gfc_trans_oacc_declare_allocate (&se.pre, expr, false);
+
 	      gfc_coarray_deregtype caf_dtype;
 
 	      if (is_coarray)
@@ -7001,6 +7009,10 @@
 	}
       else
 	{
+	  /* Deallocate memory for OpenACC declared variables.  */
+	  if (expr->symtree->n.sym->attr.oacc_declare_create)
+	    gfc_trans_oacc_declare_allocate (&se.pre, expr, false);
+
 	  tmp = gfc_deallocate_scalar_with_status (se.expr, pstat, label_finish,
 						   false, al->expr,
 						   al->expr->ts, is_coarray);
diff --git a/gcc/fortran/trans.c b/gcc/fortran/trans.c
index 167fcb2..794fd02 100644
--- a/gcc/fortran/trans.c
+++ b/gcc/fortran/trans.c
@@ -2121,6 +2121,8 @@
 	case EXEC_OACC_KERNELS_LOOP:
 	case EXEC_OACC_PARALLEL:
 	case EXEC_OACC_PARALLEL_LOOP:
+	case EXEC_OACC_SERIAL:
+	case EXEC_OACC_SERIAL_LOOP:
 	case EXEC_OACC_ENTER_DATA:
 	case EXEC_OACC_EXIT_DATA:
 	case EXEC_OACC_ATOMIC:
diff --git a/gcc/fortran/trans.h b/gcc/fortran/trans.h
index f6af09f..84dc581 100644
--- a/gcc/fortran/trans.h
+++ b/gcc/fortran/trans.h
@@ -550,6 +550,15 @@
 /* Convert a missing, dummy argument into a null or zero.  */
 void gfc_conv_missing_dummy (gfc_se *, gfc_expr *, gfc_typespec, int);
 
+/* Lowering of component references.  */
+void gfc_conv_component_ref (gfc_se * se, gfc_ref * ref);
+void conv_parent_component_references (gfc_se * se, gfc_ref * ref);
+
+/* Automatically dereference var.  */
+tree gfc_auto_dereference_var (location_t, gfc_symbol *, tree,
+			       bool desc_only = false,
+			       bool is_classarray = false);
+
 /* Generate code to allocate a string temporary.  */
 tree gfc_conv_string_tmp (gfc_se *, tree, tree);
 /* Get the string length variable belonging to an expression.  */
@@ -771,6 +780,7 @@
 bool gfc_get_array_descr_info (const_tree, struct array_descr_info *);
 
 /* In trans-openmp.c */
+tree gfc_omp_array_data (tree);
 bool gfc_omp_privatize_by_reference (const_tree);
 enum omp_clause_default_kind gfc_omp_predetermined_sharing (tree);
 tree gfc_omp_report_decl (tree);
@@ -786,6 +796,7 @@
 bool gfc_omp_private_outer_ref (tree);
 struct gimplify_omp_ctx;
 void gfc_omp_firstprivatize_type_sizes (struct gimplify_omp_ctx *, tree);
+tree gfc_trans_oacc_declare_allocate (stmtblock_t *, gfc_expr *, bool);
 
 /* Runtime library function decls.  */
 extern GTY(()) tree gfor_fndecl_pause_numeric;
diff --git a/gcc/gcc.c b/gcc/gcc.c
index 4f57765..940e87f 100644
--- a/gcc/gcc.c
+++ b/gcc/gcc.c
@@ -409,6 +409,7 @@
 static const char *greater_than_spec_func (int, const char **);
 static const char *debug_level_greater_than_spec_func (int, const char **);
 static const char *find_fortran_preinclude_file (int, const char **);
+static const char *add_omp_infile_spec_func (int, const char **);
 static char *convert_white_space (char *);
 
 /* The Specs Language
@@ -1222,6 +1223,9 @@
 
 static const char *const driver_self_specs[] = {
   "%{fdump-final-insns:-fdump-final-insns=.} %<fdump-final-insns",
+  /* If linking against libgomp, add a setup file.  */
+  "%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1):" \
+  "%:add-omp-infile()}",
   DRIVER_SELF_SPECS, CONFIGURE_SPECS, GOMP_SELF_SPECS, GTM_SELF_SPECS
 };
 
@@ -1649,6 +1653,7 @@
   { "gt",			greater_than_spec_func },
   { "debug-level-gt",		debug_level_greater_than_spec_func },
   { "fortran-preinclude-file",	find_fortran_preinclude_file},
+  { "add-omp-infile",		add_omp_infile_spec_func },
 #ifdef EXTRA_SPEC_FUNCTIONS
   EXTRA_SPEC_FUNCTIONS
 #endif
@@ -3389,7 +3394,8 @@
    The `validated' field describes whether any spec has looked at this switch;
    if it remains false at the end of the run, the switch must be meaningless.
    The `ordering' field is used to temporarily mark switches that have to be
-   kept in a specific order.  */
+   kept in a specific order.
+   The `lang_mask' field stores the flags associated with this option.  */
 
 #define SWITCH_LIVE    			(1 << 0)
 #define SWITCH_FALSE   			(1 << 1)
@@ -3405,6 +3411,7 @@
   bool known;
   bool validated;
   bool ordering;
+  unsigned int lang_mask;
 };
 
 static struct switchstr *switches;
@@ -3413,6 +3420,10 @@
 
 static int n_switches_alloc;
 
+/* If nonzero, do not pass through switches for languages not matching
+   this mask.  */
+static unsigned int spec_lang_mask_accept;
+
 /* Set to zero if -fcompare-debug is disabled, positive if it's
    enabled and we're running the first compilation, negative if it's
    enabled and we're running the second compilation.  For most of the
@@ -3450,6 +3461,7 @@
   const char *name;
   const char *language;
   struct compiler *incompiler;
+  unsigned int lang_mask;
   bool compiled;
   bool preprocessed;
 };
@@ -3649,15 +3661,16 @@
     }
 }
 
-/* Store an input file with the given NAME and LANGUAGE in
+/* Store an input file with the given NAME and LANGUAGE and LANG_MASK in
    infiles.  */
 
 static void
-add_infile (const char *name, const char *language)
+add_infile (const char *name, const char *language, unsigned int lang_mask)
 {
   alloc_infile ();
   infiles[n_infiles].name = name;
-  infiles[n_infiles++].language = language;
+  infiles[n_infiles].language = language;
+  infiles[n_infiles++].lang_mask = lang_mask;
 }
 
 /* Allocate space for a switch in switches.  */
@@ -3678,11 +3691,12 @@
 }
 
 /* Save an option OPT with N_ARGS arguments in array ARGS, marking it
-   as validated if VALIDATED and KNOWN if it is an internal switch.  */
+   as validated if VALIDATED and KNOWN if it is an internal switch.
+   LANG_MASK is the flags associated with this option.  */
 
 static void
 save_switch (const char *opt, size_t n_args, const char *const *args,
-	     bool validated, bool known)
+	     bool validated, bool known, unsigned int lang_mask)
 {
   alloc_switch ();
   switches[n_switches].part1 = opt + 1;
@@ -3699,6 +3713,7 @@
   switches[n_switches].validated = validated;
   switches[n_switches].known = known;
   switches[n_switches].ordering = 0;
+  switches[n_switches].lang_mask = lang_mask;
   n_switches++;
 }
 
@@ -3739,7 +3754,8 @@
 	 diagnosed only if there are warnings.  */
       save_switch (decoded->canonical_option[0],
 		   decoded->canonical_option_num_elements - 1,
-		   &decoded->canonical_option[1], false, true);
+		   &decoded->canonical_option[1], false, true,
+		   cl_options[decoded->opt_index].flags);
       return false;
     }
   if (decoded->opt_index == OPT_SPECIAL_unknown)
@@ -3747,7 +3763,8 @@
       /* Give it a chance to define it a spec file.  */
       save_switch (decoded->canonical_option[0],
 		   decoded->canonical_option_num_elements - 1,
-		   &decoded->canonical_option[1], false, false);
+		   &decoded->canonical_option[1], false, false,
+		   cl_options[decoded->opt_index].flags);
       return false;
     }
   else
@@ -3774,7 +3791,8 @@
   else
     save_switch (decoded->canonical_option[0],
 		 decoded->canonical_option_num_elements - 1,
-		 &decoded->canonical_option[1], false, true);
+		 &decoded->canonical_option[1], false, true,
+		 option->flags);
 }
 
 static const char *spec_lang = 0;
@@ -4033,7 +4051,8 @@
 	compare_debug_opt = NULL;
       else
 	compare_debug_opt = arg;
-      save_switch (compare_debug_replacement_opt, 0, NULL, validated, true);
+      save_switch (compare_debug_replacement_opt, 0, NULL, validated, true,
+		   cl_options[opt_index].flags);
       set_source_date_epoch_envvar ();
       return true;
 
@@ -4094,17 +4113,17 @@
 	for (j = 0; arg[j]; j++)
 	  if (arg[j] == ',')
 	    {
-	      add_infile (save_string (arg + prev, j - prev), "*");
+	      add_infile (save_string (arg + prev, j - prev), "*", 0);
 	      prev = j + 1;
 	    }
 	/* Record the part after the last comma.  */
-	add_infile (arg + prev, "*");
+	add_infile (arg + prev, "*", 0);
       }
       do_save = false;
       break;
 
     case OPT_Xlinker:
-      add_infile (arg, "*");
+      add_infile (arg, "*", 0);
       do_save = false;
       break;
 
@@ -4121,19 +4140,21 @@
     case OPT_l:
       /* POSIX allows separation of -l and the lib arg; canonicalize
 	 by concatenating -l with its arg */
-      add_infile (concat ("-l", arg, NULL), "*");
+      add_infile (concat ("-l", arg, NULL), "*", 0);
       do_save = false;
       break;
 
     case OPT_L:
       /* Similarly, canonicalize -L for linkers that may not accept
 	 separate arguments.  */
-      save_switch (concat ("-L", arg, NULL), 0, NULL, validated, true);
+      save_switch (concat ("-L", arg, NULL), 0, NULL, validated, true,
+		   cl_options[opt_index].flags);
       return true;
 
     case OPT_F:
       /* Likewise -F.  */
-      save_switch (concat ("-F", arg, NULL), 0, NULL, validated, true);
+      save_switch (concat ("-F", arg, NULL), 0, NULL, validated, true,
+		   cl_options[opt_index].flags);
       return true;
 
     case OPT_save_temps:
@@ -4260,7 +4281,8 @@
       save_temps_prefix = xstrdup (arg);
       /* On some systems, ld cannot handle "-o" without a space.  So
 	 split the option from its argument.  */
-      save_switch ("-o", 1, &arg, validated, true);
+      save_switch ("-o", 1, &arg, validated, true,
+		   cl_options[opt_index].flags);
       return true;
 
 #ifdef ENABLE_DEFAULT_PIE
@@ -4294,9 +4316,12 @@
     }
 
   if (do_save)
+    {
     save_switch (decoded->canonical_option[0],
 		 decoded->canonical_option_num_elements - 1,
-		 &decoded->canonical_option[1], validated, true);
+		 &decoded->canonical_option[1], validated, true,
+		 cl_options[opt_index].flags);
+    }
   return true;
 }
 
@@ -4596,7 +4621,7 @@
 	      error ("%s: %m", fname + resp);
 	    }
           else
-	    add_infile (arg, spec_lang);
+	    add_infile (arg, spec_lang, 0);
 
           free (fname);
 	  continue;
@@ -4746,7 +4771,8 @@
   if (compare_debug == 2 || compare_debug == 3)
     {
       const char *opt = concat ("-fcompare-debug=", compare_debug_opt, NULL);
-      save_switch (opt, 0, NULL, false, true);
+      save_switch (opt, 0, NULL, false, true,
+		   cl_options[OPT_fcompare_debug_].flags);
       compare_debug = 1;
     }
 
@@ -4757,7 +4783,7 @@
 
       /* Create a dummy input file, so that we can pass
 	 the help option on to the various sub-processes.  */
-      add_infile ("help-dummy", "c");
+      add_infile ("help-dummy", "c", 0);
     }
 
   /* Decide if undefined variable references are allowed in specs.  */
@@ -4978,13 +5004,15 @@
 }
 
 /* Process the spec SPEC and run the commands specified therein.
+   If LANG_MASK is nonzero, switches for other languages are discarded.
    Returns 0 if the spec is successfully processed; -1 if failed.  */
 
 int
-do_spec (const char *spec)
+do_spec (const char *spec, unsigned int lang_mask)
 {
   int value;
 
+  spec_lang_mask_accept = lang_mask;
   value = do_spec_2 (spec, NULL);
 
   /* Force out any unfinished command.
@@ -5144,7 +5172,8 @@
 	      save_switch (decoded_options[j].canonical_option[0],
 			   (decoded_options[j].canonical_option_num_elements
 			    - 1),
-			   &decoded_options[j].canonical_option[1], false, true);
+			   &decoded_options[j].canonical_option[1], false, true,
+			   cl_options[decoded_options[j].opt_index].flags);
 	      break;
 
 	    default:
@@ -6723,6 +6752,14 @@
 static void
 give_switch (int switchnum, int omit_first_word)
 {
+  int lang_mask = switches[switchnum].lang_mask & ((1U << cl_lang_count) - 1);
+  unsigned int lang_mask_accept = (1U << cl_lang_count) - 1;
+  if (spec_lang_mask_accept != 0)
+    lang_mask_accept = spec_lang_mask_accept;
+  /* Drop switches specific to a language not in the given mask.  */
+  if (lang_mask != 0 && !(lang_mask & lang_mask_accept))
+    return;
+
   if ((switches[switchnum].live_cond & SWITCH_IGNORE) != 0)
     return;
 
@@ -7829,9 +7866,6 @@
 		    strlen (offload_targets) + 1);
       xputenv (XOBFINISH (&collect_obstack, char *));
     }
-
-  free (offload_targets);
-  offload_targets = NULL;
 }
 
 /* Reject switches that no pass was interested in.  */
@@ -8145,7 +8179,8 @@
 		  debug_check_temp_file[1] = NULL;
 		}
 
-	      value = do_spec (input_file_compiler->spec);
+	      value = do_spec (input_file_compiler->spec,
+			       infiles[i].lang_mask);
 	      infiles[i].compiled = true;
 	      if (value < 0)
 		this_file_error = 1;
@@ -8160,7 +8195,8 @@
 		  n_switches_alloc = n_switches_alloc_debug_check[1];
 		  switches = switches_debug_check[1];
 
-		  value = do_spec (input_file_compiler->spec);
+		  value = do_spec (input_file_compiler->spec,
+				   infiles[i].lang_mask);
 
 		  compare_debug = -compare_debug;
 		  n_switches = n_switches_debug_check[0];
@@ -8315,7 +8351,7 @@
 		    " to the linker.\n\n"));
 	  fflush (stdout);
 	}
-      int value = do_spec (link_command_spec);
+      int value = do_spec (link_command_spec, 0);
       if (value < 0)
 	errorcount = 1;
       linker_was_run = (tmp != execution_count);
@@ -9992,6 +10028,53 @@
   return result;
 }
 
+/* If applicable, generate a C source file containing a constructor call to
+   GOMP_set_offload_targets, to inform libgomp which offload targets have
+   actually been requested (-foffload=[...]), and adds that as an infile.  */
+
+static const char *
+add_omp_infile_spec_func (int argc, const char **)
+{
+  gcc_assert (argc == 0);
+
+  /* Nothing to do if we're not actually offloading.  */
+  if (!ENABLE_OFFLOADING)
+    return NULL;
+  gcc_assert (offload_targets != NULL);
+
+  /* Nothing to do if we're not actually linking.  */
+  if (have_c)
+    return NULL;
+
+  int err;
+  const char *tmp_filename;
+  tmp_filename = make_temp_file (".c");
+  record_temp_file (tmp_filename, !save_temps_flag, 0);
+  FILE *f = fopen (tmp_filename, "w");
+  if (f == NULL)
+    fatal_error (input_location,
+		 "could not open temporary file %s", tmp_filename);
+  /* As libgomp uses constructors internally, and this code is only added when
+     linking against libgomp, it is fine to use a constructor here.  */
+  err = fprintf (f,
+		 "extern void GOMP_set_offload_targets (const char *);\n"
+		 "static __attribute__ ((constructor)) void\n"
+		 "init (void)\n"
+		 "{\n"
+		 "  GOMP_set_offload_targets (\"%s\");\n"
+		 "}\n",
+		 offload_targets);
+  if (err < 0)
+    fatal_error (input_location,
+		 "could not write to temporary file %s", tmp_filename);
+  err = fclose (f);
+  if (err == EOF)
+    fatal_error (input_location,
+		 "could not close temporary file %s", tmp_filename);
+
+  add_infile (tmp_filename, "cpp-output", CL_C);
+  return NULL;
+}
 
 /* Insert backslash before spaces in ORIG (usually a file path), to 
    avoid being broken by spec parser.
diff --git a/gcc/gcc.h b/gcc/gcc.h
index a0a1d94..21bd036 100644
--- a/gcc/gcc.h
+++ b/gcc/gcc.h
@@ -69,7 +69,7 @@
 };
 
 /* These are exported by gcc.c.  */
-extern int do_spec (const char *);
+extern int do_spec (const char *, unsigned int);
 extern void record_temp_file (const char *, int, int);
 extern void set_input (const char *);
 
diff --git a/gcc/gimple-pretty-print.c b/gcc/gimple-pretty-print.c
index 69bae0d..400c73b 100644
--- a/gcc/gimple-pretty-print.c
+++ b/gcc/gimple-pretty-print.c
@@ -1614,6 +1614,9 @@
     case GF_OMP_TARGET_KIND_OACC_PARALLEL:
       kind = " oacc_parallel";
       break;
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
+      kind = " oacc_serial";
+      break;
     case GF_OMP_TARGET_KIND_OACC_DATA:
       kind = " oacc_data";
       break;
@@ -1629,6 +1632,15 @@
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
       kind = " oacc_host_data";
       break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+      kind = " oacc_parallel_kernels_parallelized";
+      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+      kind = " oacc_parallel_kernels_gang_single";
+      break;
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
+      kind = " oacc_data_kernels";
+      break;
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/gimple.h b/gcc/gimple.h
index 8b5c9e2..2cc5e6f 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -184,6 +184,16 @@
     GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA = 9,
     GF_OMP_TARGET_KIND_OACC_DECLARE = 10,
     GF_OMP_TARGET_KIND_OACC_HOST_DATA = 11,
+    GF_OMP_TARGET_KIND_OACC_SERIAL = 12,
+    /* A GF_OMP_TARGET_KIND_OACC_PARALLEL that originates from a 'kernels'
+       construct, parallelized.  */
+    GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED = 13,
+    /* A GF_OMP_TARGET_KIND_OACC_PARALLEL that originates from a 'kernels'
+       construct, "gang-single".  */
+    GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE = 14,
+    /* A GF_OMP_TARGET_KIND_OACC_DATA that originates from a 'kernels'
+       construct.  */
+    GF_OMP_TARGET_KIND_OACC_DATA_KERNELS = 15,
     GF_OMP_TEAMS_GRID_PHONY	= 1 << 0,
     GF_OMP_TEAMS_HOST		= 1 << 1,
 
@@ -6419,11 +6429,15 @@
 	{
 	case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 	case GF_OMP_TARGET_KIND_OACC_KERNELS:
+	case GF_OMP_TARGET_KIND_OACC_SERIAL:
 	case GF_OMP_TARGET_KIND_OACC_DATA:
 	case GF_OMP_TARGET_KIND_OACC_UPDATE:
 	case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
 	case GF_OMP_TARGET_KIND_OACC_DECLARE:
 	case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
 	  return true;
 	default:
 	  return false;
@@ -6448,6 +6462,9 @@
 	case GF_OMP_TARGET_KIND_REGION:
 	case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 	case GF_OMP_TARGET_KIND_OACC_KERNELS:
+	case GF_OMP_TARGET_KIND_OACC_SERIAL:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
 	  return true;
 	default:
 	  return false;
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index bd8bd6d..8f759d0 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -113,6 +113,13 @@
 
   GOVD_NONTEMPORAL = 4194304,
 
+  /* Flag for GOVD_MAP: (struct) vars that have pointer attachments for
+     fields.  */
+  GOVD_MAP_HAS_ATTACHMENTS = 8388608,
+
+  /* Flag for OpenACC deviceptrs.  */
+  GOVD_DEVICEPTR = (1<<24),
+
   GOVD_DATA_SHARE_CLASS = (GOVD_SHARED | GOVD_PRIVATE | GOVD_FIRSTPRIVATE
 			   | GOVD_LASTPRIVATE | GOVD_REDUCTION | GOVD_LINEAR
 			   | GOVD_LOCAL)
@@ -150,6 +157,7 @@
   ORT_ACC_DATA	= ORT_ACC | ORT_TARGET_DATA, /* Data construct.  */
   ORT_ACC_PARALLEL = ORT_ACC | ORT_TARGET,  /* Parallel construct */
   ORT_ACC_KERNELS  = ORT_ACC | ORT_TARGET | 2,  /* Kernels construct.  */
+  ORT_ACC_SERIAL = ORT_ACC | ORT_TARGET | 4,  /* Serial construct.  */
   ORT_ACC_HOST_DATA = ORT_ACC | ORT_TARGET_DATA | 2,  /* Host data.  */
 
   /* Dummy OpenMP region, used to disable expansion of
@@ -197,6 +205,18 @@
   GDMK_POINTER
 };
 
+/* Used to record clauses representing array slices on data directives that
+   may affect implicit mapping semantics on enclosed OpenACC parallel/kernels
+   regions.  PSET is used for Fortran array slices with array descriptors,
+   or NULL otherwise.  */
+struct oacc_array_mapping_info
+{
+  tree mapping;
+  tree pset;
+  tree pointer;
+  tree ref;
+};
+
 struct gimplify_omp_ctx
 {
   struct gimplify_omp_ctx *outer_context;
@@ -212,6 +232,12 @@
   bool target_firstprivatize_array_bases;
   bool add_safelen1;
   int defaultmap[4];
+  hash_map<tree, oacc_array_mapping_info> *decl_data_clause;
+};
+
+struct privatize_reduction
+{
+  tree ref_var, local_var;
 };
 
 static struct gimplify_ctx *gimplify_ctxp;
@@ -438,6 +464,7 @@
   c->defaultmap[GDMK_AGGREGATE] = GOVD_MAP;
   c->defaultmap[GDMK_ALLOCATABLE] = GOVD_MAP;
   c->defaultmap[GDMK_POINTER] = GOVD_MAP;
+  c->decl_data_clause = NULL;
 
   return c;
 }
@@ -450,6 +477,8 @@
   splay_tree_delete (c->variables);
   delete c->privatized_types;
   c->loop_iter_var.release ();
+  if (c->decl_data_clause)
+    delete c->decl_data_clause;
   XDELETE (c);
 }
 
@@ -5479,6 +5508,7 @@
     case STATEMENT_LIST:
     case OACC_PARALLEL:
     case OACC_KERNELS:
+    case OACC_SERIAL:
     case OACC_DATA:
     case OACC_HOST_DATA:
     case OACC_DECLARE:
@@ -6929,20 +6959,27 @@
   else
     splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags);
 
-  /* For reductions clauses in OpenACC loop directives, by default create a
-     copy clause on the enclosing parallel construct for carrying back the
-     results.  */
+  /* For OpenACC loop directives, when a reduction clause is placed on
+     the outermost acc loop within an acc parallel or kernels
+     construct, it must have an implied copy data mapping. E.g.
+
+       #pragma acc parallel
+	 {
+	   #pragma acc loop reduction (+:sum)
+
+     a copy clause for sum should be added on the enclosing parallel
+     construct for carrying back the results.  */
   if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION))
     {
       struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
-      while (outer_ctx)
+      if (outer_ctx)
 	{
 	  n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl);
 	  if (n != NULL)
 	    {
 	      /* Ignore local variables and explicitly declared clauses.  */
 	      if (n->value & (GOVD_LOCAL | GOVD_EXPLICIT))
-		break;
+		;
 	      else if (outer_ctx->region_type == ORT_ACC_KERNELS)
 		{
 		  /* According to the OpenACC spec, such a reduction variable
@@ -6962,9 +6999,7 @@
 	    {
 	      splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl,
 				 GOVD_MAP | GOVD_SEEN);
-	      break;
 	    }
-	  outer_ctx = outer_ctx->outer_context;
 	}
     }
 }
@@ -7145,15 +7180,20 @@
 {
   const char *rkind;
   bool on_device = false;
+  bool is_private = false;
   bool declared = is_oacc_declared (decl);
   tree type = TREE_TYPE (decl);
 
   if (lang_hooks.decls.omp_privatize_by_reference (decl))
     type = TREE_TYPE (type);
 
+  if (RECORD_OR_UNION_TYPE_P (type))
+    is_private = lang_hooks.decls.omp_disregard_value_expr (decl, false);
+
   if ((ctx->region_type & (ORT_ACC_PARALLEL | ORT_ACC_KERNELS)) != 0
       && is_global_var (decl)
-      && device_resident_p (decl))
+      && device_resident_p (decl)
+      && !is_private)
     {
       on_device = true;
       flags |= GOVD_MAP_TO_ONLY;
@@ -7164,7 +7204,9 @@
     case ORT_ACC_KERNELS:
       rkind = "kernels";
 
-      if (AGGREGATE_TYPE_P (type))
+      if (is_private)
+	flags |= GOVD_MAP;
+      else if (AGGREGATE_TYPE_P (type))
 	{
 	  /* Aggregates default to 'present_or_copy', or 'present'.  */
 	  if (ctx->default_kind != OMP_CLAUSE_DEFAULT_PRESENT)
@@ -7179,9 +7221,12 @@
       break;
 
     case ORT_ACC_PARALLEL:
-      rkind = "parallel";
+    case ORT_ACC_SERIAL:
+      rkind = ctx->region_type == ORT_ACC_PARALLEL ? "parallel" : "serial";
 
-      if (on_device || declared)
+      if (is_private)
+	flags |= GOVD_FIRSTPRIVATE;
+      else if (on_device || declared)
 	flags |= GOVD_MAP;
       else if (AGGREGATE_TYPE_P (type))
 	{
@@ -7247,7 +7292,8 @@
 	{
 	  tree value = get_base_address (DECL_VALUE_EXPR (decl));
 
-	  if (value && DECL_P (value) && DECL_THREAD_LOCAL_P (value))
+	  if (!(ctx->region_type & ORT_ACC)
+	      && value && DECL_P (value) && DECL_THREAD_LOCAL_P (value))
 	    return omp_notice_threadprivate_variable (ctx, decl, value);
 	}
 
@@ -7279,7 +7325,8 @@
   n = splay_tree_lookup (ctx->variables, (splay_tree_key)decl);
   if ((ctx->region_type & ORT_TARGET) != 0)
     {
-      ret = lang_hooks.decls.omp_disregard_value_expr (decl, true);
+      shared = !(ctx->region_type & ORT_ACC);
+      ret = lang_hooks.decls.omp_disregard_value_expr (decl, shared);
       if (n == NULL)
 	{
 	  unsigned nflags = flags;
@@ -7352,6 +7399,7 @@
 		        error ("variable %qE declared in enclosing "
 			       "%<host_data%> region", DECL_NAME (decl));
 		      nflags |= GOVD_MAP;
+		      nflags |= (n2->value & GOVD_DEVICEPTR);
 		      if (octx->region_type == ORT_ACC_DATA
 			  && (n2->value & GOVD_MAP_0LEN_ARRAY))
 			nflags |= GOVD_MAP_0LEN_ARRAY;
@@ -7447,6 +7495,8 @@
     }
 
   shared = ((flags | n->value) & GOVD_SHARED) != 0;
+  if (ctx->region_type & ORT_ACC)
+    shared = false;
   ret = lang_hooks.decls.omp_disregard_value_expr (decl, shared);
 
   /* If nothing changed, there's nothing left to do.  */
@@ -8047,6 +8097,168 @@
   return 1;
 }
 
+/* Insert a GOMP_MAP_ALLOC or GOMP_MAP_RELEASE node following a
+   GOMP_MAP_STRUCT mapping.  C is an always_pointer mapping.  STRUCT_NODE is
+   the struct node to insert the new mapping after (when the struct node is
+   initially created).  PREV_NODE is the first of two or three mappings for a
+   pointer, and is either:
+     - the node before C, when a pair of mappings is used, e.g. for a C/C++
+       array section.
+     - not the node before C.  This is true when we have a reference-to-pointer
+       type (with a mapping for the reference and for the pointer), or for
+       Fortran derived-type mappings with a GOMP_MAP_TO_PSET.
+   If SCP is non-null, the new node is inserted before *SCP.
+   if SCP is null, the new node is inserted before PREV_NODE.
+   The return type is:
+     - PREV_NODE, if SCP is non-null.
+     - The newly-created ALLOC or RELEASE node, if SCP is null.
+     - The second newly-created ALLOC or RELEASE node, if we are mapping a
+       reference to a pointer.  */
+
+static tree
+insert_struct_comp_map (enum tree_code code, tree c, tree struct_node,
+			tree prev_node, tree *scp)
+{
+  enum gomp_map_kind mkind
+    = ((code == OMP_TARGET_EXIT_DATA || code == OACC_EXIT_DATA)
+       ? GOMP_MAP_RELEASE : GOMP_MAP_ALLOC);
+
+  tree c2 = build_omp_clause (OMP_CLAUSE_LOCATION (c), OMP_CLAUSE_MAP);
+  tree cl = scp ? prev_node : c2;
+  OMP_CLAUSE_SET_MAP_KIND (c2, mkind);
+  OMP_CLAUSE_DECL (c2) = unshare_expr (OMP_CLAUSE_DECL (c));
+  OMP_CLAUSE_CHAIN (c2) = scp ? *scp : prev_node;
+  if (OMP_CLAUSE_CHAIN (prev_node) != c
+      && OMP_CLAUSE_CODE (OMP_CLAUSE_CHAIN (prev_node)) == OMP_CLAUSE_MAP
+      && (OMP_CLAUSE_MAP_KIND (OMP_CLAUSE_CHAIN (prev_node))
+	  == GOMP_MAP_TO_PSET))
+    OMP_CLAUSE_SIZE (c2) = OMP_CLAUSE_SIZE (OMP_CLAUSE_CHAIN (prev_node));
+  else
+    OMP_CLAUSE_SIZE (c2) = TYPE_SIZE_UNIT (ptr_type_node);
+  if (struct_node)
+    OMP_CLAUSE_CHAIN (struct_node) = c2;
+
+  /* We might need to create an additional mapping if we have a reference to a
+     pointer (in C++).  Don't do this if we have something other than a
+     GOMP_MAP_ALWAYS_POINTER though, i.e. a GOMP_MAP_TO_PSET.  */
+  if (OMP_CLAUSE_CHAIN (prev_node) != c
+      && OMP_CLAUSE_CODE (OMP_CLAUSE_CHAIN (prev_node)) == OMP_CLAUSE_MAP
+      && ((OMP_CLAUSE_MAP_KIND (OMP_CLAUSE_CHAIN (prev_node))
+	   == GOMP_MAP_ALWAYS_POINTER)
+	  || (OMP_CLAUSE_MAP_KIND (OMP_CLAUSE_CHAIN (prev_node))
+	      == GOMP_MAP_ATTACH_DETACH)))
+    {
+      tree c4 = OMP_CLAUSE_CHAIN (prev_node);
+      tree c3 = build_omp_clause (OMP_CLAUSE_LOCATION (c), OMP_CLAUSE_MAP);
+      OMP_CLAUSE_SET_MAP_KIND (c3, mkind);
+      OMP_CLAUSE_DECL (c3) = unshare_expr (OMP_CLAUSE_DECL (c4));
+      OMP_CLAUSE_SIZE (c3) = TYPE_SIZE_UNIT (ptr_type_node);
+      OMP_CLAUSE_CHAIN (c3) = prev_node;
+      if (!scp)
+	OMP_CLAUSE_CHAIN (c2) = c3;
+      else
+	cl = c3;
+    }
+
+  if (scp)
+    *scp = c2;
+
+  return cl;
+}
+
+/* Called initially with ORIG_BASE non-null, sets PREV_BITPOS and PREV_POFFSET
+   to the offset of the field given in BASE.  Return type is 1 if BASE is equal
+   to *ORIG_BASE after stripping off ARRAY_REF and INDIRECT_REF nodes and
+   calling get_inner_reference, else 0.
+
+   Called subsequently with ORIG_BASE null, compares the offset of the field
+   given in BASE to PREV_BITPOS, PREV_POFFSET. Returns -1 if the base object
+   has changed, 0 if the new value has a higher bit position than that
+   described by the aforementioned arguments, or 1 if the new value is less
+   than them.  Used for (insertion) sorting components after a GOMP_MAP_STRUCT
+   mapping.  */
+
+static int
+check_base_and_compare_lt (tree base, tree *orig_base, tree decl,
+			   poly_int64 *prev_bitpos,
+			   poly_offset_int *prev_poffset)
+{
+  tree offset;
+  poly_int64 bitsize, bitpos;
+  machine_mode mode;
+  int unsignedp, reversep, volatilep = 0;
+  poly_offset_int poffset;
+
+  if (orig_base)
+    {
+      while (TREE_CODE (base) == ARRAY_REF)
+	base = TREE_OPERAND (base, 0);
+
+      if (TREE_CODE (base) == INDIRECT_REF)
+	base = TREE_OPERAND (base, 0);
+    }
+  else
+    {
+      if (TREE_CODE (base) == ARRAY_REF)
+	{
+	  while (TREE_CODE (base) == ARRAY_REF)
+	    base = TREE_OPERAND (base, 0);
+	  if (TREE_CODE (base) != COMPONENT_REF
+	      || TREE_CODE (TREE_TYPE (base)) != ARRAY_TYPE)
+	    return -1;
+	}
+      else if (TREE_CODE (base) == INDIRECT_REF
+	       && TREE_CODE (TREE_OPERAND (base, 0)) == COMPONENT_REF
+	       && (TREE_CODE (TREE_TYPE (TREE_OPERAND (base, 0)))
+		   == REFERENCE_TYPE))
+	base = TREE_OPERAND (base, 0);
+    }
+
+  base = get_inner_reference (base, &bitsize, &bitpos, &offset, &mode,
+			      &unsignedp, &reversep, &volatilep);
+
+  if (orig_base)
+    *orig_base = base;
+
+  if ((TREE_CODE (base) == INDIRECT_REF
+       || (TREE_CODE (base) == MEM_REF
+	   && integer_zerop (TREE_OPERAND (base, 1))))
+      && DECL_P (TREE_OPERAND (base, 0))
+      && TREE_CODE (TREE_TYPE (TREE_OPERAND (base, 0))) == REFERENCE_TYPE)
+    base = TREE_OPERAND (base, 0);
+
+  gcc_assert (offset == NULL_TREE || poly_int_tree_p (offset));
+
+  if (offset)
+    poffset = wi::to_poly_offset (offset);
+  else
+    poffset = 0;
+
+  if (maybe_ne (bitpos, 0))
+    poffset += bits_to_bytes_round_down (bitpos);
+
+  if (orig_base)
+    {
+      gcc_assert (base == decl);
+
+      *prev_bitpos = bitpos;
+      *prev_poffset = poffset;
+
+      return *orig_base == base;
+    }
+  else
+    {
+      if (base != decl)
+	return -1;
+
+      return (maybe_lt (*prev_poffset, poffset)
+	      || (known_eq (*prev_poffset, poffset)
+		  && maybe_lt (*prev_bitpos, bitpos)));
+    }
+
+  return 0;
+}
+
 /* Scan the OMP clauses in *LIST_P, installing mappings into a new
    and previous omp contexts.  */
 
@@ -8058,6 +8270,7 @@
   struct gimplify_omp_ctx *ctx, *outer_ctx;
   tree c;
   hash_map<tree, tree> *struct_map_to_clause = NULL;
+  hash_set<tree> *struct_deref_set = NULL;
   tree *prev_list_p = NULL;
   int handled_depend_iterators = -1;
   int nowait = -1;
@@ -8424,14 +8637,61 @@
 	    case OMP_TARGET:
 	      break;
 	    case OACC_DATA:
-	      if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
-		break;
+	      {
+		tree base_ptr = OMP_CLAUSE_CHAIN (c);
+		tree pset = NULL;
+		if (base_ptr
+		    && OMP_CLAUSE_CODE (base_ptr) == OMP_CLAUSE_MAP
+		    && OMP_CLAUSE_MAP_KIND (base_ptr) == GOMP_MAP_TO_PSET)
+		  {
+		    pset = base_ptr;
+		    base_ptr = OMP_CLAUSE_CHAIN (base_ptr);
+		  }
+		if (base_ptr
+		    && OMP_CLAUSE_CODE (base_ptr) == OMP_CLAUSE_MAP
+		    && !(OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+			 && (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALLOC
+			     || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER))
+		    && OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_TO_PSET
+		    && ((OMP_CLAUSE_MAP_KIND (base_ptr)
+			 == GOMP_MAP_FIRSTPRIVATE_POINTER)
+			|| OMP_CLAUSE_MAP_KIND (base_ptr) == GOMP_MAP_POINTER))
+		  {
+		    /* If we have an array descriptor, fish the right base
+		       address variable to use out of that (otherwise we'd have
+		       to deconstruct "arr.data" in the subsequent pointer
+		       mapping).  */
+	            tree base_addr = pset ? OMP_CLAUSE_DECL (pset)
+					  : OMP_CLAUSE_DECL (base_ptr);
+		    if (!ctx->decl_data_clause)
+		      ctx->decl_data_clause
+			= new hash_map<tree, oacc_array_mapping_info>;
+		    oacc_array_mapping_info ai;
+		    ai.mapping = unshare_expr (c);
+		    ai.pset = pset ? unshare_expr (pset) : NULL;
+		    ai.pointer = unshare_expr (base_ptr);
+		    ai.ref = NULL_TREE;
+		    if (TREE_CODE (base_addr) == INDIRECT_REF
+			&& (TREE_CODE (TREE_TYPE (TREE_OPERAND (base_addr, 0)))
+			    == REFERENCE_TYPE))
+		      {
+			base_addr = TREE_OPERAND (base_addr, 0);
+			tree ref_clause = OMP_CLAUSE_CHAIN (base_ptr);
+			gcc_assert ((OMP_CLAUSE_CODE (ref_clause)
+				     == OMP_CLAUSE_MAP)
+				    && (OMP_CLAUSE_MAP_KIND (ref_clause)
+				        == GOMP_MAP_POINTER));
+			ai.ref = unshare_expr (ref_clause);
+		      }
+		    ctx->decl_data_clause->put (base_addr, ai);
+		  }
+		if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
+		  break;
+	      }
 	      /* FALLTHRU */
 	    case OMP_TARGET_DATA:
 	    case OMP_TARGET_ENTER_DATA:
 	    case OMP_TARGET_EXIT_DATA:
-	    case OACC_ENTER_DATA:
-	    case OACC_EXIT_DATA:
 	    case OACC_HOST_DATA:
 	      if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_POINTER
 		  || (OMP_CLAUSE_MAP_KIND (c)
@@ -8440,6 +8700,21 @@
 		   mapped, but not the pointer to it.  */
 		remove = true;
 	      break;
+	    case OACC_ENTER_DATA:
+	    case OACC_EXIT_DATA:
+	      if ((OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER
+		   || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_TO_PSET
+		   || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_POINTER
+		   || (OMP_CLAUSE_MAP_KIND (c)
+		       == GOMP_MAP_FIRSTPRIVATE_REFERENCE))
+		  && !(prev_list_p
+		       && OMP_CLAUSE_CODE (*prev_list_p) == OMP_CLAUSE_MAP
+		       && ((OMP_CLAUSE_MAP_KIND (*prev_list_p)
+			    == GOMP_MAP_DECLARE_ALLOCATE)
+			   || (OMP_CLAUSE_MAP_KIND (*prev_list_p)
+			       == GOMP_MAP_DECLARE_DEALLOCATE))))
+		remove = true;
+	      break;
 	    default:
 	      break;
 	    }
@@ -8464,8 +8739,28 @@
 	  if (OMP_CLAUSE_SIZE (c) == NULL_TREE)
 	    OMP_CLAUSE_SIZE (c) = DECL_P (decl) ? DECL_SIZE_UNIT (decl)
 				  : TYPE_SIZE_UNIT (TREE_TYPE (decl));
-	  if (gimplify_expr (&OMP_CLAUSE_SIZE (c), pre_p,
-			     NULL, is_gimple_val, fb_rvalue) == GS_ERROR)
+	  if (OMP_CLAUSE_SIZE (c)
+	      && TREE_CODE (OMP_CLAUSE_SIZE (c)) == TREE_LIST
+	      && GOMP_MAP_DYNAMIC_ARRAY_P (OMP_CLAUSE_MAP_KIND (c)))
+	    {
+	      tree dims = OMP_CLAUSE_SIZE (c);
+	      for (tree t = dims; t; t = TREE_CHAIN (t))
+		{
+		  /* If a dimension bias isn't a constant, we have to ensure
+		     that the value gets transferred to the offload target.  */
+		  tree low_bound = TREE_PURPOSE (t);
+		  if (TREE_CODE (low_bound) != INTEGER_CST)
+		    {
+		      low_bound = get_initialized_tmp_var (low_bound, pre_p,
+							   NULL, false);
+		      omp_add_variable (ctx, low_bound,
+					GOVD_FIRSTPRIVATE | GOVD_SEEN);
+		      TREE_PURPOSE (t) = low_bound;
+		    }
+		}
+	    }
+	  else if (gimplify_expr (&OMP_CLAUSE_SIZE (c), pre_p,
+				  NULL, is_gimple_val, fb_rvalue) == GS_ERROR)
 	    {
 	      remove = true;
 	      break;
@@ -8502,7 +8797,35 @@
 		  pd = &TREE_OPERAND (decl, 0);
 		  decl = TREE_OPERAND (decl, 0);
 		}
-	      if (TREE_CODE (decl) == COMPONENT_REF)
+	      bool indir_p = false;
+	      tree orig_decl = decl;
+	      tree decl_ref = NULL_TREE;
+	      if ((region_type & ORT_ACC) != 0
+		  && TREE_CODE (*pd) == COMPONENT_REF
+		  && OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH_DETACH
+		  && code != OACC_UPDATE)
+		{
+		  while (TREE_CODE (decl) == COMPONENT_REF)
+		    {
+		      decl = TREE_OPERAND (decl, 0);
+		      if ((TREE_CODE (decl) == MEM_REF
+			   && integer_zerop (TREE_OPERAND (decl, 1)))
+			  || INDIRECT_REF_P (decl))
+			{
+			  indir_p = true;
+			  decl = TREE_OPERAND (decl, 0);
+			}
+		      if (TREE_CODE (decl) == INDIRECT_REF
+			  && DECL_P (TREE_OPERAND (decl, 0))
+			  && (TREE_CODE (TREE_TYPE (TREE_OPERAND (decl, 0)))
+			      == REFERENCE_TYPE))
+			{
+			  decl_ref = decl;
+			  decl = TREE_OPERAND (decl, 0);
+			}
+		    }
+		}
+	      else if (TREE_CODE (decl) == COMPONENT_REF)
 		{
 		  while (TREE_CODE (decl) == COMPONENT_REF)
 		    decl = TREE_OPERAND (decl, 0);
@@ -8512,13 +8835,76 @@
 			  == REFERENCE_TYPE))
 		    decl = TREE_OPERAND (decl, 0);
 		}
+	      if (decl != orig_decl && DECL_P (decl) && indir_p)
+		{
+		  gomp_map_kind k = (code == OACC_EXIT_DATA) ? GOMP_MAP_DETACH
+							     : GOMP_MAP_ATTACH;
+		  /* We have a dereference of a struct member.  Make this an
+		     attach/detach operation, and ensure the base pointer is
+		     mapped as a FIRSTPRIVATE_POINTER.  */
+		  OMP_CLAUSE_SET_MAP_KIND (c, k);
+		  flags = GOVD_MAP | GOVD_SEEN | GOVD_EXPLICIT;
+		  tree next_clause = OMP_CLAUSE_CHAIN (c);
+		  if (k == GOMP_MAP_ATTACH
+		      && code != OACC_ENTER_DATA
+		      && (!next_clause
+			   || (OMP_CLAUSE_CODE (next_clause) != OMP_CLAUSE_MAP)
+			   || (OMP_CLAUSE_MAP_KIND (next_clause)
+			       != GOMP_MAP_POINTER)
+			   || OMP_CLAUSE_DECL (next_clause) != decl)
+		      && (!struct_deref_set
+			  || !struct_deref_set->contains (decl)))
+		    {
+		      if (!struct_deref_set)
+		        struct_deref_set = new hash_set<tree> ();
+		      /* As well as the attach, we also need a
+			 FIRSTPRIVATE_POINTER clause to properly map the
+			 pointer to the struct base.  */
+		      tree c2 = build_omp_clause (OMP_CLAUSE_LOCATION (c),
+						  OMP_CLAUSE_MAP);
+		      OMP_CLAUSE_SET_MAP_KIND (c2, GOMP_MAP_ALLOC);
+		      OMP_CLAUSE_MAP_MAYBE_ZERO_LENGTH_ARRAY_SECTION (c2)
+			= 1;
+		      tree charptr_zero
+		        = build_int_cst (build_pointer_type (char_type_node),
+					 0);
+		      OMP_CLAUSE_DECL (c2)
+			= build2 (MEM_REF, char_type_node,
+				  decl_ref ? decl_ref : decl, charptr_zero);
+		      OMP_CLAUSE_SIZE (c2) = size_zero_node;
+		      tree c3 = build_omp_clause (OMP_CLAUSE_LOCATION (c),
+						  OMP_CLAUSE_MAP);
+		      OMP_CLAUSE_SET_MAP_KIND (c3,
+					       GOMP_MAP_FIRSTPRIVATE_POINTER);
+		      OMP_CLAUSE_DECL (c3) = decl;
+		      OMP_CLAUSE_SIZE (c3) = size_zero_node;
+		      tree mapgrp = *prev_list_p;
+		      *prev_list_p = c2;
+		      OMP_CLAUSE_CHAIN (c3) = mapgrp;
+		      OMP_CLAUSE_CHAIN (c2) = c3;
+
+		      struct_deref_set->add (decl);
+		    }
+		  goto do_add_decl;
+		}
+	      /* An "attach/detach" operation on an update directive should
+	         behave as a GOMP_MAP_ALWAYS_POINTER.  Beware that
+		 unlike attach or detach map kinds, GOMP_MAP_ALWAYS_POINTER
+		 depends on the previous mapping.  */
+	      if (code == OACC_UPDATE
+		  && OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH_DETACH)
+		OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_ALWAYS_POINTER);
 	      if (gimplify_expr (pd, pre_p, NULL, is_gimple_lvalue, fb_lvalue)
 		  == GS_ERROR)
 		{
 		  remove = true;
 		  break;
 		}
-	      if (DECL_P (decl))
+	      if (DECL_P (decl)
+		  && OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_TO_PSET
+		  && OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_ATTACH
+		  && OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_DETACH
+		  && code != OACC_UPDATE)
 		{
 		  if (error_operand_p (decl))
 		    {
@@ -8539,7 +8925,8 @@
 		      break;
 		    }
 
-		  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_POINTER)
+		  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_POINTER
+		      || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH_DETACH)
 		    {
 		      /* Error recovery.  */
 		      if (prev_list_p == NULL)
@@ -8558,75 +8945,62 @@
 			}
 		    }
 
-		  tree offset;
-		  poly_int64 bitsize, bitpos;
-		  machine_mode mode;
-		  int unsignedp, reversep, volatilep = 0;
-		  tree base = OMP_CLAUSE_DECL (c);
-		  while (TREE_CODE (base) == ARRAY_REF)
-		    base = TREE_OPERAND (base, 0);
-		  if (TREE_CODE (base) == INDIRECT_REF)
-		    base = TREE_OPERAND (base, 0);
-		  base = get_inner_reference (base, &bitsize, &bitpos, &offset,
-					      &mode, &unsignedp, &reversep,
-					      &volatilep);
-		  tree orig_base = base;
-		  if ((TREE_CODE (base) == INDIRECT_REF
-		       || (TREE_CODE (base) == MEM_REF
-			   && integer_zerop (TREE_OPERAND (base, 1))))
-		      && DECL_P (TREE_OPERAND (base, 0))
-		      && (TREE_CODE (TREE_TYPE (TREE_OPERAND (base, 0)))
-			  == REFERENCE_TYPE))
-		    base = TREE_OPERAND (base, 0);
-		  gcc_assert (base == decl
-			      && (offset == NULL_TREE
-				  || poly_int_tree_p (offset)));
+		  tree orig_base;
+		  poly_int64 bitpos1;
+		  poly_offset_int offset1;
+
+		  int base_eq_orig_base
+		    = check_base_and_compare_lt (OMP_CLAUSE_DECL (c),
+						 &orig_base, decl, &bitpos1,
+						 &offset1);
 
 		  splay_tree_node n
 		    = splay_tree_lookup (ctx->variables, (splay_tree_key)decl);
 		  bool ptr = (OMP_CLAUSE_MAP_KIND (c)
 			      == GOMP_MAP_ALWAYS_POINTER);
+		  bool attach_detach = (OMP_CLAUSE_MAP_KIND (c)
+					== GOMP_MAP_ATTACH_DETACH);
+		  bool attach = OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ATTACH
+			        || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_DETACH;
+		  bool has_attachments = false;
+		  /* For OpenACC, pointers in structs should trigger an
+		     attach action.  */
+		  if (attach_detach && (region_type & ORT_ACC) != 0)
+		    {
+		      /* Turning a GOMP_MAP_ALWAYS_POINTER clause into a
+			 GOMP_MAP_ATTACH clause after we have detected a case
+			 that needs a GOMP_MAP_STRUCT mapping added.  */
+		      gomp_map_kind k
+			= (code == OACC_EXIT_DATA) ? GOMP_MAP_DETACH
+						   : GOMP_MAP_ATTACH;
+		      OMP_CLAUSE_SET_MAP_KIND (c, k);
+		      has_attachments = true;
+		    }
 		  if (n == NULL || (n->value & GOVD_MAP) == 0)
 		    {
 		      tree l = build_omp_clause (OMP_CLAUSE_LOCATION (c),
 						 OMP_CLAUSE_MAP);
-		      OMP_CLAUSE_SET_MAP_KIND (l, GOMP_MAP_STRUCT);
-		      if (orig_base != base)
+		      gomp_map_kind k = attach ? GOMP_MAP_FORCE_PRESENT
+					       : GOMP_MAP_STRUCT;
+
+		      OMP_CLAUSE_SET_MAP_KIND (l, k);
+		      if (!base_eq_orig_base)
 			OMP_CLAUSE_DECL (l) = unshare_expr (orig_base);
 		      else
 			OMP_CLAUSE_DECL (l) = decl;
-		      OMP_CLAUSE_SIZE (l) = size_int (1);
+		      OMP_CLAUSE_SIZE (l)
+			= (!attach
+			   ? size_int (1)
+			   : DECL_P (OMP_CLAUSE_DECL (l))
+			   ? DECL_SIZE_UNIT (OMP_CLAUSE_DECL (l))
+			   : TYPE_SIZE_UNIT (TREE_TYPE (OMP_CLAUSE_DECL (l))));
 		      if (struct_map_to_clause == NULL)
 			struct_map_to_clause = new hash_map<tree, tree>;
 		      struct_map_to_clause->put (decl, l);
-		      if (ptr)
+		      if (ptr || attach_detach)
 			{
-			  enum gomp_map_kind mkind
-			    = code == OMP_TARGET_EXIT_DATA
-			      ? GOMP_MAP_RELEASE : GOMP_MAP_ALLOC;
-			  tree c2 = build_omp_clause (OMP_CLAUSE_LOCATION (c),
-						      OMP_CLAUSE_MAP);
-			  OMP_CLAUSE_SET_MAP_KIND (c2, mkind);
-			  OMP_CLAUSE_DECL (c2)
-			    = unshare_expr (OMP_CLAUSE_DECL (c));
-			  OMP_CLAUSE_CHAIN (c2) = *prev_list_p;
-			  OMP_CLAUSE_SIZE (c2)
-			    = TYPE_SIZE_UNIT (ptr_type_node);
-			  OMP_CLAUSE_CHAIN (l) = c2;
-			  if (OMP_CLAUSE_CHAIN (*prev_list_p) != c)
-			    {
-			      tree c4 = OMP_CLAUSE_CHAIN (*prev_list_p);
-			      tree c3
-				= build_omp_clause (OMP_CLAUSE_LOCATION (c),
-						    OMP_CLAUSE_MAP);
-			      OMP_CLAUSE_SET_MAP_KIND (c3, mkind);
-			      OMP_CLAUSE_DECL (c3)
-				= unshare_expr (OMP_CLAUSE_DECL (c4));
-			      OMP_CLAUSE_SIZE (c3)
-				= TYPE_SIZE_UNIT (ptr_type_node);
-			      OMP_CLAUSE_CHAIN (c3) = *prev_list_p;
-			      OMP_CLAUSE_CHAIN (c2) = c3;
-			    }
+			  insert_struct_comp_map (code, c, l, *prev_list_p,
+						  NULL);
 			  *prev_list_p = l;
 			  prev_list_p = NULL;
 			}
@@ -8636,7 +9010,7 @@
 			  *list_p = l;
 			  list_p = &OMP_CLAUSE_CHAIN (l);
 			}
-		      if (orig_base != base && code == OMP_TARGET)
+		      if (!base_eq_orig_base && code == OMP_TARGET)
 			{
 			  tree c2 = build_omp_clause (OMP_CLAUSE_LOCATION (c),
 						      OMP_CLAUSE_MAP);
@@ -8649,30 +9023,31 @@
 			  OMP_CLAUSE_CHAIN (l) = c2;
 			}
 		      flags = GOVD_MAP | GOVD_EXPLICIT;
-		      if (GOMP_MAP_ALWAYS_P (OMP_CLAUSE_MAP_KIND (c)) || ptr)
+		      if (GOMP_MAP_ALWAYS_P (OMP_CLAUSE_MAP_KIND (c))
+			  || ptr
+			  || attach_detach)
 			flags |= GOVD_SEEN;
+		      if (has_attachments)
+			flags |= GOVD_MAP_HAS_ATTACHMENTS;
 		      goto do_add_decl;
 		    }
-		  else
+		  else if (struct_map_to_clause)
 		    {
 		      tree *osc = struct_map_to_clause->get (decl);
 		      tree *sc = NULL, *scp = NULL;
-		      if (GOMP_MAP_ALWAYS_P (OMP_CLAUSE_MAP_KIND (c)) || ptr)
+		      if (GOMP_MAP_ALWAYS_P (OMP_CLAUSE_MAP_KIND (c))
+			  || ptr
+			  || attach_detach)
 			n->value |= GOVD_SEEN;
-		      poly_offset_int o1, o2;
-		      if (offset)
-			o1 = wi::to_poly_offset (offset);
-		      else
-			o1 = 0;
-		      if (maybe_ne (bitpos, 0))
-			o1 += bits_to_bytes_round_down (bitpos);
 		      sc = &OMP_CLAUSE_CHAIN (*osc);
 		      if (*sc != c
 			  && (OMP_CLAUSE_MAP_KIND (*sc)
-			      == GOMP_MAP_FIRSTPRIVATE_REFERENCE)) 
+			      == GOMP_MAP_FIRSTPRIVATE_REFERENCE))
 			sc = &OMP_CLAUSE_CHAIN (*sc);
+		      /* Here "prev_list_p" is the end of the inserted
+			 alloc/release nodes after the struct node, OSC.  */
 		      for (; *sc != c; sc = &OMP_CLAUSE_CHAIN (*sc))
-			if (ptr && sc == prev_list_p)
+			if ((ptr || attach_detach) && sc == prev_list_p)
 			  break;
 			else if (TREE_CODE (OMP_CLAUSE_DECL (*sc))
 				 != COMPONENT_REF
@@ -8683,44 +9058,14 @@
 			  break;
 			else
 			  {
-			    tree offset2;
-			    poly_int64 bitsize2, bitpos2;
-			    base = OMP_CLAUSE_DECL (*sc);
-			    if (TREE_CODE (base) == ARRAY_REF)
-			      {
-				while (TREE_CODE (base) == ARRAY_REF)
-				  base = TREE_OPERAND (base, 0);
-				if (TREE_CODE (base) != COMPONENT_REF
-				    || (TREE_CODE (TREE_TYPE (base))
-					!= ARRAY_TYPE))
-				  break;
-			      }
-			    else if (TREE_CODE (base) == INDIRECT_REF
-				     && (TREE_CODE (TREE_OPERAND (base, 0))
-					 == COMPONENT_REF)
-				     && (TREE_CODE (TREE_TYPE
-						     (TREE_OPERAND (base, 0)))
-					 == REFERENCE_TYPE))
-			      base = TREE_OPERAND (base, 0);
-			    base = get_inner_reference (base, &bitsize2,
-							&bitpos2, &offset2,
-							&mode, &unsignedp,
-							&reversep, &volatilep);
-			    if ((TREE_CODE (base) == INDIRECT_REF
-				 || (TREE_CODE (base) == MEM_REF
-				     && integer_zerop (TREE_OPERAND (base,
-								     1))))
-				&& DECL_P (TREE_OPERAND (base, 0))
-				&& (TREE_CODE (TREE_TYPE (TREE_OPERAND (base,
-									0)))
-				    == REFERENCE_TYPE))
-			      base = TREE_OPERAND (base, 0);
-			    if (base != decl)
+			    tree sc_decl = OMP_CLAUSE_DECL (*sc);
+			    int same_decl_offset_lt
+			      = check_base_and_compare_lt (sc_decl, NULL, decl,
+							   &bitpos1, &offset1);
+			    if (same_decl_offset_lt == -1)
 			      break;
 			    if (scp)
 			      continue;
-			    gcc_assert (offset2 == NULL_TREE
-					|| poly_int_tree_p (offset2));
 			    tree d1 = OMP_CLAUSE_DECL (*sc);
 			    tree d2 = OMP_CLAUSE_DECL (c);
 			    while (TREE_CODE (d1) == ARRAY_REF)
@@ -8749,16 +9094,9 @@
 				remove = true;
 				break;
 			      }
-			    if (offset2)
-			      o2 = wi::to_poly_offset (offset2);
-			    else
-			      o2 = 0;
-			    o2 += bits_to_bytes_round_down (bitpos2);
-			    if (maybe_lt (o1, o2)
-				|| (known_eq (o1, o2)
-				    && maybe_lt (bitpos, bitpos2)))
+			    if (same_decl_offset_lt)
 			      {
-				if (ptr)
+				if (ptr || attach_detach)
 				  scp = sc;
 				else
 				  break;
@@ -8766,43 +9104,14 @@
 			  }
 		      if (remove)
 			break;
-		      OMP_CLAUSE_SIZE (*osc)
-			= size_binop (PLUS_EXPR, OMP_CLAUSE_SIZE (*osc),
-				      size_one_node);
-		      if (ptr)
+		      if (!attach)
+			OMP_CLAUSE_SIZE (*osc)
+			  = size_binop (PLUS_EXPR, OMP_CLAUSE_SIZE (*osc),
+					size_one_node);
+		      if (ptr || attach_detach)
 			{
-			  tree c2 = build_omp_clause (OMP_CLAUSE_LOCATION (c),
-						      OMP_CLAUSE_MAP);
-			  tree cl = NULL_TREE;
-			  enum gomp_map_kind mkind
-			    = code == OMP_TARGET_EXIT_DATA
-			      ? GOMP_MAP_RELEASE : GOMP_MAP_ALLOC;
-			  OMP_CLAUSE_SET_MAP_KIND (c2, mkind);
-			  OMP_CLAUSE_DECL (c2)
-			    = unshare_expr (OMP_CLAUSE_DECL (c));
-			  OMP_CLAUSE_CHAIN (c2) = scp ? *scp : *prev_list_p;
-			  OMP_CLAUSE_SIZE (c2)
-			    = TYPE_SIZE_UNIT (ptr_type_node);
-			  cl = scp ? *prev_list_p : c2;
-			  if (OMP_CLAUSE_CHAIN (*prev_list_p) != c)
-			    {
-			      tree c4 = OMP_CLAUSE_CHAIN (*prev_list_p);
-			      tree c3
-				= build_omp_clause (OMP_CLAUSE_LOCATION (c),
-						    OMP_CLAUSE_MAP);
-			      OMP_CLAUSE_SET_MAP_KIND (c3, mkind);
-			      OMP_CLAUSE_DECL (c3)
-				= unshare_expr (OMP_CLAUSE_DECL (c4));
-			      OMP_CLAUSE_SIZE (c3)
-				= TYPE_SIZE_UNIT (ptr_type_node);
-			      OMP_CLAUSE_CHAIN (c3) = *prev_list_p;
-			      if (!scp)
-				OMP_CLAUSE_CHAIN (c2) = c3;
-			      else
-				cl = c3;
-			    }
-			  if (scp)
-			    *scp = c2;
+			  tree cl = insert_struct_comp_map (code, c, NULL,
+							    *prev_list_p, scp);
 			  if (sc == prev_list_p)
 			    {
 			      *sc = cl;
@@ -8829,17 +9138,26 @@
 		}
 	      if (!remove
 		  && OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_ALWAYS_POINTER
+		  && OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_ATTACH_DETACH
+		  && OMP_CLAUSE_MAP_KIND (c) != GOMP_MAP_TO_PSET
 		  && OMP_CLAUSE_CHAIN (c)
 		  && OMP_CLAUSE_CODE (OMP_CLAUSE_CHAIN (c)) == OMP_CLAUSE_MAP
-		  && (OMP_CLAUSE_MAP_KIND (OMP_CLAUSE_CHAIN (c))
-		      == GOMP_MAP_ALWAYS_POINTER))
+		  && ((OMP_CLAUSE_MAP_KIND (OMP_CLAUSE_CHAIN (c))
+		       == GOMP_MAP_ALWAYS_POINTER)
+		      || (OMP_CLAUSE_MAP_KIND (OMP_CLAUSE_CHAIN (c))
+			  == GOMP_MAP_ATTACH_DETACH)
+		      || (OMP_CLAUSE_MAP_KIND (OMP_CLAUSE_CHAIN (c))
+		          == GOMP_MAP_TO_PSET)))
 		prev_list_p = list_p;
+
 	      break;
 	    }
 	  flags = GOVD_MAP | GOVD_EXPLICIT;
 	  if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_TO
 	      || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_ALWAYS_TOFROM)
 	    flags |= GOVD_MAP_ALWAYS_TO;
+	  else if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FORCE_DEVICEPTR)
+	    flags |= GOVD_DEVICEPTR;
 	  goto do_add;
 
 	case OMP_CLAUSE_DEPEND:
@@ -9297,6 +9615,7 @@
 	  ctx->default_kind = OMP_CLAUSE_DEFAULT_KIND (c);
 	  break;
 
+	case OMP_CLAUSE_NOHOST:
 	default:
 	  gcc_unreachable ();
 	}
@@ -9315,6 +9634,8 @@
   gimplify_omp_ctxp = ctx;
   if (struct_map_to_clause)
     delete struct_map_to_clause;
+  if (struct_deref_set)
+    delete struct_deref_set;
 }
 
 /* Return true if DECL is a candidate for shared to firstprivate
@@ -9440,6 +9761,51 @@
   gimple_seq *pre_p;
 };
 
+/* For OpenACC parallel and kernels regions, the implicit data mappings for
+   arrays must respect explicit data clauses set by a containing acc data
+   region.  Specifically, an array section on the data clause must be
+   transformed into an equivalent PRESENT mapping on the inner parallel or
+   kernels region.  This function returns a pointer to an
+   oacc_array_mapping_info if an array slice of DECL is specified in a
+   lexically-enclosing data construct, or returns NULL otherwise.  */
+
+static oacc_array_mapping_info *
+gomp_oacc_needs_data_present (tree decl)
+{
+  gimplify_omp_ctx *ctx = NULL;
+
+  if (gimplify_omp_ctxp->region_type != ORT_ACC_PARALLEL
+      && gimplify_omp_ctxp->region_type != ORT_ACC_KERNELS
+      && gimplify_omp_ctxp->region_type != ORT_ACC_SERIAL)
+    return NULL;
+
+  tree type = TREE_TYPE (decl);
+  if (TREE_CODE (type) == REFERENCE_TYPE)
+    type = TREE_TYPE (type);
+
+  if (TREE_CODE (type) != ARRAY_TYPE
+      && TREE_CODE (type) != POINTER_TYPE
+      && TREE_CODE (type) != RECORD_TYPE
+      && (TREE_CODE (type) != POINTER_TYPE
+	  || TREE_CODE (TREE_TYPE (type)) != ARRAY_TYPE))
+    return NULL;
+
+  decl = get_base_address (decl);
+
+  for (ctx = gimplify_omp_ctxp->outer_context; ctx; ctx = ctx->outer_context)
+    {
+      oacc_array_mapping_info *ret;
+
+      if (ctx->region_type != ORT_ACC_DATA)
+	break;
+
+      if (ctx->decl_data_clause && (ret = ctx->decl_data_clause->get (decl)))
+	return ret;
+    }
+
+  return NULL;
+}
+
 /* For all variables that were not actually used within the context,
    remove PRIVATE, SHARED, and FIRSTPRIVATE clauses.  */
 
@@ -9459,6 +9825,8 @@
     return 0;
   if ((flags & GOVD_SEEN) == 0)
     return 0;
+  if ((flags & GOVD_MAP_HAS_ATTACHMENTS) != 0)
+    return 0;
   if (flags & GOVD_DEBUG_PRIVATE)
     {
       gcc_assert ((flags & GOVD_DATA_SHARE_CLASS) == GOVD_SHARED);
@@ -9532,6 +9900,7 @@
   clause = build_omp_clause (input_location, code);
   OMP_CLAUSE_DECL (clause) = decl;
   OMP_CLAUSE_CHAIN (clause) = chain;
+  oacc_array_mapping_info *array_info;
   if (private_debug)
     OMP_CLAUSE_PRIVATE_DEBUG (clause) = 1;
   else if (code == OMP_CLAUSE_PRIVATE && (flags & GOVD_PRIVATE_OUTER_REF))
@@ -9540,6 +9909,76 @@
 	   && (flags & GOVD_WRITTEN) == 0
 	   && omp_shared_to_firstprivate_optimizable_decl_p (decl))
     OMP_CLAUSE_SHARED_READONLY (clause) = 1;
+  else if ((code == OMP_CLAUSE_MAP || code == OMP_CLAUSE_FIRSTPRIVATE)
+	   && (array_info = gomp_oacc_needs_data_present (decl)))
+    {
+      tree mapping = array_info->mapping;
+      tree pointer = array_info->pointer;
+      gomp_map_kind presence_kind = GOMP_MAP_FORCE_PRESENT;
+      bool no_alloc = (OMP_CLAUSE_CODE (mapping) == OMP_CLAUSE_MAP
+		       && OMP_CLAUSE_MAP_KIND (mapping) == GOMP_MAP_NO_ALLOC);
+
+      if (no_alloc || omp_is_optional_argument (decl))
+        presence_kind = GOMP_MAP_NO_ALLOC;
+
+      if (code == OMP_CLAUSE_FIRSTPRIVATE)
+	/* Oops, we have the wrong type of clause.  Rebuild it.  */
+	clause = build_omp_clause (OMP_CLAUSE_LOCATION (clause),
+				   OMP_CLAUSE_MAP);
+
+      OMP_CLAUSE_DECL (clause) = unshare_expr (OMP_CLAUSE_DECL (mapping));
+      OMP_CLAUSE_SET_MAP_KIND (clause, presence_kind);
+      OMP_CLAUSE_SIZE (clause) = unshare_expr (OMP_CLAUSE_SIZE (mapping));
+
+      /* Create a new data clause for the firstprivate pointer.  */
+      tree nc = build_omp_clause (OMP_CLAUSE_LOCATION (clause),
+				  OMP_CLAUSE_MAP);
+      OMP_CLAUSE_DECL (nc) = unshare_expr (OMP_CLAUSE_DECL (pointer));
+      OMP_CLAUSE_SET_MAP_KIND (nc, no_alloc ? GOMP_MAP_FIRSTPRIVATE_POINTER
+					    : GOMP_MAP_POINTER);
+
+      /* For GOMP_MAP_FIRSTPRIVATE_POINTER, this is a bias, not a size.  */
+      OMP_CLAUSE_SIZE (nc) = unshare_expr (OMP_CLAUSE_SIZE (pointer));
+
+      /* Create a new data clause for the PSET, if present.  */
+      tree psetc = NULL;
+      if (array_info->pset)
+	{
+	  tree pset = array_info->pset;
+	  psetc = build_omp_clause (OMP_CLAUSE_LOCATION (clause),
+				    OMP_CLAUSE_MAP);
+	  OMP_CLAUSE_DECL (psetc) = unshare_expr (OMP_CLAUSE_DECL (pset));
+	  OMP_CLAUSE_SIZE (psetc) = unshare_expr (OMP_CLAUSE_SIZE (pset));
+	  OMP_CLAUSE_SET_MAP_KIND (psetc, GOMP_MAP_TO_PSET);
+	  OMP_CLAUSE_CHAIN (psetc) = nc;
+	}
+
+      gimplify_omp_ctx *ctx = gimplify_omp_ctxp;
+      gimplify_omp_ctxp = ctx->outer_context;
+      gimplify_expr (&OMP_CLAUSE_DECL (clause), pre_p, NULL,
+		     is_gimple_lvalue, fb_lvalue);
+      gimplify_expr (&OMP_CLAUSE_SIZE (clause), pre_p, NULL,
+		     is_gimple_val, fb_rvalue);
+      gimplify_expr (&OMP_CLAUSE_SIZE (nc), pre_p, NULL, is_gimple_val,
+		     fb_rvalue);
+      gimplify_omp_ctxp = ctx;
+
+      OMP_CLAUSE_CHAIN (nc) = OMP_CLAUSE_CHAIN (clause);
+      OMP_CLAUSE_CHAIN (clause) = psetc ? psetc : nc;
+
+      if (array_info->ref)
+	{
+	  tree refc = build_omp_clause (OMP_CLAUSE_LOCATION (clause),
+					OMP_CLAUSE_MAP);
+	  OMP_CLAUSE_DECL (refc)
+	    = unshare_expr (OMP_CLAUSE_DECL (array_info->ref));
+	  OMP_CLAUSE_SIZE (refc)
+	    = unshare_expr (OMP_CLAUSE_SIZE (array_info->ref));
+	  OMP_CLAUSE_SET_MAP_KIND (refc, GOMP_MAP_POINTER);
+	  OMP_CLAUSE_CHAIN (refc) = OMP_CLAUSE_CHAIN (nc);
+	  OMP_CLAUSE_CHAIN (nc) = refc;
+	}
+    }
   else if (code == OMP_CLAUSE_FIRSTPRIVATE && (flags & GOVD_EXPLICIT) == 0)
     OMP_CLAUSE_FIRSTPRIVATE_IMPLICIT (clause) = 1;
   else if (code == OMP_CLAUSE_MAP && (flags & GOVD_MAP_0LEN_ARRAY) != 0)
@@ -9574,7 +10013,8 @@
 		       | GOVD_MAP_FORCE
 		       | GOVD_MAP_FORCE_PRESENT
 		       | GOVD_MAP_ALLOC_ONLY
-		       | GOVD_MAP_FROM_ONLY))
+		       | GOVD_MAP_FROM_ONLY
+		       | GOVD_DEVICEPTR))
 	{
 	case 0:
 	  kind = GOMP_MAP_TOFROM;
@@ -9597,6 +10037,9 @@
 	case GOVD_MAP_FORCE_PRESENT:
 	  kind = GOMP_MAP_FORCE_PRESENT;
 	  break;
+	case GOVD_DEVICEPTR:
+	  kind = GOMP_MAP_FORCE_DEVICEPTR;
+	  break;
 	default:
 	  gcc_unreachable ();
 	}
@@ -9665,7 +10108,21 @@
   *list_p = clause;
   struct gimplify_omp_ctx *ctx = gimplify_omp_ctxp;
   gimplify_omp_ctxp = ctx->outer_context;
+  gomp_map_kind kind = (code == OMP_CLAUSE_MAP) ? OMP_CLAUSE_MAP_KIND (clause)
+						: (gomp_map_kind) GOMP_MAP_LAST;
   lang_hooks.decls.omp_finish_clause (clause, pre_p);
+  /* Allow OpenACC to have implicit assumed-size arrays via FORCE_PRESENT,
+     which should work as long as the array has previously been mapped
+     explicitly on the target (e.g. by "enter data").  Raise an error for
+     OpenMP.  */
+  if (lang_GNU_Fortran ()
+      && code == OMP_CLAUSE_MAP
+      && (ctx->region_type & ORT_ACC) == 0
+      && kind == GOMP_MAP_TOFROM
+      && OMP_CLAUSE_MAP_KIND (clause) == GOMP_MAP_FORCE_PRESENT)
+    error_at (OMP_CLAUSE_LOCATION (clause),
+	      "implicit mapping of assumed size array %qD",
+	      OMP_CLAUSE_DECL (clause));
   if (gimplify_omp_ctxp)
     for (; clause != chain; clause = OMP_CLAUSE_CHAIN (clause))
       if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_MAP
@@ -9845,7 +10302,8 @@
 	  /* Data clauses associated with acc parallel reductions must be
 	     compatible with present_or_copy.  Warn and adjust the clause
 	     if that is not the case.  */
-	  if (ctx->region_type == ORT_ACC_PARALLEL)
+	  if (ctx->region_type == ORT_ACC_PARALLEL
+	      || ctx->region_type == ORT_ACC_SERIAL)
 	    {
 	      tree t = DECL_P (decl) ? decl : TREE_OPERAND (decl, 0);
 	      n = NULL;
@@ -10005,8 +10463,10 @@
 	case OMP_CLAUSE_TASK_REDUCTION:
 	  decl = OMP_CLAUSE_DECL (c);
 	  /* OpenACC reductions need a present_or_copy data clause.
-	     Add one if necessary.  Emit error when the reduction is private.  */
-	  if (ctx->region_type == ORT_ACC_PARALLEL)
+	     Add one if necessary.  Emit error when the reduction is
+	     private.  */
+	  if (DECL_P (decl) && (ctx->region_type == ORT_ACC_PARALLEL
+				|| ctx->region_type == ORT_ACC_SERIAL))
 	    {
 	      n = splay_tree_lookup (ctx->variables, (splay_tree_key) decl);
 	      if (n->value & (GOVD_PRIVATE | GOVD_FIRSTPRIVATE))
@@ -10084,6 +10544,7 @@
 	case OMP_CLAUSE_FINALIZE:
 	  break;
 
+	case OMP_CLAUSE_NOHOST:
 	default:
 	  gcc_unreachable ();
 	}
@@ -10376,6 +10837,95 @@
   return NULL_TREE;
 }
 
+/* Helper function for localize_reductions.  Replace all uses of REF_VAR with
+   LOCAL_VAR.  */
+
+static tree
+localize_reductions_r (tree *tp, int *walk_subtrees, void *data)
+{
+  enum tree_code tc = TREE_CODE (*tp);
+  struct privatize_reduction *pr = (struct privatize_reduction *) data;
+
+  if (TYPE_P (*tp))
+    *walk_subtrees = 0;
+
+  switch (tc)
+    {
+    case INDIRECT_REF:
+    case MEM_REF:
+      if (TREE_OPERAND (*tp, 0) == pr->ref_var)
+	*tp = pr->local_var;
+
+      *walk_subtrees = 0;
+      break;
+
+    case VAR_DECL:
+    case PARM_DECL:
+    case RESULT_DECL:
+      if (*tp == pr->ref_var)
+	*tp = pr->local_var;
+
+      *walk_subtrees = 0;
+      break;
+
+    default:
+      break;
+    }
+
+  return NULL_TREE;
+}
+
+/* OpenACC worker and vector loop state propagation requires reductions
+   to be inside local variables.  This function replaces all reference-type
+   reductions variables associated with the loop with a local copy.  It is
+   also used to create private copies of reduction variables for those
+   which are not associated with acc loops.  */
+
+static void
+localize_reductions (tree clauses, tree body)
+{
+  tree c, var, type, new_var;
+  struct privatize_reduction pr;
+
+  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
+      {
+	var = OMP_CLAUSE_DECL (c);
+
+	if (!lang_hooks.decls.omp_privatize_by_reference (var))
+	  {
+	    OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c) = NULL;
+	    continue;
+	  }
+
+	type = TREE_TYPE (TREE_TYPE (var));
+	new_var = create_tmp_var (type, IDENTIFIER_POINTER (DECL_NAME (var)));
+
+	pr.ref_var = var;
+	pr.local_var = new_var;
+
+	walk_tree (&body, localize_reductions_r, &pr, NULL);
+
+	OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c) = new_var;
+      }
+    else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_PRIVATE)
+      {
+	var = OMP_CLAUSE_DECL (c);
+
+	if (!lang_hooks.decls.omp_privatize_by_reference (var))
+	  continue;
+
+	type = TREE_TYPE (TREE_TYPE (var));
+	new_var = create_tmp_var (type, IDENTIFIER_POINTER (DECL_NAME (var)));
+
+	pr.ref_var = var;
+	pr.local_var = new_var;
+
+	walk_tree (&body, localize_reductions_r, &pr, NULL);
+      }
+}
+
+
 /* Gimplify the gross structure of an OMP_FOR statement.  */
 
 static enum gimplify_status
@@ -10577,6 +11127,24 @@
       gcc_unreachable ();
     }
 
+  if (ort == ORT_ACC)
+    {
+      gimplify_omp_ctx *outer = gimplify_omp_ctxp;
+
+      while (outer
+	     && outer->region_type != ORT_ACC_PARALLEL
+	     && outer->region_type != ORT_ACC_KERNELS)
+	outer = outer->outer_context;
+
+      /* FIXME: Reductions only work in parallel regions at present.  We avoid
+	 doing the reduction localization transformation in kernels regions
+	 here, because the code to remove reductions in kernels regions cannot
+	 handle that.  */
+      if (outer && outer->region_type == ORT_ACC_PARALLEL)
+	localize_reductions (OMP_FOR_CLAUSES (for_stmt),
+			     OMP_FOR_BODY (for_stmt));
+    }
+
   /* Set OMP_CLAUSE_LINEAR_NO_COPYIN flag on explicit linear
      clause for the IV.  */
   if (ort == ORT_SIMD && TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)) == 1)
@@ -11710,6 +12278,9 @@
     case OACC_PARALLEL:
       ort = ORT_ACC_PARALLEL;
       break;
+    case OACC_SERIAL:
+      ort = ORT_ACC_SERIAL;
+      break;
     case OACC_DATA:
       ort = ORT_ACC_DATA;
       break;
@@ -11739,6 +12310,11 @@
       || (ort & ORT_HOST_TEAMS) == ORT_HOST_TEAMS)
     {
       push_gimplify_context ();
+
+      /* FIXME: Reductions are not supported in kernels regions yet.  */
+      if (/*ort == ORT_ACC_KERNELS ||*/ ort == ORT_ACC_PARALLEL)
+        localize_reductions (OMP_CLAUSES (expr), OMP_BODY (expr));
+
       gimple *g = gimplify_and_return_first (OMP_BODY (expr), &body);
       if (gimple_code (g) == GIMPLE_BIND)
 	pop_gimplify_context (g);
@@ -11791,6 +12367,10 @@
       stmt = gimple_build_omp_target (body, GF_OMP_TARGET_KIND_OACC_PARALLEL,
 				      OMP_CLAUSES (expr));
       break;
+    case OACC_SERIAL:
+      stmt = gimple_build_omp_target (body, GF_OMP_TARGET_KIND_OACC_SERIAL,
+				      OMP_CLAUSES (expr));
+      break;
     case OMP_SECTIONS:
       stmt = gimple_build_omp_sections (body, OMP_CLAUSES (expr));
       break;
@@ -11880,8 +12460,9 @@
 	   && omp_find_clause (OMP_STANDALONE_CLAUSES (expr),
 			       OMP_CLAUSE_FINALIZE))
     {
-      /* Use GOMP_MAP_DELETE/GOMP_MAP_FORCE_FROM to denote that "finalize"
-	 semantics apply to all mappings of this OpenACC directive.  */
+      /* Use GOMP_MAP_DELETE, GOMP_MAP_FORCE_DETACH, and
+	 GOMP_MAP_FORCE_FROM to denote that "finalize" semantics apply
+	 to all mappings of this OpenACC directive.  */
       bool finalize_marked = false;
       for (tree c = OMP_STANDALONE_CLAUSES (expr); c; c = OMP_CLAUSE_CHAIN (c))
 	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP)
@@ -11895,10 +12476,19 @@
 	      OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_DELETE);
 	      finalize_marked = true;
 	      break;
+	    case GOMP_MAP_DETACH:
+	      OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_DETACH);
+	      finalize_marked = true;
+	      break;
+	    case GOMP_MAP_STRUCT:
+	    case GOMP_MAP_FORCE_PRESENT:
+	      /* Skip over an initial struct or force_present mapping.  */
+	      break;
 	    default:
-	      /* Check consistency: libgomp relies on the very first data
-		 mapping clause being marked, so make sure we did that before
-		 any other mapping clauses.  */
+	      /* Check consistency: libgomp relies on the very first
+		 non-struct, non-force-present data mapping clause being
+		 marked, so make sure we did that before any other mapping
+		 clauses.  */
 	      gcc_assert (finalize_marked);
 	      break;
 	    }
@@ -13010,6 +13600,7 @@
 	case OACC_DATA:
 	case OACC_KERNELS:
 	case OACC_PARALLEL:
+	case OACC_SERIAL:
 	case OMP_SECTIONS:
 	case OMP_SINGLE:
 	case OMP_TARGET:
@@ -13409,6 +14000,7 @@
 		  && code != TRY_FINALLY_EXPR
 		  && code != OACC_PARALLEL
 		  && code != OACC_KERNELS
+		  && code != OACC_SERIAL
 		  && code != OACC_DATA
 		  && code != OACC_HOST_DATA
 		  && code != OACC_DECLARE
diff --git a/gcc/hooks.c b/gcc/hooks.c
index f95659b..9a60ac5 100644
--- a/gcc/hooks.c
+++ b/gcc/hooks.c
@@ -431,6 +431,12 @@
 }
 
 tree
+hook_tree_tree_null (tree)
+{
+  return NULL;
+}
+
+tree
 hook_tree_tree_tree_null (tree, tree)
 {
   return NULL;
diff --git a/gcc/hooks.h b/gcc/hooks.h
index 0bc8117..dca6ec2 100644
--- a/gcc/hooks.h
+++ b/gcc/hooks.h
@@ -106,6 +106,7 @@
 extern tree hook_tree_const_tree_null (const_tree);
 extern tree hook_tree_void_null (void);
 
+extern tree hook_tree_tree_null (tree);
 extern tree hook_tree_tree_tree_null (tree, tree);
 extern tree hook_tree_tree_tree_tree_null (tree, tree, tree);
 extern tree hook_tree_tree_int_treep_bool_null (tree, int, tree *, bool);
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 95788df..13ef318 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2622,6 +2622,8 @@
       else
 	gcc_unreachable ();
       break;
+    case IFN_UNIQUE_OACC_PRIVATE:
+      break;
     }
 
   if (pattern)
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 7164ee5..a2810ed 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -36,7 +36,8 @@
 #define IFN_UNIQUE_CODES				  \
   DEF(UNSPEC),	\
     DEF(OACC_FORK), DEF(OACC_JOIN),		\
-    DEF(OACC_HEAD_MARK), DEF(OACC_TAIL_MARK)
+    DEF(OACC_HEAD_MARK), DEF(OACC_TAIL_MARK),	\
+    DEF(OACC_PRIVATE)
 
 enum ifn_unique_kind {
 #define DEF(X) IFN_UNIQUE_##X
diff --git a/gcc/ira.c b/gcc/ira.c
index b330f2a..205d817 100644
--- a/gcc/ira.c
+++ b/gcc/ira.c
@@ -516,7 +516,8 @@
 #endif
   COPY_HARD_REG_SET (no_unit_alloc_regs, fixed_nonglobal_reg_set);
   if (! use_hard_frame_p)
-    SET_HARD_REG_BIT (no_unit_alloc_regs, HARD_FRAME_POINTER_REGNUM);
+    add_to_hard_reg_set (&no_unit_alloc_regs, Pmode,
+			 HARD_FRAME_POINTER_REGNUM);
   setup_class_hard_regs ();
 }
 
@@ -2275,6 +2276,7 @@
 {
   int i;
   static const struct {const int from, to; } eliminables[] = ELIMINABLE_REGS;
+  int fp_reg_count = hard_regno_nregs (HARD_FRAME_POINTER_REGNUM, Pmode);
 
   /* Setup is_leaf as frame_pointer_required may use it.  This function
      is called by sched_init before ira if scheduling is enabled.  */
@@ -2303,7 +2305,8 @@
        frame pointer in LRA.  */
 
   if (frame_pointer_needed)
-    df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true);
+    for (i = 0; i < fp_reg_count; i++)
+      df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM + i, true);
     
   COPY_HARD_REG_SET (ira_no_alloc_regs, no_unit_alloc_regs);
   CLEAR_HARD_REG_SET (eliminable_regset);
@@ -2333,17 +2336,21 @@
     }
   if (!HARD_FRAME_POINTER_IS_FRAME_POINTER)
     {
-      if (!TEST_HARD_REG_BIT (crtl->asm_clobbers, HARD_FRAME_POINTER_REGNUM))
-	{
-	  SET_HARD_REG_BIT (eliminable_regset, HARD_FRAME_POINTER_REGNUM);
-	  if (frame_pointer_needed)
-	    SET_HARD_REG_BIT (ira_no_alloc_regs, HARD_FRAME_POINTER_REGNUM);
-	}
-      else if (frame_pointer_needed)
-	error ("%s cannot be used in asm here",
-	       reg_names[HARD_FRAME_POINTER_REGNUM]);
-      else
-	df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true);
+      for (i = 0; i < fp_reg_count; i++)
+	if (!TEST_HARD_REG_BIT (crtl->asm_clobbers,
+				HARD_FRAME_POINTER_REGNUM + i))
+	  {
+	    SET_HARD_REG_BIT (eliminable_regset,
+			      HARD_FRAME_POINTER_REGNUM + i);
+	    if (frame_pointer_needed)
+	      SET_HARD_REG_BIT (ira_no_alloc_regs,
+				HARD_FRAME_POINTER_REGNUM + i);
+	  }
+	else if (frame_pointer_needed)
+	  error ("%s cannot be used in asm here",
+		 reg_names[HARD_FRAME_POINTER_REGNUM + i]);
+	else
+	  df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM + i, true);
     }
 }
 
diff --git a/gcc/langhooks-def.h b/gcc/langhooks-def.h
index a059841..46ec9d1 100644
--- a/gcc/langhooks-def.h
+++ b/gcc/langhooks-def.h
@@ -188,6 +188,7 @@
 #define LANG_HOOKS_OMP_FIRSTPRIVATIZE_TYPE_SIZES \
   lhd_omp_firstprivatize_type_sizes
 #define LANG_HOOKS_OMP_MAPPABLE_TYPE	lhd_omp_mappable_type
+#define LANG_HOOKS_OMP_ARRAY_DATA	hook_tree_tree_null
 #define LANG_HOOKS_TYPE_HASH_EQ		NULL
 #define LANG_HOOKS_COPY_LANG_QUALIFIERS NULL
 #define LANG_HOOKS_GET_ARRAY_DESCR_INFO	NULL
@@ -214,6 +215,7 @@
   LANG_HOOKS_TYPE_MAX_SIZE, \
   LANG_HOOKS_OMP_FIRSTPRIVATIZE_TYPE_SIZES, \
   LANG_HOOKS_OMP_MAPPABLE_TYPE, \
+  LANG_HOOKS_OMP_ARRAY_DATA, \
   LANG_HOOKS_TYPE_HASH_EQ, \
   LANG_HOOKS_COPY_LANG_QUALIFIERS, \
   LANG_HOOKS_GET_ARRAY_DESCR_INFO, \
diff --git a/gcc/langhooks.h b/gcc/langhooks.h
index a45579b..f7fa5f3 100644
--- a/gcc/langhooks.h
+++ b/gcc/langhooks.h
@@ -117,6 +117,10 @@
   /* Return true if TYPE is a mappable type.  */
   bool (*omp_mappable_type) (tree type);
 
+  /* Return a tree for of the actual data of an array descriptor - or
+     NULL_TREE if original tree is not an array descriptor.  */
+  tree (*omp_array_data) (tree);
+
   /* Return TRUE if TYPE1 and TYPE2 are identical for type hashing purposes.
      Called only after doing all language independent checks.
      At present, this function is only called when both TYPE1 and TYPE2 are
diff --git a/gcc/lra-spills.c b/gcc/lra-spills.c
index 7a49c07..4b68223 100644
--- a/gcc/lra-spills.c
+++ b/gcc/lra-spills.c
@@ -283,6 +283,9 @@
       for (k = 0; k < spill_class_size; k++)
 	{
 	  hard_regno = ira_class_hard_regs[spill_class][k];
+	  if (TEST_HARD_REG_BIT (eliminable_regset, hard_regno)
+	      || !targetm.hard_regno_mode_ok (hard_regno, mode))
+	    continue;
 	  if (! overlaps_hard_reg_set_p (conflict_hard_regs, mode, hard_regno))
 	    break;
 	}
diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def
index 9961c28..a8f10e33 100644
--- a/gcc/omp-builtins.def
+++ b/gcc/omp-builtins.def
@@ -73,6 +73,8 @@
 		  BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_BARRIER_CANCEL, "GOMP_barrier_cancel",
 		  BT_FN_BOOL, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_BARRIER, "GOACC_barrier",
+		   BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKWAIT, "GOMP_taskwait",
 		  BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKWAIT_DEPEND, "GOMP_taskwait_depend",
@@ -410,6 +412,12 @@
 		  BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SINGLE_COPY_END, "GOMP_single_copy_end",
 		  BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_SINGLE_START, "GOACC_single_start",
+		   BT_FN_BOOL, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_SINGLE_COPY_START, "GOACC_single_copy_start",
+		   BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_SINGLE_COPY_END, "GOACC_single_copy_end",
+		   BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_OFFLOAD_REGISTER, "GOMP_offload_register_ver",
 		  BT_FN_VOID_UINT_PTR_INT_PTR, ATTR_NOTHROW_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_OFFLOAD_UNREGISTER,
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 7415973..25dcd63 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -103,6 +103,9 @@
   /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has
      a depend clause.  */
   gomp_ordered *ord_stmt;
+
+  /* True if this is nested inside an OpenACC kernels construct.  */
+  bool inside_kernels_p;
 };
 
 static struct omp_region *root_omp_region;
@@ -2645,6 +2648,7 @@
   gassign *assign_stmt;
   bool in_combined_parallel = is_combined_parallel (region);
   bool broken_loop = region->cont == NULL;
+  bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE);
   edge e, ne;
   tree *counts = NULL;
   int i;
@@ -2766,8 +2770,12 @@
   type = TREE_TYPE (fd->loop.v);
   istart0 = create_tmp_var (fd->iter_type, ".istart0");
   iend0 = create_tmp_var (fd->iter_type, ".iend0");
-  TREE_ADDRESSABLE (istart0) = 1;
-  TREE_ADDRESSABLE (iend0) = 1;
+
+  if (!seq_loop)
+    {
+      TREE_ADDRESSABLE (istart0) = 1;
+      TREE_ADDRESSABLE (iend0) = 1;
+    }
 
   /* See if we need to bias by LLONG_MIN.  */
   if (fd->iter_type == long_long_unsigned_type_node
@@ -2797,7 +2805,25 @@
   gsi_prev (&gsif);
 
   tree arr = NULL_TREE;
-  if (in_combined_parallel)
+  if (seq_loop)
+    {
+      tree n1 = fold_convert (fd->iter_type, fd->loop.n1);
+      tree n2 = fold_convert (fd->iter_type, fd->loop.n2);
+
+      n1 = force_gimple_operand_gsi_1 (&gsi, n1, is_gimple_reg, NULL_TREE, true,
+				       GSI_SAME_STMT);
+      n2 = force_gimple_operand_gsi_1 (&gsi, n2, is_gimple_reg, NULL_TREE, true,
+				       GSI_SAME_STMT);
+
+      assign_stmt = gimple_build_assign (istart0, n1);
+      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+      assign_stmt = gimple_build_assign (iend0, n2);
+      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+      t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0);
+    }
+  else if (in_combined_parallel)
     {
       gcc_assert (fd->ordered == 0);
       /* In a combined parallel loop, emit a call to
@@ -3246,48 +3272,55 @@
 	collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb);
 
       /* Emit code to get the next parallel iteration in L2_BB.  */
-      gsi = gsi_start_bb (l2_bb);
+      if (!seq_loop)
+        {
+	  gsi = gsi_start_bb (l2_bb);
 
-      t = build_call_expr (builtin_decl_explicit (next_fn), 2,
-			   build_fold_addr_expr (istart0),
-			   build_fold_addr_expr (iend0));
-      t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
-				    false, GSI_CONTINUE_LINKING);
-      if (TREE_TYPE (t) != boolean_type_node)
-	t = fold_build2 (NE_EXPR, boolean_type_node,
-			 t, build_int_cst (TREE_TYPE (t), 0));
-      gcond *cond_stmt = gimple_build_cond_empty (t);
-      gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+	  t = build_call_expr (builtin_decl_explicit (next_fn), 2,
+			       build_fold_addr_expr (istart0),
+			       build_fold_addr_expr (iend0));
+	  t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+					false, GSI_CONTINUE_LINKING);
+	  if (TREE_TYPE (t) != boolean_type_node)
+	    t = fold_build2 (NE_EXPR, boolean_type_node,
+			     t, build_int_cst (TREE_TYPE (t), 0));
+	  gcond *cond_stmt = gimple_build_cond_empty (t);
+	  gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+	}
     }
 
   /* Add the loop cleanup function.  */
   gsi = gsi_last_nondebug_bb (exit_bb);
-  if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
-  else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
-  else
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
-  gcall *call_stmt = gimple_build_call (t, 0);
-  if (fd->ordered)
+  if (!seq_loop)
     {
-      tree arr = counts[fd->ordered];
-      tree clobber = build_constructor (TREE_TYPE (arr), NULL);
-      TREE_THIS_VOLATILE (clobber) = 1;
-      gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
-			GSI_SAME_STMT);
-    }
-  if (gimple_omp_return_lhs (gsi_stmt (gsi)))
-    {
-      gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
-      if (fd->have_reductemp)
+      if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
+	t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
+      else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+	t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
+      else
+	t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
+      gcall *call_stmt = gimple_build_call (t, 0);
+      if (fd->ordered)
 	{
-	  gimple *g = gimple_build_assign (reductions, NOP_EXPR,
-					   gimple_call_lhs (call_stmt));
-	  gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+	  tree arr = counts[fd->ordered];
+	  tree clobber = build_constructor (TREE_TYPE (arr), NULL);
+	  TREE_THIS_VOLATILE (clobber) = 1;
+	  gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
+			    GSI_SAME_STMT);
 	}
+      if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+	{
+	  gimple_call_set_lhs (call_stmt,
+			       gimple_omp_return_lhs (gsi_stmt (gsi)));
+	  if (fd->have_reductemp)
+	    {
+	      gimple *g = gimple_build_assign (reductions, NOP_EXPR,
+					       gimple_call_lhs (call_stmt));
+	      gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+	    }
+	}
+      gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
     }
-  gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
   gsi_remove (&gsi, true);
 
   /* Connect the new blocks.  */
@@ -3299,7 +3332,8 @@
       gimple_seq phis;
 
       e = find_edge (cont_bb, l3_bb);
-      ne = make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE);
+      ne = make_edge (l2_bb, l3_bb,
+		      seq_loop ? EDGE_FALLTHRU : EDGE_FALSE_VALUE);
 
       phis = phi_nodes (l3_bb);
       for (gsi = gsi_start (phis); !gsi_end_p (gsi); gsi_next (&gsi))
@@ -3339,7 +3373,8 @@
 	  e = find_edge (cont_bb, l2_bb);
 	  e->flags = EDGE_FALLTHRU;
 	}
-      make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
+      if (!seq_loop)
+	make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
 
       if (gimple_in_ssa_p (cfun))
 	{
@@ -3398,12 +3433,16 @@
 
       add_bb_to_loop (l2_bb, outer_loop);
 
-      /* We've added a new loop around the original loop.  Allocate the
-	 corresponding loop struct.  */
-      struct loop *new_loop = alloc_loop ();
-      new_loop->header = l0_bb;
-      new_loop->latch = l2_bb;
-      add_loop (new_loop, outer_loop);
+      struct loop *new_loop = NULL;
+      if (!seq_loop)
+	{
+	  /* We've added a new loop around the original loop.  Allocate the
+	     corresponding loop struct.  */
+	  new_loop = alloc_loop ();
+	  new_loop->header = l0_bb;
+	  new_loop->latch = l2_bb;
+	  add_loop (new_loop, outer_loop);
+	}
 
       /* Allocate a loop structure for the original loop unless we already
 	 had one.  */
@@ -3413,7 +3452,8 @@
 	  struct loop *orig_loop = alloc_loop ();
 	  orig_loop->header = l1_bb;
 	  /* The loop may have multiple latches.  */
-	  add_loop (orig_loop, new_loop);
+	  add_loop (orig_loop,
+		    new_loop != NULL ? new_loop : outer_loop);
 	}
     }
 }
@@ -5957,7 +5997,10 @@
        original loops from being detected.  Fix that up.  */
     loops_state_set (LOOPS_NEED_FIXUP);
 
-  if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
+  if (region->inside_kernels_p)
+    expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+			    NULL_TREE, inner_stmt);
+  else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
     expand_omp_simd (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
     {
@@ -7276,19 +7319,26 @@
   gomp_target *entry_stmt;
   gimple *stmt;
   edge e;
-  bool offloaded, data_region;
+  bool offloaded, data_region, oacc_explode_args;
 
   entry_stmt = as_a <gomp_target *> (last_stmt (region->entry));
   new_bb = region->entry;
+  oacc_explode_args = false;
 
   offloaded = is_gimple_omp_offloaded (entry_stmt);
   switch (gimple_omp_target_kind (entry_stmt))
     {
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL:
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+      if (targetm.goacc.explode_args ())
+	oacc_explode_args = true;
+      gcc_fallthrough ();
     case GF_OMP_TARGET_KIND_REGION:
     case GF_OMP_TARGET_KIND_UPDATE:
     case GF_OMP_TARGET_KIND_ENTER_DATA:
     case GF_OMP_TARGET_KIND_EXIT_DATA:
-    case GF_OMP_TARGET_KIND_OACC_PARALLEL:
     case GF_OMP_TARGET_KIND_OACC_KERNELS:
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
     case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
@@ -7298,6 +7348,7 @@
     case GF_OMP_TARGET_KIND_DATA:
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       data_region = true;
       break;
     default:
@@ -7320,16 +7371,35 @@
   entry_bb = region->entry;
   exit_bb = region->exit;
 
-  if (gimple_omp_target_kind (entry_stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
+  /* Further down, all OpenACC compute constructs will be mapped to
+     BUILT_IN_GOACC_PARALLEL, and to distinguish between them, we now attach
+     attributes.  */
+  switch (gimple_omp_target_kind (entry_stmt))
     {
+    case GF_OMP_TARGET_KIND_OACC_KERNELS:
       mark_loops_in_oacc_kernels_region (region->entry, region->exit);
 
-      /* Further down, both OpenACC kernels and OpenACC parallel constructs
-	 will be mappted to BUILT_IN_GOACC_PARALLEL, and to distinguish the
-	 two, there is an "oacc kernels" attribute set for OpenACC kernels.  */
       DECL_ATTRIBUTES (child_fn)
 	= tree_cons (get_identifier ("oacc kernels"),
 		     NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
+      DECL_ATTRIBUTES (child_fn)
+	= tree_cons (get_identifier ("oacc serial"),
+		     NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+      DECL_ATTRIBUTES (child_fn)
+	= tree_cons (get_identifier ("oacc parallel_kernels_parallelized"),
+		     NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+      DECL_ATTRIBUTES (child_fn)
+	= tree_cons (get_identifier ("oacc parallel_kernels_gang_single"),
+		     NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    default:
+      break;
     }
 
   if (offloaded)
@@ -7350,7 +7420,7 @@
 	 .OMP_DATA_I may have been converted into a different local
 	 variable.  In which case, we need to keep the assignment.  */
       tree data_arg = gimple_omp_target_data_arg (entry_stmt);
-      if (data_arg)
+      if (data_arg && !oacc_explode_args)
 	{
 	  basic_block entry_succ_bb = single_succ (entry_bb);
 	  gimple_stmt_iterator gsi;
@@ -7534,10 +7604,14 @@
       break;
     case GF_OMP_TARGET_KIND_OACC_KERNELS:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL:
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
       start_ix = BUILT_IN_GOACC_PARALLEL;
       break;
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       start_ix = BUILT_IN_GOACC_DATA_START;
       break;
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
@@ -7561,7 +7635,17 @@
   if (is_gimple_omp_oacc (entry_stmt))
     {
       /* By default, no GOACC_FLAGs are set.  */
-      goacc_flags = integer_zero_node;
+      int goacc_flags_i = 0;
+
+      if (start_ix != BUILT_IN_GOACC_UPDATE
+	  && omp_find_clause (clauses, OMP_CLAUSE_IF_PRESENT))
+	{
+	  gcc_checking_assert (gimple_omp_target_kind (entry_stmt)
+			       == GF_OMP_TARGET_KIND_OACC_HOST_DATA);
+	  goacc_flags_i |= GOACC_FLAG_HOST_DATA_IF_PRESENT;
+	}
+
+      goacc_flags = build_int_cst (integer_type_node, goacc_flags_i);
     }
   else
     {
@@ -7728,7 +7812,18 @@
 	args.quick_push (get_target_arguments (&gsi, entry_stmt));
       break;
     case BUILT_IN_GOACC_PARALLEL:
-      oacc_set_fn_attrib (child_fn, clauses, &args);
+      if (lookup_attribute ("oacc serial", DECL_ATTRIBUTES (child_fn)) != NULL)
+	{
+	  tree dims = NULL_TREE;
+	  unsigned int ix;
+
+	  /* For serial constructs we set all dimensions to 1.  */
+	  for (ix = GOMP_DIM_MAX; ix--;)
+	    dims = tree_cons (NULL_TREE, integer_one_node, dims);
+	  oacc_replace_fn_attrib (child_fn, dims);
+	}
+      else
+	oacc_set_fn_attrib (child_fn, clauses, &args);
       tagging = true;
       /* FALLTHRU */
     case BUILT_IN_GOACC_ENTER_EXIT_DATA:
@@ -7799,6 +7894,10 @@
 				    unsigned_type_node, len);
 	    args[t_wait_idx] = len;
 	  }
+
+	if (tagging && oacc_explode_args)
+	  args.safe_push (oacc_launch_pack (GOMP_LAUNCH_ARGS_EXPLODED,
+					    NULL_TREE, 0));
       }
       break;
     default:
@@ -8142,7 +8241,19 @@
       if (region->type == GIMPLE_OMP_PARALLEL)
 	determine_parallel_type (region);
       else if (region->type == GIMPLE_OMP_TARGET)
-	grid_expand_target_grid_body (region);
+	{
+	  grid_expand_target_grid_body (region);
+
+	  if (region->inner)
+	    {
+	      gomp_target *entry
+		= as_a <gomp_target *> (last_stmt (region->entry));
+	      if (region->inside_kernels_p
+		  || (gimple_omp_target_kind (entry)
+		      == GF_OMP_TARGET_KIND_OACC_KERNELS))
+		region->inner->inside_kernels_p = true;
+	    }
+	}
 
       if (region->type == GIMPLE_OMP_FOR
 	  && gimple_omp_for_combined_p (last_stmt (region->entry)))
@@ -8289,8 +8400,12 @@
 		case GF_OMP_TARGET_KIND_DATA:
 		case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 		case GF_OMP_TARGET_KIND_OACC_KERNELS:
+		case GF_OMP_TARGET_KIND_OACC_SERIAL:
 		case GF_OMP_TARGET_KIND_OACC_DATA:
 		case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+		case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+		case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+		case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
 		  break;
 		case GF_OMP_TARGET_KIND_UPDATE:
 		case GF_OMP_TARGET_KIND_ENTER_DATA:
@@ -8543,8 +8658,12 @@
 	case GF_OMP_TARGET_KIND_DATA:
 	case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 	case GF_OMP_TARGET_KIND_OACC_KERNELS:
+	case GF_OMP_TARGET_KIND_OACC_SERIAL:
 	case GF_OMP_TARGET_KIND_OACC_DATA:
 	case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
 	  break;
 	case GF_OMP_TARGET_KIND_UPDATE:
 	case GF_OMP_TARGET_KIND_ENTER_DATA:
diff --git a/gcc/omp-general.c b/gcc/omp-general.c
index 356772f..bb704bf 100644
--- a/gcc/omp-general.c
+++ b/gcc/omp-general.c
@@ -48,6 +48,22 @@
   return NULL_TREE;
 }
 
+/* Return true if DECL is a Fortran optional argument.  */
+
+bool
+omp_is_optional_argument (tree decl)
+{
+  /* A passed-by-reference Fortran optional argument is similar to
+     a normal argument, but since it can be null the type is a
+     POINTER_TYPE rather than a REFERENCE_TYPE.  However, for
+     optional + value, de-referencing gives 'void' which is invalid.  */
+  return lang_GNU_Fortran ()
+	 && TREE_CODE (decl) == PARM_DECL
+	 && DECL_BY_REFERENCE (decl)
+	 && TREE_CODE (TREE_TYPE (decl)) == POINTER_TYPE
+	 && !VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl)));
+}
+
 /* Return true if DECL is a reference type.  */
 
 bool
@@ -608,9 +624,64 @@
     }
 }
 
-/*  Process the routine's dimension clauess to generate an attribute
-    value.  Issue diagnostics as appropriate.  We default to SEQ
-    (OpenACC 2.5 clarifies this). All dimensions have a size of zero
+/* Verify OpenACC routine clauses.
+
+   The chain of clauses returned will contain exactly one clause specifying the
+   level of parallelism.  */
+
+void
+oacc_verify_routine_clauses (tree *clauses, location_t loc)
+{
+  tree c_level = NULL_TREE;
+  tree c_p = NULL_TREE;
+  for (tree c = *clauses; c; c_p = c, c = OMP_CLAUSE_CHAIN (c))
+    switch (OMP_CLAUSE_CODE (c))
+      {
+      case OMP_CLAUSE_GANG:
+      case OMP_CLAUSE_WORKER:
+      case OMP_CLAUSE_VECTOR:
+      case OMP_CLAUSE_SEQ:
+	if (c_level == NULL_TREE)
+	  c_level = c;
+	else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_CODE (c_level))
+	  {
+	    /* This has already been diagnosed in the front ends.  */
+	    /* Drop the duplicate clause.  */
+	    gcc_checking_assert (c_p != NULL_TREE);
+	    OMP_CLAUSE_CHAIN (c_p) = OMP_CLAUSE_CHAIN (c);
+	    c = c_p;
+	  }
+	else
+	  {
+	    error_at (OMP_CLAUSE_LOCATION (c),
+		      "%qs specifies a conflicting level of parallelism",
+		      omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
+	    inform (OMP_CLAUSE_LOCATION (c_level),
+		    "... to the previous %qs clause here",
+		    omp_clause_code_name[OMP_CLAUSE_CODE (c_level)]);
+	    /* Drop the conflicting clause.  */
+	    gcc_checking_assert (c_p != NULL_TREE);
+	    OMP_CLAUSE_CHAIN (c_p) = OMP_CLAUSE_CHAIN (c);
+	    c = c_p;
+	  }
+	break;
+      case OMP_CLAUSE_NOHOST:
+	break;
+      default:
+	gcc_unreachable ();
+      }
+  if (c_level == NULL_TREE)
+    {
+      /* OpenACC 2.5 makes this an error; for the current OpenACC 2.0a
+	 implementation add an implicit "seq" clause.  */
+      c_level = build_omp_clause (loc, OMP_CLAUSE_SEQ);
+      OMP_CLAUSE_CHAIN (c_level) = *clauses;
+      *clauses = c_level;
+    }
+}
+
+/*  Process the OpenACC routine's clauses to generate an attribute
+    for the level of parallelism.  All dimensions have a size of zero
     (dynamic).  TREE_PURPOSE is set to indicate whether that dimension
     can have a loop partitioned on it.  non-zero indicates
     yes, zero indicates no.  By construction once a non-zero has been
@@ -632,16 +703,10 @@
     for (ix = GOMP_DIM_MAX + 1; ix--;)
       if (OMP_CLAUSE_CODE (clauses) == ids[ix])
 	{
-	  if (level >= 0)
-	    error_at (OMP_CLAUSE_LOCATION (clauses),
-		      "multiple loop axes specified for routine");
 	  level = ix;
 	  break;
 	}
-
-  /* Default to SEQ.  */
-  if (level < 0)
-    level = GOMP_DIM_MAX;
+  gcc_checking_assert (level >= 0);
 
   tree dims = NULL_TREE;
 
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index 60faa52..0bd0ade 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -32,9 +32,10 @@
   OLF_INDEPENDENT = 1u << 2,	/* Iterations are known independent.  */
   OLF_GANG_STATIC = 1u << 3,	/* Gang partitioning is static (has op). */
   OLF_TILE	= 1u << 4,	/* Tiled loop. */
-  
+  OLF_REDUCTION = 1u << 5,	/* Reduction loop.  */
+
   /* Explicitly specified loop axes.  */
-  OLF_DIM_BASE = 5,
+  OLF_DIM_BASE = 6,
   OLF_DIM_GANG   = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG),
   OLF_DIM_WORKER = 1u << (OLF_DIM_BASE + GOMP_DIM_WORKER),
   OLF_DIM_VECTOR = 1u << (OLF_DIM_BASE + GOMP_DIM_VECTOR),
@@ -71,6 +72,7 @@
 #define OACC_FN_ATTRIB "oacc function"
 
 extern tree omp_find_clause (tree clauses, enum omp_clause_code kind);
+extern bool omp_is_optional_argument (tree decl);
 extern bool omp_is_reference (tree decl);
 extern void omp_adjust_for_condition (location_t loc, enum tree_code *cond_code,
 				      tree *n2, tree v, tree step);
@@ -84,6 +86,7 @@
 extern tree oacc_replace_fn_attrib_attr (tree attribs, tree dims);
 extern void oacc_replace_fn_attrib (tree fn, tree dims);
 extern void oacc_set_fn_attrib (tree fn, tree clauses, vec<tree> *args);
+extern void oacc_verify_routine_clauses (tree *, location_t);
 extern tree oacc_build_routine_dims (tree clauses);
 extern tree oacc_get_fn_attrib (tree fn);
 extern bool offloading_function_p (tree fn);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 813cefd..b987ceb 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -60,6 +60,7 @@
 #include "hsa-common.h"
 #include "stringpool.h"
 #include "attribs.h"
+#include "tree-hash-traits.h"
 
 /* Lowering of OMP parallel and workshare constructs proceeds in two
    phases.  The first phase scans the function looking for OMP statements
@@ -90,6 +91,9 @@
   /* Map variables to fields in a structure that allows communication
      between sending and receiving threads.  */
   splay_tree field_map;
+  splay_tree array_data_map;
+  splay_tree present_map;
+  splay_tree parm_map;
   tree record_type;
   tree sender_decl;
   tree receiver_decl;
@@ -133,6 +137,18 @@
 
   /* True if this construct can be cancelled.  */
   bool cancellable;
+
+  /* Hash map of dynamic arrays in this context.  */
+  hash_map<tree_operand_hash, tree> *dynamic_arrays;
+
+  /* A tree_list of the reduction clauses in this context.  */
+  tree local_reduction_clauses;
+
+  /* A tree_list of the reduction clauses in outer contexts.  */
+  tree outer_reduction_clauses;
+
+  /* Addressable variable decls in this context.  */
+  vec<tree> *oacc_addressable_var_decls;
 };
 
 static splay_tree all_contexts;
@@ -155,15 +171,21 @@
       *handled_ops_p = false; \
       break;
 
-/* Return true if CTX corresponds to an oacc parallel region.  */
+/* Return true if CTX corresponds to an oacc parallel or serial region.  */
 
 static bool
-is_oacc_parallel (omp_context *ctx)
+is_oacc_parallel_or_serial (omp_context *ctx)
 {
   enum gimple_code outer_type = gimple_code (ctx->stmt);
   return ((outer_type == GIMPLE_OMP_TARGET)
-	  && (gimple_omp_target_kind (ctx->stmt)
-	      == GF_OMP_TARGET_KIND_OACC_PARALLEL));
+	  && ((gimple_omp_target_kind (ctx->stmt)
+	       == GF_OMP_TARGET_KIND_OACC_PARALLEL)
+	      || (gimple_omp_target_kind (ctx->stmt)
+		  == GF_OMP_TARGET_KIND_OACC_SERIAL)
+	      || (gimple_omp_target_kind (ctx->stmt)
+		  == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+	      || (gimple_omp_target_kind (ctx->stmt)
+		  == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)));
 }
 
 /* Return true if CTX corresponds to an oacc kernels region.  */
@@ -177,6 +199,22 @@
 	      == GF_OMP_TARGET_KIND_OACC_KERNELS));
 }
 
+/* Return true if CTX corresponds to an oacc region that was generated from
+   an original kernels region that has been lowered to parallel regions.  */
+
+static bool
+was_originally_oacc_kernels (omp_context *ctx)
+{
+  enum gimple_code outer_type = gimple_code (ctx->stmt);
+  return ((outer_type == GIMPLE_OMP_TARGET)
+	  && ((gimple_omp_target_kind (ctx->stmt)
+	       == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+	      || (gimple_omp_target_kind (ctx->stmt)
+		  == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+	      || (gimple_omp_target_kind (ctx->stmt)
+		  == GF_OMP_TARGET_KIND_OACC_DATA_KERNELS)));
+}
+
 /* If DECL is the artificial dummy VAR_DECL created for non-static
    data member privatization, return the underlying "this" parameter,
    otherwise return NULL.  */
@@ -336,6 +374,14 @@
 }
 
 static inline tree
+lookup_parm (const_tree var, omp_context *ctx)
+{
+  splay_tree_node n;
+  n = splay_tree_lookup (ctx->parm_map, (splay_tree_key) var);
+  return (tree) n->value;
+}
+
+static inline tree
 lookup_field (tree var, omp_context *ctx)
 {
   splay_tree_node n;
@@ -549,15 +595,21 @@
 {
   tree x, field = lookup_field (var, ctx);
 
-  /* If the receiver record type was remapped in the child function,
-     remap the field into the new record type.  */
-  x = maybe_lookup_field (field, ctx);
-  if (x != NULL)
-    field = x;
+  if (is_oacc_parallel_or_serial (ctx) && targetm.goacc.explode_args ())
+    x = lookup_parm (var, ctx);
+  else
+    {
+      /* If the receiver record type was remapped in the child function,
+	 remap the field into the new record type.  */
+      x = maybe_lookup_field (field, ctx);
+      if (x != NULL)
+	field = x;
 
-  x = build_simple_mem_ref (ctx->receiver_decl);
-  TREE_THIS_NOTRAP (x) = 1;
-  x = omp_build_component_ref (x, field);
+      x = build_simple_mem_ref (ctx->receiver_decl);
+      TREE_THIS_NOTRAP (x) = 1;
+      x = omp_build_component_ref (x, field);
+    }
+
   if (by_ref)
     {
       x = build_simple_mem_ref (x);
@@ -694,16 +746,45 @@
   return build_sender_ref ((splay_tree_key) var, ctx);
 }
 
+static void
+install_parm_decl (tree var, tree type, omp_context *ctx)
+{
+  if (!is_oacc_parallel_or_serial (ctx) || !targetm.goacc.explode_args ())
+    return;
+
+  splay_tree_key key = (splay_tree_key) var;
+  tree decl_name = NULL_TREE, t;
+  location_t loc = UNKNOWN_LOCATION;
+
+  if (DECL_P (var) && !DECL_ARTIFICIAL (var))
+    {
+      decl_name = get_identifier (get_name (var));
+      loc = DECL_SOURCE_LOCATION (var);
+    }
+  t = build_decl (loc, PARM_DECL, decl_name, type);
+  DECL_ARTIFICIAL (t) = 1;
+  DECL_NAMELESS (t) = 1;
+  DECL_ARG_TYPE (t) = type;
+  DECL_CONTEXT (t) = current_function_decl;
+  TREE_USED (t) = 1;
+  TREE_READONLY (t) = 1;
+
+  splay_tree_insert (ctx->parm_map, key, (splay_tree_value) t);
+}
+
 /* Add a new field for VAR inside the structure CTX->SENDER_DECL.  If
    BASE_POINTERS_RESTRICT, declare the field with restrict.  */
 
 static void
-install_var_field (tree var, bool by_ref, int mask, omp_context *ctx)
+install_var_field (tree var, bool by_ref, int mask, omp_context *ctx,
+		   bool base_pointers_restrict = false)
 {
   tree field, type, sfield = NULL_TREE;
   splay_tree_key key = (splay_tree_key) var;
 
-  if ((mask & 8) != 0)
+  if ((mask & 16) != 0)
+    key = (splay_tree_key) var;
+  else if ((mask & 8) != 0)
     {
       key = (splay_tree_key) &DECL_UID (var);
       gcc_checking_assert (key != (splay_tree_key) var);
@@ -729,18 +810,25 @@
       type = build_pointer_type (build_pointer_type (type));
     }
   else if (by_ref)
-    type = build_pointer_type (type);
+    {
+      type = build_pointer_type (type);
+      if (base_pointers_restrict)
+        type = build_qualified_type (type, TYPE_QUAL_RESTRICT);
+    }
   else if ((mask & 3) == 1 && omp_is_reference (var))
     type = TREE_TYPE (type);
 
-  field = build_decl (DECL_SOURCE_LOCATION (var),
-		      FIELD_DECL, DECL_NAME (var), type);
+  if ((mask & 16) != 0)
+    field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, NULL_TREE, type);
+  else
+    field = build_decl (DECL_SOURCE_LOCATION (var),
+			FIELD_DECL, DECL_NAME (var), type);
 
   /* Remember what variable this field was created for.  This does have a
      side effect of making dwarf2out ignore this member, so for helpful
      debugging we clear it later in delete_omp_context.  */
   DECL_ABSTRACT_ORIGIN (field) = var;
-  if (type == TREE_TYPE (var))
+  if ((mask & 16) == 0 && type == TREE_TYPE (var))
     {
       SET_DECL_ALIGN (field, DECL_ALIGN (var));
       DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
@@ -788,7 +876,10 @@
     }
 
   if (mask & 1)
-    splay_tree_insert (ctx->field_map, key, (splay_tree_value) field);
+    {
+      splay_tree_insert (ctx->field_map, key, (splay_tree_value) field);
+      install_parm_decl (var, type, ctx);
+    }
   if ((mask & 2) && ctx->sfield_map)
     splay_tree_insert (ctx->sfield_map, key, (splay_tree_value) sfield);
 }
@@ -873,6 +964,136 @@
   return error_mark_node;
 }
 
+/* Helper function for create_dynamic_array_descr_type(), to append a new field
+   to a record type.  */
+
+static void
+append_field_to_record_type (tree record_type, tree fld_ident, tree fld_type)
+{
+  tree *p, fld = build_decl (UNKNOWN_LOCATION, FIELD_DECL, fld_ident, fld_type);
+  DECL_CONTEXT (fld) = record_type;
+
+  for (p = &TYPE_FIELDS (record_type); *p; p = &DECL_CHAIN (*p))
+    ;
+  *p = fld;
+}
+
+/* Create type for dynamic array descriptor. Returns created type, and
+   returns the number of dimensions in *DIM_NUM.  */
+
+static tree
+create_dynamic_array_descr_type (tree decl, tree dims, int *dim_num)
+{
+  int n = 0;
+  tree da_descr_type, name, x;
+  gcc_assert (TREE_CODE (dims) == TREE_LIST);
+
+  da_descr_type = lang_hooks.types.make_type (RECORD_TYPE);
+  name = create_tmp_var_name (".omp_dynamic_array_descr_type");
+  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, da_descr_type);
+  DECL_ARTIFICIAL (name) = 1;
+  DECL_NAMELESS (name) = 1;
+  TYPE_NAME (da_descr_type) = name;
+  TYPE_ARTIFICIAL (da_descr_type) = 1;
+
+  /* Main starting pointer/array.  */
+  tree main_var_type = TREE_TYPE (decl);
+  if (TREE_CODE (main_var_type) == REFERENCE_TYPE)
+    main_var_type = TREE_TYPE (main_var_type);
+  append_field_to_record_type (da_descr_type, DECL_NAME (decl),
+			       (TREE_CODE (TREE_TYPE (decl)) == POINTER_TYPE
+				? main_var_type
+				: build_pointer_type (main_var_type)));
+  /* Number of dimensions.  */
+  append_field_to_record_type (da_descr_type, get_identifier ("$dim_num"),
+			       sizetype);
+
+  for (x = dims; x; x = TREE_CHAIN (x), n++)
+    {
+      char *fldname;
+      /* One for the start index.  */
+      ASM_FORMAT_PRIVATE_NAME (fldname, "$dim_base", n);
+      append_field_to_record_type (da_descr_type, get_identifier (fldname),
+				   sizetype);
+      /* One for the length.  */
+      ASM_FORMAT_PRIVATE_NAME (fldname, "$dim_length", n);
+      append_field_to_record_type (da_descr_type, get_identifier (fldname),
+				   sizetype);
+      /* One for the element size.  */
+      ASM_FORMAT_PRIVATE_NAME (fldname, "$dim_elem_size", n);
+      append_field_to_record_type (da_descr_type, get_identifier (fldname),
+				   sizetype);
+      /* One for is_array flag.  */
+      ASM_FORMAT_PRIVATE_NAME (fldname, "$dim_is_array", n);
+      append_field_to_record_type (da_descr_type, get_identifier (fldname),
+				   sizetype);
+    }
+
+  layout_type (da_descr_type);
+  *dim_num = n;
+  return da_descr_type;
+}
+
+/* Generate code sequence for initializing dynamic array descriptor.  */
+
+static void
+create_dynamic_array_descr_init_code (tree da_descr, tree da_var,
+				      tree dimensions, int da_dim_num,
+				      gimple_seq *ilist)
+{
+  tree fld, fldref;
+  tree da_descr_type = TREE_TYPE (da_descr);
+  tree dim_type = TREE_TYPE (da_var);
+
+  fld = TYPE_FIELDS (da_descr_type);
+  fldref = omp_build_component_ref (da_descr, fld);
+  gimplify_assign (fldref, (TREE_CODE (dim_type) == ARRAY_TYPE
+			    ? build_fold_addr_expr (da_var) : da_var), ilist);
+
+  if (TREE_CODE (dim_type) == REFERENCE_TYPE)
+    dim_type = TREE_TYPE (dim_type);
+
+  fld = TREE_CHAIN (fld);
+  fldref = omp_build_component_ref (da_descr, fld);
+  gimplify_assign (fldref, build_int_cst (sizetype, da_dim_num), ilist);
+
+  while (dimensions)
+    {
+      tree dim_base = fold_convert (sizetype, TREE_PURPOSE (dimensions));
+      tree dim_length = fold_convert (sizetype, TREE_VALUE (dimensions));
+      tree dim_elem_size = TYPE_SIZE_UNIT (TREE_TYPE (dim_type));
+      tree dim_is_array = (TREE_CODE (dim_type) == ARRAY_TYPE
+			   ? integer_one_node : integer_zero_node);
+      /* Set base.  */
+      fld = TREE_CHAIN (fld);
+      fldref = omp_build_component_ref (da_descr, fld);
+      dim_base = fold_build2 (MULT_EXPR, sizetype, dim_base, dim_elem_size);
+      gimplify_assign (fldref, dim_base, ilist);
+
+      /* Set length.  */
+      fld = TREE_CHAIN (fld);
+      fldref = omp_build_component_ref (da_descr, fld);
+      dim_length = fold_build2 (MULT_EXPR, sizetype, dim_length, dim_elem_size);
+      gimplify_assign (fldref, dim_length, ilist);
+
+      /* Set elem_size.  */
+      fld = TREE_CHAIN (fld);
+      fldref = omp_build_component_ref (da_descr, fld);
+      dim_elem_size = fold_convert (sizetype, dim_elem_size);
+      gimplify_assign (fldref, dim_elem_size, ilist);
+
+      /* Set is_array flag.  */
+      fld = TREE_CHAIN (fld);
+      fldref = omp_build_component_ref (da_descr, fld);
+      dim_is_array = fold_convert (sizetype, dim_is_array);
+      gimplify_assign (fldref, dim_is_array, ilist);
+
+      dimensions = TREE_CHAIN (dimensions);
+      dim_type = TREE_TYPE (dim_type);
+    }
+  gcc_assert (TREE_CHAIN (fld) == NULL_TREE);
+}
+
 /* Create a new context, with OUTER_CTX being the surrounding context.  */
 
 static omp_context *
@@ -890,6 +1111,8 @@
       ctx->cb = outer_ctx->cb;
       ctx->cb.block = NULL;
       ctx->depth = outer_ctx->depth + 1;
+      ctx->local_reduction_clauses = NULL;
+      ctx->outer_reduction_clauses = ctx->outer_reduction_clauses;
     }
   else
     {
@@ -905,10 +1128,15 @@
       ctx->cb.adjust_array_error_bounds = true;
       ctx->cb.dont_remap_vla_if_no_change = true;
       ctx->depth = 1;
+      ctx->local_reduction_clauses = NULL;
+      ctx->outer_reduction_clauses = NULL;
     }
 
   ctx->cb.decl_map = new hash_map<tree, tree>;
 
+  ctx->dynamic_arrays = new hash_map<tree_operand_hash, tree>;
+  ctx->oacc_addressable_var_decls = new vec<tree> ();
+
   return ctx;
 }
 
@@ -964,6 +1192,12 @@
     splay_tree_delete (ctx->field_map);
   if (ctx->sfield_map)
     splay_tree_delete (ctx->sfield_map);
+  if (ctx->array_data_map)
+    splay_tree_delete (ctx->array_data_map);
+  if (ctx->present_map)
+    splay_tree_delete (ctx->present_map);
+  if (ctx->parm_map)
+    splay_tree_delete (ctx->parm_map);
 
   /* We hijacked DECL_ABSTRACT_ORIGIN earlier.  We need to clear it before
      it produces corrupt debug information.  */
@@ -989,6 +1223,9 @@
       delete ctx->task_reduction_map;
     }
 
+  delete ctx->dynamic_arrays;
+  delete ctx->oacc_addressable_var_decls;
+
   XDELETE (ctx);
 }
 
@@ -1051,12 +1288,14 @@
 }
 
 /* Instantiate decls as necessary in CTX to satisfy the data sharing
-   specified by CLAUSES.  */
+   specified by CLAUSES.  If BASE_POINTERS_RESTRICT, install var field with
+   restrict.  */
 
 static void
-scan_sharing_clauses (tree clauses, omp_context *ctx)
+scan_sharing_clauses (tree clauses, omp_context *ctx,
+		      bool base_pointers_restrict = false)
 {
-  tree c, decl;
+  tree c, decl, x;
   bool scan_array_reductions = false;
 
   for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
@@ -1117,6 +1356,9 @@
 
 	case OMP_CLAUSE_REDUCTION:
 	case OMP_CLAUSE_IN_REDUCTION:
+          if (is_oacc_parallel_or_serial (ctx) || is_oacc_kernels (ctx))
+            ctx->local_reduction_clauses
+	      = tree_cons (NULL, c, ctx->local_reduction_clauses);
 	  decl = OMP_CLAUSE_DECL (c);
 	  if (TREE_CODE (decl) == MEM_REF)
 	    {
@@ -1225,7 +1467,30 @@
 
 	case OMP_CLAUSE_USE_DEVICE_PTR:
 	  decl = OMP_CLAUSE_DECL (c);
-	  if (TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE)
+          x = NULL;
+	  // Handle array descriptors
+	  if (TREE_CODE (TREE_TYPE (decl)) == RECORD_TYPE ||
+	      (omp_is_reference (decl)
+	       && TREE_CODE (TREE_TYPE (TREE_TYPE (decl))) == RECORD_TYPE))
+	    x = lang_hooks.types.omp_array_data (decl);
+
+	  if (x)
+	    {
+	      gcc_assert (!ctx->array_data_map
+			  || !splay_tree_lookup (ctx->array_data_map,
+					       (splay_tree_key) decl));
+	      if (!ctx->array_data_map)
+		ctx->array_data_map
+			= splay_tree_new (splay_tree_compare_pointers, 0, 0);
+
+	      splay_tree_insert (ctx->array_data_map, (splay_tree_key) decl,
+				 (splay_tree_value) x);
+
+	      install_var_field (x, false, 19, ctx);
+	      DECL_SOURCE_LOCATION (lookup_field (x, ctx))
+			= OMP_CLAUSE_LOCATION (c);
+	    }
+	  else if (TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE)
 	    install_var_field (decl, true, 3, ctx);
 	  else
 	    install_var_field (decl, false, 3, ctx);
@@ -1301,7 +1566,8 @@
 	      && is_global_var (maybe_lookup_decl_in_outer_ctx (decl, ctx))
 	      && varpool_node::get_create (decl)->offloadable
 	      && !lookup_attribute ("omp declare target link",
-				    DECL_ATTRIBUTES (decl)))
+				    DECL_ATTRIBUTES (decl))
+	      && !is_gimple_omp_oacc (ctx->stmt))
 	    break;
 	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
 	      && OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER)
@@ -1336,6 +1602,50 @@
 	      install_var_local (decl, ctx);
 	      break;
 	    }
+
+	  if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+	      && GOMP_MAP_DYNAMIC_ARRAY_P (OMP_CLAUSE_MAP_KIND (c)))
+	    {
+	      tree da_decl = OMP_CLAUSE_DECL (c);
+	      tree da_dimensions = OMP_CLAUSE_SIZE (c);
+	      tree da_type = TREE_TYPE (da_decl);
+	      bool by_ref = (TREE_CODE (da_type) == ARRAY_TYPE
+			     ? true : false);
+
+	      /* Checking code to ensure we only have arrays at top dimension.
+		 This limitation might be lifted in the future.  */
+	      if (TREE_CODE (da_type) == REFERENCE_TYPE)
+		da_type = TREE_TYPE (da_type);
+	      tree t = da_type, prev_t = NULL_TREE;
+	      while (t)
+		{
+		  if (TREE_CODE (t) == ARRAY_TYPE && prev_t)
+		    {
+		      error_at (gimple_location (ctx->stmt), "array types are"
+				" only allowed at outermost dimension of"
+				" dynamic array");
+		      break;
+		    }
+		  prev_t = t;
+		  t = TREE_TYPE (t);
+		}
+
+	      if (DECL_P (decl))
+		install_var_field (da_decl, by_ref, 3, ctx);
+	      else
+	        {
+		  error_at (OMP_CLAUSE_LOCATION (c),
+			    "dynamic arrays cannot be used within structs");
+		  break;
+		}
+
+	      tree new_var = install_var_local (da_decl, ctx);
+
+	      bool existed = ctx->dynamic_arrays->put (new_var, da_dimensions);
+	      gcc_assert (!existed);
+	      break;
+	    }
+
 	  if (DECL_P (decl))
 	    {
 	      if (DECL_SIZE (decl)
@@ -1357,7 +1667,8 @@
 		      && TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE)
 		    install_var_field (decl, true, 7, ctx);
 		  else
-		    install_var_field (decl, true, 3, ctx);
+		    install_var_field (decl, true, 3, ctx,
+				       base_pointers_restrict);
 		  if (is_gimple_omp_offloaded (ctx->stmt)
 		      && !OMP_CLAUSE_MAP_IN_REDUCTION (c))
 		    install_var_local (decl, ctx);
@@ -1393,6 +1704,7 @@
 		  insert_field_into_struct (ctx->record_type, field);
 		  splay_tree_insert (ctx->field_map, (splay_tree_key) decl,
 				     (splay_tree_value) field);
+		  install_parm_decl (decl, ptr_type_node, ctx);
 		}
 	    }
 	  break;
@@ -1441,6 +1753,7 @@
 	    install_var_local (decl, ctx);
 	  break;
 
+	case OMP_CLAUSE_NOHOST:
 	case OMP_CLAUSE__CACHE_:
 	default:
 	  gcc_unreachable ();
@@ -1617,6 +1930,7 @@
 	case OMP_CLAUSE_FINALIZE:
 	  break;
 
+	case OMP_CLAUSE_NOHOST:
 	case OMP_CLAUSE__CACHE_:
 	default:
 	  gcc_unreachable ();
@@ -1669,10 +1983,13 @@
 }
 
 /* Build a decl for the omp child function.  It'll not contain a body
-   yet, just the bare decl.  */
+   yet, just the bare decl.  Unlike omp child functions, acc child
+   functions for parallel regions have one argument per data
+   mapping.  */
 
 static void
-create_omp_child_function (omp_context *ctx, bool task_copy)
+create_omp_child_function (omp_context *ctx, bool task_copy,
+			   unsigned int map_cnt = 0)
 {
   tree decl, type, name, t;
 
@@ -1680,6 +1997,13 @@
   if (task_copy)
     type = build_function_type_list (void_type_node, ptr_type_node,
 				     ptr_type_node, NULL_TREE);
+  else if (is_oacc_parallel_or_serial (ctx) && targetm.goacc.explode_args ())
+    {
+      tree *arg_types = (tree *) alloca (sizeof (tree) * map_cnt);
+      for (unsigned int i = 0; i < map_cnt; i++)
+	arg_types[i] = ptr_type_node;
+      type = build_function_type_array (void_type_node, map_cnt, arg_types);
+    }
   else
     type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
 
@@ -1753,33 +2077,35 @@
   DECL_CONTEXT (t) = decl;
   DECL_RESULT (decl) = t;
 
-  tree data_name = get_identifier (".omp_data_i");
-  t = build_decl (DECL_SOURCE_LOCATION (decl), PARM_DECL, data_name,
-		  ptr_type_node);
-  DECL_ARTIFICIAL (t) = 1;
-  DECL_NAMELESS (t) = 1;
-  DECL_ARG_TYPE (t) = ptr_type_node;
-  DECL_CONTEXT (t) = current_function_decl;
-  TREE_USED (t) = 1;
-  TREE_READONLY (t) = 1;
-  DECL_ARGUMENTS (decl) = t;
-  if (!task_copy)
-    ctx->receiver_decl = t;
-  else
+  if (!is_oacc_parallel_or_serial (ctx) || !targetm.goacc.explode_args ())
     {
-      t = build_decl (DECL_SOURCE_LOCATION (decl),
-		      PARM_DECL, get_identifier (".omp_data_o"),
+      tree data_name = get_identifier (".omp_data_i");
+      t = build_decl (DECL_SOURCE_LOCATION (decl), PARM_DECL, data_name,
 		      ptr_type_node);
       DECL_ARTIFICIAL (t) = 1;
       DECL_NAMELESS (t) = 1;
       DECL_ARG_TYPE (t) = ptr_type_node;
       DECL_CONTEXT (t) = current_function_decl;
       TREE_USED (t) = 1;
-      TREE_ADDRESSABLE (t) = 1;
-      DECL_CHAIN (t) = DECL_ARGUMENTS (decl);
+      TREE_READONLY (t) = 1;
       DECL_ARGUMENTS (decl) = t;
+      if (!task_copy)
+	ctx->receiver_decl = t;
+      else
+	{
+	  t = build_decl (DECL_SOURCE_LOCATION (decl),
+			  PARM_DECL, get_identifier (".omp_data_o"),
+			  ptr_type_node);
+	  DECL_ARTIFICIAL (t) = 1;
+	  DECL_NAMELESS (t) = 1;
+	  DECL_ARG_TYPE (t) = ptr_type_node;
+	  DECL_CONTEXT (t) = current_function_decl;
+	  TREE_USED (t) = 1;
+	  TREE_ADDRESSABLE (t) = 1;
+	  DECL_CHAIN (t) = DECL_ARGUMENTS (decl);
+	  DECL_ARGUMENTS (decl) = t;
+	}
     }
-
   /* Allocate memory for the function structure.  The call to
      allocate_struct_function clobbers CFUN, so we need to restore
      it afterward.  */
@@ -2320,7 +2646,8 @@
     {
       omp_context *tgt = enclosing_target_ctx (outer_ctx);
 
-      if (!tgt || is_oacc_parallel (tgt))
+      if (!tgt || (is_oacc_parallel_or_serial (tgt)
+                    && !was_originally_oacc_kernels (tgt)))
 	for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
 	  {
 	    char const *check = NULL;
@@ -2367,6 +2694,98 @@
 	  gimple_omp_for_set_clauses (stmt, clauses);
 	  check_oacc_kernel_gwv (stmt, ctx);
 	}
+
+      /* Collect all variables named in reductions on this loop.  Ensure
+         that, if this loop has a reduction on some variable v, and there is
+         a reduction on v somewhere in an outer context, then there is a
+         reduction on v on all intervening loops as well.  */
+      tree local_reduction_clauses = NULL;
+      for (tree c = gimple_omp_for_clauses (stmt); c; c = OMP_CLAUSE_CHAIN (c))
+        {
+          if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
+            local_reduction_clauses
+	      = tree_cons (NULL, c, local_reduction_clauses);
+        }
+      if (ctx->outer_reduction_clauses == NULL && ctx->outer != NULL)
+        ctx->outer_reduction_clauses
+	  = chainon (unshare_expr (ctx->outer->local_reduction_clauses),
+		     ctx->outer->outer_reduction_clauses);
+      tree outer_reduction_clauses = ctx->outer_reduction_clauses;
+      tree local_iter = local_reduction_clauses;
+      for (; local_iter; local_iter = TREE_CHAIN (local_iter))
+        {
+          tree local_clause = TREE_VALUE (local_iter);
+          tree local_var = OMP_CLAUSE_DECL (local_clause);
+          tree_code local_op = OMP_CLAUSE_REDUCTION_CODE (local_clause);
+          bool have_outer_reduction = false;
+          tree ctx_iter = outer_reduction_clauses;
+          for (; ctx_iter; ctx_iter = TREE_CHAIN (ctx_iter))
+            {
+              tree outer_clause = TREE_VALUE (ctx_iter);
+              tree outer_var = OMP_CLAUSE_DECL (outer_clause);
+              tree_code outer_op = OMP_CLAUSE_REDUCTION_CODE (outer_clause);
+              if (outer_var == local_var && outer_op != local_op)
+                {
+                  error_at (gimple_location (stmt),
+                            "conflicting reduction operations for %qE",
+                            local_var);
+                  inform (OMP_CLAUSE_LOCATION (outer_clause),
+                          "location of the previous reduction for %qE",
+                          outer_var);
+                  /* Change this operation to be equal to the outer one.
+                     This is meant to suppress spurious errors; for example,
+                     in nested +, -, + reductions, we would generate errors
+                     for both the change from + to - and from - to +.  */
+                  OMP_CLAUSE_REDUCTION_CODE (local_clause) = outer_op;
+                  /* Also change the location so that in nested +, -, -
+                     reductions, the second error message also refers to the
+                     outermost + reduction.  */
+                  OMP_CLAUSE_LOCATION (local_clause)
+		    = OMP_CLAUSE_LOCATION (outer_clause);
+                }
+              if (outer_var == local_var)
+                {
+                  have_outer_reduction = true;
+                  break;
+                }
+            }
+          if (have_outer_reduction)
+            {
+              /* There is a reduction on outer_var both on this loop and on
+                 some enclosing loop.  Walk up the context tree until such a
+                 loop with a reduction on outer_var is found, and complain
+                 about all intervening loops that do not have such a
+                 reduction.  */
+              struct omp_context *curr_loop = ctx->outer;
+              bool found = false;
+              while (curr_loop != NULL)
+                {
+                  tree curr_iter = curr_loop->local_reduction_clauses;
+                  for (; curr_iter; curr_iter = TREE_CHAIN (curr_iter))
+                    {
+                      tree curr_clause = TREE_VALUE (curr_iter);
+                      tree curr_var = OMP_CLAUSE_DECL (curr_clause);
+                      if (curr_var == local_var)
+                        {
+                          found = true;
+                          break;
+                        }
+                    }
+                  if (!found)
+                    error_at (gimple_location (curr_loop->stmt),
+                              "nested loop in reduction needs "
+                              "reduction clause for %qE",
+                              local_var);
+                  else
+                    break;
+                  curr_loop = curr_loop->outer;
+                }
+            }
+        }
+      ctx->local_reduction_clauses = local_reduction_clauses;
+      ctx->outer_reduction_clauses
+	= chainon (unshare_expr (ctx->local_reduction_clauses),
+		   ctx->outer_reduction_clauses);
     }
 
   scan_sharing_clauses (clauses, ctx);
@@ -2462,6 +2881,112 @@
     layout_type (ctx->record_type);
 }
 
+/* Return true if the CLAUSES of an omp target guarantee that the base pointers
+   used in the corresponding offloaded function are restrict.  */
+
+static bool
+omp_target_base_pointers_restrict_p (tree clauses)
+{
+  /* The analysis relies on the GOMP_MAP_FORCE_* mapping kinds, which are only
+     used by OpenACC.  */
+  if (flag_openacc == 0)
+    return false;
+
+  /* I.  Basic example:
+
+       void foo (void)
+       {
+	 unsigned int a[2], b[2];
+
+	 #pragma acc kernels \
+	   copyout (a) \
+	   copyout (b)
+	 {
+	   a[0] = 0;
+	   b[0] = 1;
+	 }
+       }
+
+     After gimplification, we have:
+
+       #pragma omp target oacc_kernels \
+	 map(force_from:a [len: 8]) \
+	 map(force_from:b [len: 8])
+       {
+	 a[0] = 0;
+	 b[0] = 1;
+       }
+
+     Because both mappings have the force prefix, we know that they will be
+     allocated when calling the corresponding offloaded function, which means we
+     can mark the base pointers for a and b in the offloaded function as
+     restrict.  */
+
+  tree c;
+  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    {
+      if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP)
+	return false;
+
+      switch (OMP_CLAUSE_MAP_KIND (c))
+	{
+	case GOMP_MAP_FORCE_ALLOC:
+	case GOMP_MAP_FORCE_TO:
+	case GOMP_MAP_FORCE_FROM:
+	case GOMP_MAP_FORCE_TOFROM:
+	  break;
+	default:
+	  return false;
+	}
+    }
+
+  return true;
+}
+
+/* Reorder clauses so that dynamic array map clauses are placed at the very
+   front of the chain.  */
+
+static void
+reorder_dynamic_array_clauses (tree *clauses_ptr)
+{
+  tree c, clauses = *clauses_ptr;
+  tree prev_clause = NULL_TREE, next_clause;
+  tree da_clauses = NULL_TREE, da_clauses_tail = NULL_TREE;
+
+  for (c = clauses; c; c = next_clause)
+    {
+      next_clause = OMP_CLAUSE_CHAIN (c);
+
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+	  && GOMP_MAP_DYNAMIC_ARRAY_P (OMP_CLAUSE_MAP_KIND (c)))
+	{
+	  /* Unchain c from clauses.  */
+	  if (c == clauses)
+	    clauses = next_clause;
+
+	  /* Link on to da_clauses.  */
+	  if (da_clauses_tail)
+	    OMP_CLAUSE_CHAIN (da_clauses_tail) = c;
+	  else
+	    da_clauses = c;
+	  da_clauses_tail = c;
+
+	  if (prev_clause)
+	    OMP_CLAUSE_CHAIN (prev_clause) = next_clause;
+	  continue;
+	}
+
+      prev_clause = c;
+    }  
+
+  /* Place dynamic array clauses at the start of the clause list.  */
+  if (da_clauses)
+    {
+      OMP_CLAUSE_CHAIN (da_clauses_tail) = clauses;
+      *clauses_ptr = da_clauses;
+    }
+}
+
 /* Scan a GIMPLE_OMP_TARGET.  */
 
 static void
@@ -2474,6 +2999,7 @@
 
   ctx = new_omp_context (stmt, outer_ctx);
   ctx->field_map = splay_tree_new (splay_tree_compare_pointers, 0, 0);
+  ctx->parm_map = splay_tree_new (splay_tree_compare_pointers, 0, 0);
   ctx->record_type = lang_hooks.types.make_type (RECORD_TYPE);
   name = create_tmp_var_name (".omp_data_t");
   name = build_decl (gimple_location (stmt),
@@ -2483,13 +3009,31 @@
   TYPE_NAME (ctx->record_type) = name;
   TYPE_ARTIFICIAL (ctx->record_type) = 1;
 
+  bool base_pointers_restrict = false;
   if (offloaded)
     {
-      create_omp_child_function (ctx, false);
-      gimple_omp_target_set_child_fn (stmt, ctx->cb.dst_fn);
+      if (!is_oacc_parallel_or_serial (ctx) || !targetm.goacc.explode_args ())
+	{
+	  create_omp_child_function (ctx, false);
+	  gimple_omp_target_set_child_fn (stmt, ctx->cb.dst_fn);
+	}
+
+      base_pointers_restrict = omp_target_base_pointers_restrict_p (clauses);
+      if (base_pointers_restrict && dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file,
+		 "Base pointers in offloaded function are restrict\n");
     }
 
-  scan_sharing_clauses (clauses, ctx);
+  /* If OpenACC construct, put dynamic array clauses (if any) in front of
+     clause chain. The runtime can then test the first to see if the
+     additional map processing for them is required.  */
+  if (is_gimple_omp_oacc (stmt))
+    {
+      reorder_dynamic_array_clauses (gimple_omp_target_clauses_ptr (stmt));
+      clauses = gimple_omp_target_clauses (stmt);
+    }
+
+  scan_sharing_clauses (clauses, ctx, base_pointers_restrict);
   scan_omp (gimple_omp_body_ptr (stmt), ctx);
 
   if (TYPE_FIELDS (ctx->record_type) == NULL)
@@ -2669,6 +3213,9 @@
 		  {
 		  case GF_OMP_TARGET_KIND_OACC_PARALLEL:
 		  case GF_OMP_TARGET_KIND_OACC_KERNELS:
+		  case GF_OMP_TARGET_KIND_OACC_SERIAL:
+		  case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+		  case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
 		    ok = true;
 		    break;
 
@@ -3127,12 +3674,18 @@
 	      stmt_name = "target exit data"; break;
 	    case GF_OMP_TARGET_KIND_OACC_PARALLEL: stmt_name = "parallel"; break;
 	    case GF_OMP_TARGET_KIND_OACC_KERNELS: stmt_name = "kernels"; break;
+	    case GF_OMP_TARGET_KIND_OACC_SERIAL: stmt_name = "serial"; break;
 	    case GF_OMP_TARGET_KIND_OACC_DATA: stmt_name = "data"; break;
 	    case GF_OMP_TARGET_KIND_OACC_UPDATE: stmt_name = "update"; break;
 	    case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
 	      stmt_name = "enter/exit data"; break;
 	    case GF_OMP_TARGET_KIND_OACC_HOST_DATA: stmt_name = "host_data";
 	      break;
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
+	      /* These three cases arise from kernels conversion.  */
+	      stmt_name = "kernels"; break;
 	    default: gcc_unreachable ();
 	    }
 	  switch (gimple_omp_target_kind (ctx->stmt))
@@ -3143,9 +3696,16 @@
 	      ctx_stmt_name = "parallel"; break;
 	    case GF_OMP_TARGET_KIND_OACC_KERNELS:
 	      ctx_stmt_name = "kernels"; break;
+	    case GF_OMP_TARGET_KIND_OACC_SERIAL:
+	      ctx_stmt_name = "serial"; break;
 	    case GF_OMP_TARGET_KIND_OACC_DATA: ctx_stmt_name = "data"; break;
 	    case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
 	      ctx_stmt_name = "host_data"; break;
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+	    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+	    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
+	      /* These three cases arise from kernels conversion.  */
+	      ctx_stmt_name = "kernels"; break;
 	    default: gcc_unreachable ();
 	    }
 
@@ -3550,6 +4110,19 @@
   return t ? t : decl;
 }
 
+/* Returns true if DECL is present inside a field that encloses CTX.  */
+
+static bool
+maybe_lookup_field_in_outer_ctx (tree decl, omp_context *ctx)
+{
+  omp_context *up;
+
+  for (up = ctx->outer; up; up = up->outer)
+    if (maybe_lookup_field (decl, up))
+      return true;
+
+  return false;
+}
 
 /* Construct the initialization value for reduction operation OP.  */
 
@@ -5646,8 +6219,9 @@
 
 static void
 lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
-		       gcall *fork, gcall *join, gimple_seq *fork_seq,
-		       gimple_seq *join_seq, omp_context *ctx)
+		       gcall *fork, gcall *private_marker, gcall *join,
+		       gimple_seq *fork_seq, gimple_seq *join_seq,
+		       omp_context *ctx)
 {
   gimple_seq before_fork = NULL;
   gimple_seq after_fork = NULL;
@@ -5661,10 +6235,11 @@
     if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION)
       {
 	tree orig = OMP_CLAUSE_DECL (c);
-	tree var = maybe_lookup_decl (orig, ctx);
+	tree var;
 	tree ref_to_res = NULL_TREE;
-	tree incoming, outgoing, v1, v2, v3;
+	tree incoming, outgoing;
 	bool is_private = false;
+	bool is_fpp = false;
 
 	enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c);
 	if (rcode == MINUS_EXPR)
@@ -5675,6 +6250,9 @@
 	  rcode = BIT_IOR_EXPR;
 	tree op = build_int_cst (unsigned_type_node, rcode);
 
+	var = OMP_CLAUSE_REDUCTION_PRIVATE_DECL (c);
+	if (!var)
+	  var = maybe_lookup_decl (orig, ctx);
 	if (!var)
 	  var = orig;
 
@@ -5697,8 +6275,14 @@
 		    break;
 
 		  case GIMPLE_OMP_TARGET:
-		    if (gimple_omp_target_kind (probe->stmt)
-			!= GF_OMP_TARGET_KIND_OACC_PARALLEL)
+		    if ((gimple_omp_target_kind (probe->stmt)
+			 != GF_OMP_TARGET_KIND_OACC_PARALLEL)
+			&& (gimple_omp_target_kind (probe->stmt)
+			    != GF_OMP_TARGET_KIND_OACC_SERIAL)
+			&& (gimple_omp_target_kind (probe->stmt)
+			    != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+			&& (gimple_omp_target_kind (probe->stmt)
+			    != GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE))
 		      goto do_lookup;
 
 		    cls = gimple_omp_target_clauses (probe->stmt);
@@ -5723,19 +6307,37 @@
 		      is_private = true;
 		      goto do_lookup;
 		    }
+		  else if (OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_MAP
+			   && (OMP_CLAUSE_MAP_KIND (cls)
+			       == GOMP_MAP_FIRSTPRIVATE_POINTER)
+			   && orig == OMP_CLAUSE_DECL (cls))
+		    {
+		      is_fpp = true;
+		      goto do_lookup;
+		    }
 	      }
 
 	  do_lookup:
 	    /* This is the outermost construct with this reduction,
 	       see if there's a mapping for it.  */
 	    if (gimple_code (outer->stmt) == GIMPLE_OMP_TARGET
-		&& maybe_lookup_field (orig, outer) && !is_private)
+		&& (maybe_lookup_field (orig, outer) || is_fpp) && !is_private)
 	      {
-		ref_to_res = build_receiver_ref (orig, false, outer);
-		if (omp_is_reference (orig))
-		  ref_to_res = build_simple_mem_ref (ref_to_res);
-
 		tree type = TREE_TYPE (var);
+
+		if (is_fpp)
+		  {
+		    tree x = create_tmp_var (type);
+		    gimplify_assign (x, lookup_decl (orig, outer), fork_seq);
+		    ref_to_res = x;
+		  }
+		else
+		  {
+		    ref_to_res = build_receiver_ref (orig, false, outer);
+		    if (omp_is_reference (orig))
+		      ref_to_res = build_simple_mem_ref (ref_to_res);
+		  }
+
 		if (POINTER_TYPE_P (type))
 		  type = TREE_TYPE (type);
 
@@ -5762,36 +6364,13 @@
 	if (!ref_to_res)
 	  ref_to_res = integer_zero_node;
 
-	if (omp_is_reference (orig))
+	if (omp_is_reference (outgoing))
 	  {
-	    tree type = TREE_TYPE (var);
-	    const char *id = IDENTIFIER_POINTER (DECL_NAME (var));
-
-	    if (!inner)
-	      {
-		tree x = create_tmp_var (TREE_TYPE (type), id);
-		gimplify_assign (var, build_fold_addr_expr (x), fork_seq);
-	      }
-
-	    v1 = create_tmp_var (type, id);
-	    v2 = create_tmp_var (type, id);
-	    v3 = create_tmp_var (type, id);
-
-	    gimplify_assign (v1, var, fork_seq);
-	    gimplify_assign (v2, var, fork_seq);
-	    gimplify_assign (v3, var, fork_seq);
-
-	    var = build_simple_mem_ref (var);
-	    v1 = build_simple_mem_ref (v1);
-	    v2 = build_simple_mem_ref (v2);
-	    v3 = build_simple_mem_ref (v3);
 	    outgoing = build_simple_mem_ref (outgoing);
 
 	    if (!TREE_CONSTANT (incoming))
 	      incoming = build_simple_mem_ref (incoming);
 	  }
-	else
-	  v1 = v2 = v3 = var;
 
 	/* Determine position in reduction buffer, which may be used
 	   by target.  The parser has ensured that this is not a
@@ -5824,25 +6403,28 @@
 	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
 					  TREE_TYPE (var), 6, init_code,
 					  unshare_expr (ref_to_res),
-					  v1, level, op, off);
+					  var, level, op, off);
 	tree fini_call
 	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
 					  TREE_TYPE (var), 6, fini_code,
 					  unshare_expr (ref_to_res),
-					  v2, level, op, off);
+					  var, level, op, off);
 	tree teardown_call
 	  = build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
-					  TREE_TYPE (var), 6, teardown_code,
-					  ref_to_res, v3, level, op, off);
+					  TREE_TYPE (var), 6,
+					  teardown_code, ref_to_res, var,
+					  level, op, off);
 
-	gimplify_assign (v1, setup_call, &before_fork);
-	gimplify_assign (v2, init_call, &after_fork);
-	gimplify_assign (v3, fini_call, &before_join);
+	gimplify_assign (var, setup_call, &before_fork);
+	gimplify_assign (var, init_call, &after_fork);
+	gimplify_assign (var, fini_call, &before_join);
 	gimplify_assign (outgoing, teardown_call, &after_join);
       }
 
   /* Now stitch things together.  */
   gimple_seq_add_seq (fork_seq, before_fork);
+  if (private_marker)
+    gimple_seq_add_stmt (fork_seq, private_marker);
   if (fork)
     gimple_seq_add_stmt (fork_seq, fork);
   gimple_seq_add_seq (fork_seq, after_fork);
@@ -6466,6 +7048,10 @@
 	  tag |= OLF_TILE;
 	  break;
 
+	case OMP_CLAUSE_REDUCTION:
+	  tag |= OLF_REDUCTION;
+	  break;
+
 	default:
 	  continue;
 	}
@@ -6478,9 +7064,11 @@
       tag |= OLF_GANG_STATIC;
     }
 
-  /* In a parallel region, loops are implicitly INDEPENDENT.  */
+  /* In a parallel region, loops without auto and seq clauses are
+     implicitly INDEPENDENT.  */
   omp_context *tgt = enclosing_target_ctx (ctx);
-  if (!tgt || is_oacc_parallel (tgt))
+  if ((!tgt || is_oacc_parallel_or_serial (tgt))
+      && !(tag & (OLF_SEQ | OLF_AUTO)))
     tag |= OLF_INDEPENDENT;
 
   if (tag & OLF_TILE)
@@ -6534,7 +7122,7 @@
    HEAD and TAIL.  */
 
 static void
-lower_oacc_head_tail (location_t loc, tree clauses,
+lower_oacc_head_tail (location_t loc, tree clauses, gcall *private_marker,
 		      gimple_seq *head, gimple_seq *tail, omp_context *ctx)
 {
   bool inner = false;
@@ -6542,10 +7130,19 @@
   gimple_seq_add_stmt (head, gimple_build_assign (ddvar, integer_zero_node));
 
   unsigned count = lower_oacc_head_mark (loc, ddvar, clauses, head, ctx);
+
+  if (private_marker)
+    {
+      gimple_set_location (private_marker, loc);
+      gimple_call_set_lhs (private_marker, ddvar);
+      gimple_call_set_arg (private_marker, 1, ddvar);
+    }
+
   tree fork_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_FORK);
   tree join_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN);
 
   gcc_assert (count);
+
   for (unsigned done = 1; count; count--, done++)
     {
       gimple_seq fork_seq = NULL;
@@ -6572,7 +7169,8 @@
 			      &join_seq);
 
       lower_oacc_reductions (loc, clauses, place, inner,
-			     fork, join, &fork_seq, &join_seq,  ctx);
+			     fork, (count == 1) ? private_marker : NULL,
+			     join, &fork_seq, &join_seq,  ctx);
 
       /* Append this level to head. */
       gimple_seq_add_seq (head, fork_seq);
@@ -8230,6 +8828,128 @@
     }
 }
 
+/* Record vars listed in private clauses in CLAUSES in CTX.  This information
+   is used to mark up variables that should be made private per-gang.  */
+
+static void
+oacc_record_private_var_clauses (omp_context *ctx, tree clauses)
+{
+  tree c;
+
+  for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_PRIVATE)
+      {
+	tree decl = OMP_CLAUSE_DECL (c);
+	if (VAR_P (decl) && TREE_ADDRESSABLE (decl))
+	  ctx->oacc_addressable_var_decls->safe_push (decl);
+      }
+}
+
+/* Record addressable vars declared in BINDVARS in CTX.  This information is
+   used to mark up variables that should be made private per-gang.  */
+
+static void
+oacc_record_vars_in_bind (omp_context *ctx, tree bindvars)
+{
+  if (!ctx)
+    return;
+
+  for (tree v = bindvars; v; v = DECL_CHAIN (v))
+    if (VAR_P (v) && TREE_ADDRESSABLE (v))
+      ctx->oacc_addressable_var_decls->safe_push (v);
+}
+
+/* Mark addressable variables which are declared implicitly or explicitly as
+   gang private with a special attribute.  These may need to have their
+   declarations altered later on in compilation (e.g. in
+   execute_oacc_device_lower or the backend, depending on how the OpenACC
+   execution model is implemented on a given target) to ensure that sharing
+   semantics are correct.  */
+
+static void
+mark_oacc_gangprivate (vec<tree> *decls, omp_context *ctx)
+{
+  int i;
+  tree decl;
+
+  FOR_EACH_VEC_ELT (*decls, i, decl)
+    {
+      for (omp_context *thisctx = ctx; thisctx; thisctx = thisctx->outer)
+	{
+	  tree inner_decl = maybe_lookup_decl (decl, thisctx);
+	  if (inner_decl)
+	    {
+	      decl = inner_decl;
+	      break;
+	    }
+	}
+      if (!lookup_attribute ("oacc gangprivate", DECL_ATTRIBUTES (decl)))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      fprintf (dump_file,
+		       "Setting 'oacc gangprivate' attribute for decl:");
+	      print_generic_decl (dump_file, decl, TDF_SLIM);
+	      fputc ('\n', dump_file);
+	    }
+	  DECL_ATTRIBUTES (decl)
+	    = tree_cons (get_identifier ("oacc gangprivate"),
+			 NULL, DECL_ATTRIBUTES (decl));
+	}
+    }
+}
+
+/* Build an internal UNIQUE function with type IFN_UNIQUE_OACC_PRIVATE listing
+   the addresses of variables that should be made private at the surrounding
+   parallelism level.  Such functions appear in the gimple code stream in two
+   forms, e.g. for a partitioned loop:
+
+      .data_dep.6 = .UNIQUE (OACC_HEAD_MARK, .data_dep.6, 1, 68);
+      .data_dep.6 = .UNIQUE (OACC_PRIVATE, .data_dep.6, -1, &w);
+      .data_dep.6 = .UNIQUE (OACC_FORK, .data_dep.6, -1);
+      .data_dep.6 = .UNIQUE (OACC_HEAD_MARK, .data_dep.6);
+
+   or alternatively, OACC_PRIVATE can appear at the top level of a parallel,
+   not as part of a HEAD_MARK sequence:
+
+      .UNIQUE (OACC_PRIVATE, 0, 0, &w);
+
+   For such stand-alone appearances, the 3rd argument is always 0, denoting
+   gang partitioning.  */
+
+static gcall *
+make_oacc_private_marker (omp_context *ctx)
+{
+  int i;
+  tree decl;
+
+  if (ctx->oacc_addressable_var_decls->length () == 0)
+    return NULL;
+
+  auto_vec<tree, 5> args;
+
+  args.quick_push (build_int_cst (integer_type_node,
+				 IFN_UNIQUE_OACC_PRIVATE));
+  args.quick_push (integer_zero_node);
+  args.quick_push (integer_minus_one_node);
+
+  FOR_EACH_VEC_ELT (*ctx->oacc_addressable_var_decls, i, decl)
+    {
+      for (omp_context *thisctx = ctx; thisctx; thisctx = thisctx->outer)
+	{
+	  tree inner_decl = maybe_lookup_decl (decl, thisctx);
+	  if (inner_decl)
+	    {
+	      decl = inner_decl;
+	      break;
+	    }
+	}
+      tree addr = build_fold_addr_expr (decl);
+      args.safe_push (addr);
+    }
+
+  return gimple_build_call_internal_vec (IFN_UNIQUE, args);
+}
 
 /* Lower code for an OMP loop directive.  */
 
@@ -8247,6 +8967,8 @@
 
   push_gimplify_context ();
 
+  oacc_record_private_var_clauses (ctx, gimple_omp_for_clauses (stmt));
+
   lower_omp (gimple_omp_for_pre_body_ptr (stmt), ctx);
 
   block = make_node (BLOCK);
@@ -8265,6 +8987,8 @@
       gbind *inner_bind
 	= as_a <gbind *> (gimple_seq_first_stmt (omp_for_body));
       tree vars = gimple_bind_vars (inner_bind);
+      if (is_gimple_omp_oacc (ctx->stmt))
+	oacc_record_vars_in_bind (ctx, vars);
       gimple_bind_append_vars (new_stmt, vars);
       /* bind_vars/BLOCK_VARS are being moved to new_stmt/block, don't
 	 keep them on the inner_bind and it's block.  */
@@ -8361,6 +9085,12 @@
 
   lower_omp (gimple_omp_body_ptr (stmt), ctx);
 
+  gcall *private_marker = NULL;
+  if (is_gimple_omp_oacc (ctx->stmt)
+      && !gimple_seq_empty_p (omp_for_body)
+      && !gimple_seq_empty_p (omp_for_body))
+    private_marker = make_oacc_private_marker (ctx);
+
   /* Lower the header expressions.  At this point, we can assume that
      the header is of the form:
 
@@ -8397,7 +9127,7 @@
   if (is_gimple_omp_oacc (ctx->stmt)
       && !ctx_in_oacc_kernels_region (ctx))
     lower_oacc_head_tail (gimple_location (stmt),
-			  gimple_omp_for_clauses (stmt),
+			  gimple_omp_for_clauses (stmt), private_marker,
 			  &oacc_head, &oacc_tail, ctx);
 
   /* Add OpenACC partitioning and reduction markers just before the loop.  */
@@ -9138,6 +9868,103 @@
     }
 }
 
+/* Helper function for lower_omp_target.  Converts VAR to something that can
+   be represented by a POINTER_SIZED_INT_NODE.  Any new instructions are
+   appended to GS.  This is used to optimize firstprivate variables, so that
+   small types (less precision than POINTER_SIZE) do not require additional
+   data mappings.  */
+
+static tree
+convert_to_firstprivate_int (tree var, gimple_seq *gs)
+{
+  tree type = TREE_TYPE (var), new_type = NULL_TREE;
+
+  if (omp_is_reference (var) || POINTER_TYPE_P (type))
+    {
+      type = TREE_TYPE (type);
+      tree tmp = create_tmp_var (type);
+      gimplify_assign (tmp, build_simple_mem_ref (var), gs);
+      var = tmp;
+    }
+
+  if (INTEGRAL_TYPE_P (type) || POINTER_TYPE_P (type))
+    return fold_convert (pointer_sized_int_node, var);
+
+  gcc_assert (tree_to_uhwi (TYPE_SIZE (type)) <= POINTER_SIZE);
+
+  new_type = lang_hooks.types.type_for_size (tree_to_uhwi (TYPE_SIZE (type)),
+					     true);
+  tree tmp = create_tmp_var (new_type);
+  var = fold_build1 (VIEW_CONVERT_EXPR, new_type, var);
+  gimplify_assign (tmp, var, gs);
+
+  return fold_convert (pointer_sized_int_node, tmp);
+}
+
+/* Like convert_to_firstprivate_int, but restore the original type.  */
+
+static tree
+convert_from_firstprivate_int (tree var, tree orig_type, bool is_ref,
+			       gimple_seq *gs)
+{
+  tree type = TREE_TYPE (var);
+  tree new_type = NULL_TREE;
+  tree tmp = NULL_TREE;
+
+  gcc_assert (TREE_CODE (var) == MEM_REF);
+  var = TREE_OPERAND (var, 0);
+
+  if (is_ref || POINTER_TYPE_P (orig_type))
+    {
+      tree_code code = NOP_EXPR;
+
+      if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
+	code = VIEW_CONVERT_EXPR;
+
+      if (code == VIEW_CONVERT_EXPR
+         && TYPE_SIZE (type) != TYPE_SIZE (orig_type))
+	{
+	  tree ptype = build_pointer_type (type);
+	  var = fold_build1 (code, ptype, build_fold_addr_expr (var));
+	  var = build_simple_mem_ref (var);
+	}
+      else
+	var = fold_build1 (code, type, var);
+
+      tree inst = create_tmp_var (type);
+      gimplify_assign (inst, var, gs);
+      var = build_fold_addr_expr (inst);
+
+      return var;
+    }
+
+  if (INTEGRAL_TYPE_P (var))
+    return fold_convert (type, var);
+
+  gcc_assert (tree_to_uhwi (TYPE_SIZE (type)) <= POINTER_SIZE);
+
+  new_type = lang_hooks.types.type_for_size (tree_to_uhwi (TYPE_SIZE (type)),
+					     true);
+
+  tmp = create_tmp_var (new_type);
+  var = fold_convert (new_type, var);
+  gimplify_assign (tmp, var, gs);
+
+  return fold_build1 (VIEW_CONVERT_EXPR, type, tmp);
+}
+
+static tree
+append_decl_arg (tree var, tree decl_args, omp_context *ctx)
+{
+  if (!is_oacc_parallel_or_serial (ctx))
+    return NULL_TREE;
+
+  tree temp = lookup_parm (var, ctx);
+  DECL_CHAIN (temp) = decl_args;
+
+  return temp;
+}
+
 /* Lower the GIMPLE_OMP_TARGET in the current statement
    in GSI_P.  CTX holds context information for the directive.  */
 
@@ -9151,7 +9978,8 @@
   gimple_seq tgt_body, olist, ilist, fplist, new_body;
   location_t loc = gimple_location (stmt);
   bool offloaded, data_region;
-  unsigned int map_cnt = 0;
+  unsigned int map_cnt = 0, init_cnt = 0;
+  bool oacc_explode_args = targetm.goacc.explode_args ();
 
   offloaded = is_gimple_omp_offloaded (stmt);
   switch (gimple_omp_target_kind (stmt))
@@ -9162,14 +9990,18 @@
     case GF_OMP_TARGET_KIND_EXIT_DATA:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL:
     case GF_OMP_TARGET_KIND_OACC_KERNELS:
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
     case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
     case GF_OMP_TARGET_KIND_OACC_DECLARE:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
       data_region = false;
       break;
     case GF_OMP_TARGET_KIND_DATA:
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       data_region = true;
       break;
     default:
@@ -9197,11 +10029,83 @@
     }
   else if (data_region)
     tgt_body = gimple_omp_body (stmt);
-  child_fn = ctx->cb.dst_fn;
 
   push_gimplify_context ();
   fplist = NULL;
 
+  /* Determine init_cnt to finish initialize ctx.  */
+
+  if (is_oacc_parallel_or_serial (ctx) && oacc_explode_args)
+    {
+      for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
+	switch (OMP_CLAUSE_CODE (c))
+	  {
+	    tree var;
+
+	  default:
+	    break;
+	  case OMP_CLAUSE_MAP:
+	  case OMP_CLAUSE_TO:
+	  case OMP_CLAUSE_FROM:
+	  init_oacc_firstprivate:
+	    var = OMP_CLAUSE_DECL (c);
+	    if (!DECL_P (var))
+	      {
+		if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP
+		    || (!OMP_CLAUSE_MAP_ZERO_BIAS_ARRAY_SECTION (c)
+			&& (OMP_CLAUSE_MAP_KIND (c)
+			    != GOMP_MAP_FIRSTPRIVATE_POINTER)))
+		  init_cnt++;
+		continue;
+	      }
+
+	    if (DECL_SIZE (var)
+		&& TREE_CODE (DECL_SIZE (var)) != INTEGER_CST)
+	      {
+		tree var2 = DECL_VALUE_EXPR (var);
+		gcc_assert (TREE_CODE (var2) == INDIRECT_REF);
+		var2 = TREE_OPERAND (var2, 0);
+		gcc_assert (DECL_P (var2));
+		var = var2;
+	      }
+
+	    if (offloaded
+		&& OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+		&& (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_POINTER
+		    || (OMP_CLAUSE_MAP_KIND (c)
+			== GOMP_MAP_FIRSTPRIVATE_REFERENCE)))
+	      {
+		continue;
+	      }
+
+	    if (!maybe_lookup_field (var, ctx))
+	      continue;
+
+	    init_cnt++;
+	    break;
+
+	  case OMP_CLAUSE_FIRSTPRIVATE:
+	    if (is_oacc_parallel_or_serial (ctx))
+	      goto init_oacc_firstprivate;
+	    init_cnt++;
+	    break;
+
+	  case OMP_CLAUSE_USE_DEVICE_PTR:
+	  case OMP_CLAUSE_IS_DEVICE_PTR:
+	    init_cnt++;
+	    break;
+	  }
+
+      /* Initialize the offloaded child function.  */
+
+      create_omp_child_function (ctx, false, init_cnt);
+      gimple_omp_target_set_child_fn (stmt, ctx->cb.dst_fn);
+    }
+
+  child_fn = ctx->cb.dst_fn;
+
+  /* Clause Pass 1: Scan and prepare sender decls VALUE_EXPRs for
+     usage on the child function.  */
   for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
     switch (OMP_CLAUSE_CODE (c))
       {
@@ -9230,6 +10134,7 @@
 	  case GOMP_MAP_STRUCT:
 	  case GOMP_MAP_ALWAYS_POINTER:
 	    break;
+	  case GOMP_MAP_NO_ALLOC:
 	  case GOMP_MAP_FORCE_ALLOC:
 	  case GOMP_MAP_FORCE_TO:
 	  case GOMP_MAP_FORCE_FROM:
@@ -9237,7 +10142,21 @@
 	  case GOMP_MAP_FORCE_PRESENT:
 	  case GOMP_MAP_FORCE_DEVICEPTR:
 	  case GOMP_MAP_DEVICE_RESIDENT:
+	  case GOMP_MAP_DYNAMIC_ARRAY_TO:
+	  case GOMP_MAP_DYNAMIC_ARRAY_FROM:
+	  case GOMP_MAP_DYNAMIC_ARRAY_TOFROM:
+	  case GOMP_MAP_DYNAMIC_ARRAY_FORCE_TO:
+	  case GOMP_MAP_DYNAMIC_ARRAY_FORCE_FROM:
+	  case GOMP_MAP_DYNAMIC_ARRAY_FORCE_TOFROM:
+	  case GOMP_MAP_DYNAMIC_ARRAY_ALLOC:
+	  case GOMP_MAP_DYNAMIC_ARRAY_FORCE_ALLOC:
+	  case GOMP_MAP_DYNAMIC_ARRAY_FORCE_PRESENT:
+	  case GOMP_MAP_DECLARE_ALLOCATE:
+	  case GOMP_MAP_DECLARE_DEALLOCATE:
 	  case GOMP_MAP_LINK:
+	  case GOMP_MAP_ATTACH:
+	  case GOMP_MAP_DETACH:
+	  case GOMP_MAP_FORCE_DETACH:
 	    gcc_assert (is_gimple_omp_oacc (stmt));
 	    break;
 	  default:
@@ -9299,26 +10218,55 @@
 	if (offloaded && !(OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
 			   && OMP_CLAUSE_MAP_IN_REDUCTION (c)))
 	  {
-	    x = build_receiver_ref (var, true, ctx);
+	    tree var_type = TREE_TYPE (var);
 	    tree new_var = lookup_decl (var, ctx);
+	    tree inner_type
+	      = omp_is_reference (new_var) ? TREE_TYPE (var_type) : var_type;
+	    bool rcv_by_ref
+	      = (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+		 && GOMP_MAP_DYNAMIC_ARRAY_P (OMP_CLAUSE_MAP_KIND (c))
+		 && TREE_CODE (var_type) != ARRAY_TYPE
+		 ? false : true);
+
+	    x = build_receiver_ref (var, rcv_by_ref, ctx);
+
+	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE
+		&& (FLOAT_TYPE_P (inner_type)
+		    || ANY_INTEGRAL_TYPE_P (inner_type))
+		&& tree_to_uhwi (TYPE_SIZE (inner_type)) <= POINTER_SIZE
+		&& !maybe_lookup_field_in_outer_ctx (var, ctx))
+	      {
+		gcc_assert (is_gimple_omp_oacc (ctx->stmt));
+		x = convert_from_firstprivate_int (x, TREE_TYPE (new_var),
+						   omp_is_reference (var),
+						   &fplist);
+		gimplify_assign (new_var, x, &fplist);
+		map_cnt++;
+		break;
+              }
 
 	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
 		&& OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER
 		&& !OMP_CLAUSE_MAP_ZERO_BIAS_ARRAY_SECTION (c)
-		&& TREE_CODE (TREE_TYPE (var)) == ARRAY_TYPE)
+		&& TREE_CODE (var_type) == ARRAY_TYPE)
 	      x = build_simple_mem_ref (x);
 	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE)
 	      {
 		gcc_assert (is_gimple_omp_oacc (ctx->stmt));
 		if (omp_is_reference (new_var)
-		    && TREE_CODE (TREE_TYPE (new_var)) != POINTER_TYPE)
+		    /* Accelerators may not have alloca, so it's not
+		       possible to privatize local storage for those
+		       objects.  */
+		    && TREE_CONSTANT (TYPE_SIZE (TREE_TYPE (var_type))))
 		  {
 		    /* Create a local object to hold the instance
 		       value.  */
-		    tree type = TREE_TYPE (TREE_TYPE (new_var));
 		    const char *id = IDENTIFIER_POINTER (DECL_NAME (new_var));
-		    tree inst = create_tmp_var (type, id);
-		    gimplify_assign (inst, fold_indirect_ref (x), &fplist);
+		    tree inst = create_tmp_var (TREE_TYPE (var_type), id);
+		    if (TREE_CODE (var_type) == POINTER_TYPE)
+		      gimplify_assign (inst, x, &fplist);
+		    else
+		      gimplify_assign (inst, fold_indirect_ref (x), &fplist);
 		    x = build_fold_addr_expr (inst);
 		  }
 		gimplify_assign (new_var, x, &fplist);
@@ -9335,7 +10283,7 @@
 	break;
 
       case OMP_CLAUSE_FIRSTPRIVATE:
-	if (is_oacc_parallel (ctx))
+	if (is_oacc_parallel_or_serial (ctx))
 	  goto oacc_firstprivate;
 	map_cnt++;
 	var = OMP_CLAUSE_DECL (c);
@@ -9419,6 +10367,8 @@
 
   if (offloaded)
     {
+      if (is_oacc_parallel_or_serial (ctx) && oacc_explode_args)
+	gcc_assert (init_cnt == map_cnt);
       target_nesting_level++;
       lower_omp (&tgt_body, ctx);
       target_nesting_level--;
@@ -9466,12 +10416,14 @@
       vec_alloc (vsize, map_cnt);
       vec_alloc (vkind, map_cnt);
       unsigned int map_idx = 0;
+      tree decl_args = NULL_TREE;
 
       for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
 	switch (OMP_CLAUSE_CODE (c))
 	  {
 	    tree ovar, nc, s, purpose, var, x, type;
 	    unsigned int talign;
+	    bool oacc_firstprivate_int;
 
 	  default:
 	    break;
@@ -9480,6 +10432,7 @@
 	  case OMP_CLAUSE_TO:
 	  case OMP_CLAUSE_FROM:
 	  oacc_firstprivate_map:
+	    oacc_firstprivate_int = false;
 	    nc = c;
 	    ovar = OMP_CLAUSE_DECL (c);
 	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
@@ -9543,10 +10496,44 @@
 		    avar = build_fold_addr_expr (avar);
 		    gimplify_assign (x, avar, &ilist);
 		  }
+		else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+			 && (OMP_CLAUSE_MAP_KIND (c) & GOMP_MAP_DYNAMIC_ARRAY))
+		  {
+		    int da_dim_num;
+		    tree dimensions = OMP_CLAUSE_SIZE (c);
+
+		    tree da_descr_type =
+		      create_dynamic_array_descr_type (OMP_CLAUSE_DECL (c),
+						       dimensions, &da_dim_num);
+		    tree da_descr =
+		      create_tmp_var_raw (da_descr_type, ".$omp_da_descr");
+		    gimple_add_tmp_var (da_descr);
+
+		    create_dynamic_array_descr_init_code
+		      (da_descr, ovar, dimensions, da_dim_num, &ilist);
+
+		    gimplify_assign (x, build_fold_addr_expr (da_descr),
+				     &ilist);
+		  }
 		else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE)
 		  {
-		    gcc_assert (is_gimple_omp_oacc (ctx->stmt));
-		    if (!omp_is_reference (var))
+		    gcc_checking_assert (is_gimple_omp_oacc (ctx->stmt));
+		    tree new_var = lookup_decl (var, ctx);
+		    tree type = TREE_TYPE (var);
+		    tree inner_type
+		      = omp_is_reference (new_var) ? TREE_TYPE (type) : type;
+		    if ((FLOAT_TYPE_P (inner_type)
+			 || ANY_INTEGRAL_TYPE_P (inner_type))
+			&& tree_to_uhwi (TYPE_SIZE (inner_type)) <= POINTER_SIZE
+			&& !maybe_lookup_field_in_outer_ctx (var, ctx))
+		      {
+			oacc_firstprivate_int = true;
+			if (is_gimple_reg (var)
+			    && OMP_CLAUSE_FIRSTPRIVATE_IMPLICIT (c))
+			  TREE_NO_WARNING (var) = 1;
+			var = convert_to_firstprivate_int (var, &ilist);
+		      }
+		    else if (!omp_is_reference (var))
 		      {
 			if (is_gimple_reg (var)
 			    && OMP_CLAUSE_FIRSTPRIVATE_IMPLICIT (c))
@@ -9598,16 +10585,32 @@
 	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_FIRSTPRIVATE)
 	      {
 		gcc_checking_assert (is_gimple_omp_oacc (ctx->stmt));
-		s = TREE_TYPE (ovar);
-		if (TREE_CODE (s) == REFERENCE_TYPE)
-		  s = TREE_TYPE (s);
-		s = TYPE_SIZE_UNIT (s);
+		if (oacc_firstprivate_int)
+		  s = size_int (0);
+		else
+		  {
+		    s = TREE_TYPE (ovar);
+		    if (TREE_CODE (s) == REFERENCE_TYPE
+			|| omp_is_optional_argument (ovar))
+		      s = TREE_TYPE (s);
+		    s = TYPE_SIZE_UNIT (s);
+		  }
 	      }
+	    else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
+		     && (OMP_CLAUSE_MAP_KIND (c) & GOMP_MAP_DYNAMIC_ARRAY))
+	      s = NULL_TREE;
 	    else
 	      s = OMP_CLAUSE_SIZE (c);
 	    if (s == NULL_TREE)
 	      s = TYPE_SIZE_UNIT (TREE_TYPE (ovar));
+	    /* Fortran assumed-size arrays have zero size because the type is
+	       incomplete.  Set the size to one to allow the runtime to remap
+	       any existing data that is already present on the accelerator.  */
+	    if (s == NULL_TREE && is_gimple_omp_oacc (ctx->stmt))
+	      s = integer_one_node;
 	    s = fold_convert (size_type_node, s);
+	    if (oacc_explode_args)
+	      decl_args = append_decl_arg (ovar, decl_args, ctx);
 	    purpose = size_int (map_idx++);
 	    CONSTRUCTOR_APPEND_ELT (vsize, purpose, s);
 	    if (TREE_CODE (s) != INTEGER_CST)
@@ -9623,6 +10626,7 @@
 		  switch (tkind)
 		    {
 		    case GOMP_MAP_ALLOC:
+		    case GOMP_MAP_NO_ALLOC:
 		    case GOMP_MAP_TO:
 		    case GOMP_MAP_FROM:
 		    case GOMP_MAP_TOFROM:
@@ -9651,7 +10655,10 @@
 		break;
 	      case OMP_CLAUSE_FIRSTPRIVATE:
 		gcc_checking_assert (is_gimple_omp_oacc (ctx->stmt));
-		tkind = GOMP_MAP_TO;
+		if (oacc_firstprivate_int)
+		  tkind = GOMP_MAP_FIRSTPRIVATE_INT;
+		else
+		  tkind = GOMP_MAP_TO;
 		tkind_zero = tkind;
 		break;
 	      case OMP_CLAUSE_TO:
@@ -9693,7 +10700,7 @@
 	    break;
 
 	  case OMP_CLAUSE_FIRSTPRIVATE:
-	    if (is_oacc_parallel (ctx))
+	    if (is_oacc_parallel_or_serial (ctx))
 	      goto oacc_firstprivate_map;
 	    ovar = OMP_CLAUSE_DECL (c);
 	    if (omp_is_reference (ovar))
@@ -9745,6 +10752,8 @@
 	    else
 	      s = TYPE_SIZE_UNIT (TREE_TYPE (ovar));
 	    s = fold_convert (size_type_node, s);
+	    if (oacc_explode_args)
+	      decl_args = append_decl_arg (ovar, decl_args, ctx);
 	    purpose = size_int (map_idx++);
 	    CONSTRUCTOR_APPEND_ELT (vsize, purpose, s);
 	    if (TREE_CODE (s) != INTEGER_CST)
@@ -9764,26 +10773,88 @@
 	  case OMP_CLAUSE_IS_DEVICE_PTR:
 	    ovar = OMP_CLAUSE_DECL (c);
 	    var = lookup_decl_in_outer_ctx (ovar, ctx);
-	    x = build_sender_ref (ovar, ctx);
+
+	    // For arrays with descriptor, use the pointer to the actual data
+	    splay_tree_node n = ctx->array_data_map
+				? splay_tree_lookup (ctx->array_data_map,
+						     (splay_tree_key) ovar)
+				: NULL;
+	    if (n)
+	      x = build_sender_ref ((tree) n->value, ctx);
+	    else
+	      x = build_sender_ref (ovar, ctx);
 	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_USE_DEVICE_PTR)
 	      tkind = GOMP_MAP_USE_DEVICE_PTR;
 	    else
 	      tkind = GOMP_MAP_FIRSTPRIVATE_INT;
 	    type = TREE_TYPE (ovar);
-	    if (TREE_CODE (type) == ARRAY_TYPE)
-	      var = build_fold_addr_expr (var);
+	    if (n)
+	      {
+		var = (tree) n->value;
+		gimplify_assign (x, var, &ilist);
+	      }
+	    else if (TREE_CODE (type) == ARRAY_TYPE)
+	      {
+		var = build_fold_addr_expr (var);
+		gimplify_assign (x, var, &ilist);
+	      }
 	    else
 	      {
-		if (omp_is_reference (ovar))
+		tree opt_arg_label;
+		bool optional_arg_p = omp_is_optional_argument (ovar);
+
+		if (optional_arg_p)
+		  {
+		    tree null_label
+		      = create_artificial_label (UNKNOWN_LOCATION);
+		    tree notnull_label
+		      = create_artificial_label (UNKNOWN_LOCATION);
+		    opt_arg_label
+		      = create_artificial_label (UNKNOWN_LOCATION);
+
+		    tree present = create_tmp_var_raw (boolean_type_node,
+						       get_name (var));
+		    tree cond2 = fold_build2 (NE_EXPR, boolean_type_node,
+					      var, null_pointer_node);
+		    if (!ctx->present_map)
+		      ctx->present_map
+			= splay_tree_new (splay_tree_compare_pointers, 0, 0);
+
+		    splay_tree_insert (ctx->present_map, (splay_tree_key) var,
+				       (splay_tree_value) present);
+
+		    tree new_x = copy_node (x);
+		    gcond *cond = gimple_build_cond_from_tree (present,
+							       notnull_label,
+							       null_label);
+		    gimple_add_tmp_var (present);
+		    gimplify_assign (present, cond2, &ilist);
+		    gimple_seq_add_stmt (&ilist, cond);
+		    gimple_seq_add_stmt (&ilist,
+					 gimple_build_label (null_label));
+		    gimplify_assign (new_x, null_pointer_node, &ilist);
+		    gimple_seq_add_stmt (&ilist,
+					 gimple_build_goto (opt_arg_label));
+		    gimple_seq_add_stmt (&ilist,
+					 gimple_build_label (notnull_label));
+		  }
+
+		if (omp_is_reference (ovar) || optional_arg_p)
 		  {
 		    type = TREE_TYPE (type);
 		    if (TREE_CODE (type) != ARRAY_TYPE)
 		      var = build_simple_mem_ref (var);
 		    var = fold_convert (TREE_TYPE (x), var);
 		  }
+
+		gimplify_assign (x, var, &ilist);
+		if (optional_arg_p)
+		  gimple_seq_add_stmt (&ilist,
+				       gimple_build_label (opt_arg_label));
 	      }
-	    gimplify_assign (x, var, &ilist);
 	    s = size_int (0);
+	    if (oacc_explode_args)
+	      decl_args = append_decl_arg (ovar, decl_args, ctx);
 	    purpose = size_int (map_idx++);
 	    CONSTRUCTOR_APPEND_ELT (vsize, purpose, s);
 	    gcc_checking_assert (tkind
@@ -9796,6 +10867,8 @@
 	  }
 
       gcc_assert (map_idx == map_cnt);
+      if (is_oacc_parallel_or_serial (ctx) && oacc_explode_args)
+	DECL_ARGUMENTS (child_fn) = nreverse (decl_args);
 
       DECL_INITIAL (TREE_VEC_ELT (t, 1))
 	= build_constructor (TREE_TYPE (TREE_VEC_ELT (t, 1)), vsize);
@@ -9834,9 +10907,12 @@
     {
       t = build_fold_addr_expr_loc (loc, ctx->sender_decl);
       /* fixup_child_record_type might have changed receiver_decl's type.  */
-      t = fold_convert_loc (loc, TREE_TYPE (ctx->receiver_decl), t);
-      gimple_seq_add_stmt (&new_body,
-	  		   gimple_build_assign (ctx->receiver_decl, t));
+      if (!is_oacc_parallel_or_serial (ctx) || !oacc_explode_args)
+	{
+	  t = fold_convert_loc (loc, TREE_TYPE (ctx->receiver_decl), t);
+	  gimple_seq_add_stmt (&new_body,
+			       gimple_build_assign (ctx->receiver_decl, t));
+	}
     }
   gimple_seq_add_seq (&new_body, fplist);
 
@@ -9934,11 +11010,54 @@
 	  case OMP_CLAUSE_USE_DEVICE_PTR:
 	  case OMP_CLAUSE_IS_DEVICE_PTR:
 	    var = OMP_CLAUSE_DECL (c);
+	    tree array_data = NULL;
+	    if (ctx->array_data_map)
+	      {
+		splay_tree_node n = splay_tree_lookup (ctx->array_data_map,
+						       (splay_tree_key) var);
+		if (n)
+		  array_data = (tree) n->value;
+	      }
+
 	    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_USE_DEVICE_PTR)
-	      x = build_sender_ref (var, ctx);
+	      x = build_sender_ref (array_data ? array_data : var, ctx);
 	    else
-	      x = build_receiver_ref (var, false, ctx);
-	    if (is_variable_sized (var))
+	      x = build_receiver_ref (array_data ? array_data : var, false, ctx);
+	    if (array_data)
+	      {
+		tree new_var = lookup_decl (var, ctx);
+		new_var = DECL_VALUE_EXPR (new_var);
+		if (omp_is_reference (var))
+		  {
+		    tree type = TREE_TYPE (TREE_TYPE (var));
+		    tree v = create_tmp_var_raw (type, get_name (var));
+		    gimple_add_tmp_var (v);
+		    TREE_ADDRESSABLE (v) = 1;
+		    tree x2 = build_fold_indirect_ref (var);
+		    gimplify_expr (&x2, &new_body, NULL, is_gimple_val, fb_rvalue);
+		    gimple_seq_add_stmt (&new_body, gimple_build_assign (v, x2));
+
+                    tree v2 = lang_hooks.types.omp_array_data (v);
+		    gcc_assert (v2);
+		    gimplify_expr (&x, &new_body, NULL, is_gimple_val, fb_rvalue);
+		    gimple_seq_add_stmt (&new_body,
+					 gimple_build_assign (v2, x));
+		    x = build_fold_addr_expr (v);
+		    gimple_seq_add_stmt (&new_body,
+					 gimple_build_assign (new_var, x));
+		  }
+		else
+		  {
+		    gimple_seq_add_stmt (&new_body,
+					 gimple_build_assign (new_var, var));
+		    new_var = lang_hooks.types.omp_array_data (new_var);
+		    gcc_assert (new_var);
+		    gimplify_expr (&x, &new_body, NULL, is_gimple_val, fb_rvalue);
+		    gimple_seq_add_stmt (&new_body,
+					 gimple_build_assign (new_var, x));
+		  }
+	      }
+	    else if (is_variable_sized (var))
 	      {
 		tree pvar = DECL_VALUE_EXPR (var);
 		gcc_assert (TREE_CODE (pvar) == INDIRECT_REF);
@@ -9964,11 +11083,45 @@
 	      {
 		tree type = TREE_TYPE (var);
 		tree new_var = lookup_decl (var, ctx);
-		if (omp_is_reference (var))
+		tree opt_arg_label = NULL_TREE;
+
+		if (omp_is_reference (var) || omp_is_optional_argument (var))
 		  {
 		    type = TREE_TYPE (type);
 		    if (TREE_CODE (type) != ARRAY_TYPE)
 		      {
+			if (omp_is_optional_argument (var))
+			  {
+			    tree null_label
+			      = create_artificial_label (UNKNOWN_LOCATION);
+			    tree notnull_label
+			      = create_artificial_label (UNKNOWN_LOCATION);
+			    opt_arg_label
+			      = create_artificial_label (UNKNOWN_LOCATION);
+			    glabel *null_glabel
+			      = gimple_build_label (null_label);
+			    glabel *notnull_glabel
+			      = gimple_build_label (notnull_label);
+			    ggoto *opt_arg_ggoto
+			      = gimple_build_goto (opt_arg_label);
+			    gcond *cond;
+
+			    gimplify_expr (&x, &new_body, NULL, is_gimple_val,
+					   fb_rvalue);
+			    splay_tree_node n_present
+				= splay_tree_lookup (ctx->present_map,
+						     (splay_tree_key) var);
+			    cond = gimple_build_cond_from_tree (
+					(tree) n_present->value,
+					notnull_label, null_label);
+			    gimple_seq_add_stmt (&new_body, cond);
+			    gimple_seq_add_stmt (&new_body, null_glabel);
+			    gimplify_assign (new_var, null_pointer_node,
+					     &new_body);
+			    gimple_seq_add_stmt (&new_body, opt_arg_ggoto);
+			    gimple_seq_add_stmt (&new_body, notnull_glabel);
+			  }
+
 			tree v = create_tmp_var_raw (type, get_name (var));
 			gimple_add_tmp_var (v);
 			TREE_ADDRESSABLE (v) = 1;
@@ -9985,6 +11138,10 @@
 		gimplify_expr (&x, &new_body, NULL, is_gimple_val, fb_rvalue);
 		gimple_seq_add_stmt (&new_body,
 				     gimple_build_assign (new_var, x));
+
+		if (opt_arg_label != NULL_TREE)
+		  gimple_seq_add_stmt (&new_body,
+				       gimple_build_label (opt_arg_label));
 	      }
 	    break;
 	  }
@@ -10079,8 +11236,14 @@
 		x = fold_convert_loc (clause_loc, type, x);
 		if (!integer_zerop (OMP_CLAUSE_SIZE (c)))
 		  {
-		    tree bias = OMP_CLAUSE_SIZE (c);
-		    if (DECL_P (bias))
+		    tree bias = OMP_CLAUSE_SIZE (c), remapped_bias;
+		    if (is_gimple_omp_oacc (ctx->stmt))
+		      {
+			if (DECL_P (bias)
+			    && (remapped_bias = maybe_lookup_decl (bias, ctx)))
+			  bias = remapped_bias;
+		      }
+		    else if (DECL_P (bias))
 		      bias = lookup_decl (bias, ctx);
 		    bias = fold_convert_loc (clause_loc, sizetype, bias);
 		    bias = fold_build1_loc (clause_loc, NEGATE_EXPR, sizetype,
@@ -10160,14 +11323,20 @@
       gimple_seq fork_seq = NULL;
       gimple_seq join_seq = NULL;
 
-      if (is_oacc_parallel (ctx))
+      if (is_oacc_parallel_or_serial (ctx))
 	{
 	  /* If there are reductions on the offloaded region itself, treat
 	     them as a dummy GANG loop.  */
 	  tree level = build_int_cst (integer_type_node, GOMP_DIM_GANG);
 
+	  gcall *private_marker = make_oacc_private_marker (ctx);
+
+	  if (private_marker)
+	    gimple_call_set_arg (private_marker, 2, level);
+
 	  lower_oacc_reductions (gimple_location (ctx->stmt), clauses, level,
-				 false, NULL, NULL, &fork_seq, &join_seq, ctx);
+				 false, NULL, private_marker, NULL, &fork_seq,
+				 &join_seq, ctx);
 	}
 
       gimple_seq_add_seq (&new_body, fork_seq);
@@ -10278,6 +11447,201 @@
 		       gimple_build_omp_return (false));
 }
 
+/* Helper to lookup dynamic array through nested omp contexts. Returns
+   TREE_LIST of dimensions, and the CTX where it was found in *CTX_P.  */
+
+static tree
+dynamic_array_lookup (tree t, omp_context **ctx_p)
+{
+  omp_context *c = *ctx_p;
+  while (c)
+    {
+      tree *dims = c->dynamic_arrays->get (t);
+      if (dims)
+	{
+	  *ctx_p = c;
+	  return *dims;
+	}
+      c = c->outer;
+    }
+  return NULL_TREE;
+}
+
+/* Tests if this gimple STMT is the start of a dynamic array access sequence.
+   Returns true if found, and also returns the gimple operand ptr and
+   dimensions tree list through *OUT_REF and *OUT_DIMS respectively.  */
+
+static bool
+dynamic_array_reference_start (gimple *stmt, omp_context **ctx_p,
+			       tree **out_ref, tree *out_dims)
+{
+  if (gimple_code (stmt) == GIMPLE_ASSIGN)
+    for (unsigned i = 1; i < gimple_num_ops (stmt); i++)
+      {
+	tree *op = gimple_op_ptr (stmt, i), dims;
+	if (TREE_CODE (*op) == ARRAY_REF)
+	  op = &TREE_OPERAND (*op, 0);
+	if (TREE_CODE (*op) == MEM_REF)
+	  op = &TREE_OPERAND (*op, 0);
+	if ((dims = dynamic_array_lookup (*op, ctx_p)) != NULL_TREE)
+	  {
+	    *out_ref = op;
+	    *out_dims = dims;
+	    return true;
+	  }
+      }
+  return false;
+}
+
+static tree
+scan_for_op (tree *tp, int *walk_subtrees, void *data)
+{
+  struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
+  tree t = *tp;
+  tree op = (tree) wi->info;
+  *walk_subtrees = 1;
+  if (operand_equal_p (t, op, 0))
+    {
+      wi->info = tp;
+      return t;
+    }
+  return NULL_TREE;
+}
+
+static tree *
+scan_for_reference (gimple *stmt, tree op)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  wi.info = op;
+  if (walk_gimple_op (stmt, scan_for_op, &wi))
+    return (tree *) wi.info;
+  return NULL;
+}
+
+static tree
+da_create_bias (tree orig_bias, tree unit_type)
+{
+  return build2 (MULT_EXPR, sizetype, fold_convert (sizetype, orig_bias),
+		 TYPE_SIZE_UNIT (unit_type));
+}
+
+/* Main worker for adjusting dynamic array accesses, handles the adjustment
+   of many cases of statement forms, and called multiple times to 'peel' away
+   each dimension.  */
+
+static gimple_stmt_iterator
+da_dimension_peel (omp_context *da_ctx,
+		   gimple_stmt_iterator da_gsi, tree orig_da,
+		   tree *da_op_p, tree *da_type_p, tree *da_dims_p)
+{
+  gimple *stmt = gsi_stmt (da_gsi);
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+
+  if (gimple_num_ops (stmt) == 2
+      && TREE_CODE (rhs) == MEM_REF
+      && operand_equal_p (*da_op_p, TREE_OPERAND (rhs, 0), 0)
+      && !operand_equal_p (orig_da, TREE_OPERAND (rhs, 0), 0)
+      && (TREE_OPERAND (rhs, 1) == NULL_TREE
+	  || integer_zerop (TREE_OPERAND (rhs, 1))))
+    {
+      gcc_assert (TREE_CODE (TREE_TYPE (*da_type_p)) == POINTER_TYPE);
+      *da_type_p = TREE_TYPE (*da_type_p);
+    }
+  else 
+    {
+      gimple *g;
+      gimple_seq ilist = NULL;
+      tree bias, t;
+      tree op = *da_op_p;
+      tree orig_type = *da_type_p;
+      tree orig_bias = TREE_PURPOSE (*da_dims_p);
+      bool by_ref = false;
+
+      if (TREE_CODE (orig_bias) != INTEGER_CST)
+	orig_bias = lookup_decl (orig_bias, da_ctx);
+
+      if (gimple_num_ops (stmt) == 2)
+	{
+	  if (TREE_CODE (rhs) == ADDR_EXPR)
+	    {
+	      rhs = TREE_OPERAND (rhs, 0);
+	      *da_dims_p = NULL_TREE;
+	    }
+
+	  if (TREE_CODE (rhs) == ARRAY_REF
+	      && TREE_CODE (TREE_OPERAND (rhs, 0)) == MEM_REF
+	      && operand_equal_p (TREE_OPERAND (TREE_OPERAND (rhs, 0), 0),
+				  *da_op_p, 0))
+	    {
+	      bias = da_create_bias (orig_bias,
+				     TREE_TYPE (TREE_TYPE (orig_type)));
+	      *da_type_p = TREE_TYPE (TREE_TYPE (orig_type));
+	    }
+	  else if (TREE_CODE (rhs) == ARRAY_REF
+		   && TREE_CODE (TREE_OPERAND (rhs, 0)) == VAR_DECL
+		   && operand_equal_p (TREE_OPERAND (rhs, 0), *da_op_p, 0))
+	    {
+	      tree ptr_type = build_pointer_type (orig_type);
+	      op = create_tmp_var (ptr_type);
+	      gimplify_assign (op, build_fold_addr_expr (TREE_OPERAND (rhs, 0)),
+			       &ilist);
+	      bias = da_create_bias (orig_bias, TREE_TYPE (orig_type));
+	      *da_type_p = TREE_TYPE (orig_type);
+	      orig_type = ptr_type;
+	      by_ref = true;
+	    }
+	  else if (TREE_CODE (rhs) == MEM_REF
+		   && operand_equal_p (*da_op_p, TREE_OPERAND (rhs, 0), 0)
+		   && TREE_OPERAND (rhs, 1) != NULL_TREE)
+	    {
+	      bias = da_create_bias (orig_bias, TREE_TYPE (orig_type));
+	      *da_type_p = TREE_TYPE (orig_type);
+	    }
+	  else if (TREE_CODE (lhs) == MEM_REF
+		   && operand_equal_p (*da_op_p, TREE_OPERAND (lhs, 0), 0))
+	    {
+	      if (*da_dims_p != NULL_TREE)
+		{
+		  gcc_assert (TREE_CHAIN (*da_dims_p) == NULL_TREE);
+		  bias = da_create_bias (orig_bias, TREE_TYPE (orig_type));
+		  *da_type_p = TREE_TYPE (orig_type);
+		}
+	      else
+		/* This should be the end of the dynamic array access
+		   sequence.  */
+		return da_gsi;
+	    }
+	  else
+	    gcc_unreachable ();
+	}
+      else if (gimple_num_ops (stmt) == 3
+	       && gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR
+	       && operand_equal_p (*da_op_p, rhs, 0))
+	{
+	  bias = da_create_bias (orig_bias, TREE_TYPE (orig_type));
+	}
+      else
+	gcc_unreachable ();
+
+      bias = fold_build1 (NEGATE_EXPR, sizetype, bias);
+      bias = fold_build2 (POINTER_PLUS_EXPR, orig_type, op, bias);
+
+      t = create_tmp_var (by_ref ? build_pointer_type (orig_type) : orig_type);
+
+      g = gimplify_assign (t, bias, &ilist);
+      gsi_insert_seq_before (&da_gsi, ilist, GSI_NEW_STMT);
+      *da_op_p = gimple_assign_lhs (g);
+
+      if (by_ref)
+	*da_op_p = build2 (MEM_REF, TREE_TYPE (orig_type), *da_op_p,
+			   build_int_cst (orig_type, 0));
+      *da_dims_p = TREE_CHAIN (*da_dims_p);
+    }
+
+  return da_gsi;
+}
 
 /* Callback for lower_omp_1.  Return non-NULL if *tp needs to be
    regimplified.  If DATA is non-NULL, lower_omp_1 is outside
@@ -10422,6 +11786,9 @@
 		 ctx);
       break;
     case GIMPLE_BIND:
+      if (ctx && is_gimple_omp_oacc (ctx->stmt))
+	oacc_record_vars_in_bind (ctx,
+				  gimple_bind_vars (as_a <gbind *> (stmt)));
       lower_omp (gimple_bind_body_ptr (as_a <gbind *> (stmt)), ctx);
       maybe_remove_omp_member_access_dummy_vars (as_a <gbind *> (stmt));
       break;
@@ -10551,6 +11918,51 @@
 	  }
       /* FALLTHRU */
     default:
+
+      /* If we detect the start of a dynamic array reference sequence, scan
+	 and do the needed adjustments.  */
+      tree da_dims, *da_op_p;
+      omp_context *da_ctx = ctx;
+      if (da_ctx && dynamic_array_reference_start (stmt, &da_ctx,
+						   &da_op_p, &da_dims))
+	{
+	  bool started = false;
+	  tree orig_da = *da_op_p;
+	  tree da_type = TREE_TYPE (orig_da);
+	  tree next_da_op;
+
+	  gimple_stmt_iterator da_gsi = *gsi_p, new_gsi;
+	  while (da_op_p)
+	    {
+	      if (!is_gimple_assign (gsi_stmt (da_gsi))
+		  || ((gimple_assign_single_p (gsi_stmt (da_gsi))
+		       || gimple_assign_cast_p (gsi_stmt (da_gsi)))
+		      && *da_op_p == gimple_assign_rhs1 (gsi_stmt (da_gsi))))
+		break;
+
+	      new_gsi = da_dimension_peel (da_ctx, da_gsi, orig_da,
+					   da_op_p, &da_type, &da_dims);
+	      if (!started)
+		{
+		  /* Point 'stmt' to the start of the newly added
+		     sequence.  */
+		  started = true;
+		  *gsi_p = new_gsi;
+		  stmt = gsi_stmt (*gsi_p);
+		}
+	      if (!da_dims)
+		break;
+
+	      next_da_op = gimple_assign_lhs (gsi_stmt (da_gsi));
+
+	      do {
+		gsi_next (&da_gsi);
+		da_op_p = scan_for_reference (gsi_stmt (da_gsi), next_da_op);
+	      }
+	      while (!da_op_p);
+	    }
+	}
+
       if ((ctx || task_shared_vars)
 	  && walk_gimple_op (stmt, lower_omp_regimplify_p,
 			     ctx ? NULL : &wi))
diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
new file mode 100644
index 0000000..a6c4220
--- /dev/null
+++ b/gcc/omp-oacc-kernels.c
@@ -0,0 +1,1596 @@
+/* Transformation pass for OpenACC kernels regions.  Converts a kernels
+   region into a series of smaller parallel regions.  There is a parallel
+   region for each parallelizable loop nest, as well as a "gang-single"
+   parallel region for each non-parallelizable piece of code.
+
+   Contributed by Gergö Barany <gergo@codesourcery.com> and
+                  Thomas Schwinge <thomas@codesourcery.com>
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "tree.h"
+#include "cp/cp-tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "cgraph.h"
+#include "fold-const.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "gomp-constants.h"
+#include "omp-general.h"
+
+/* This is a preprocessing pass to be run immediately before lower_omp.  It
+   will convert OpenACC "kernels" regions into sequences of "parallel"
+   regions.
+   For now, the translation is as follows:
+   - The entire kernels region is turned into a data region with clauses
+     taken from the kernels region.  New "create" clauses are added for all
+     variables declared at the top level in the kernels region.
+   - Any loop annotated with an OpenACC loop directive is wrapped in a new
+     parallel region.  Gang/worker/vector annotations are copied from the
+     original kernels region if present.
+     * Loops without an explicit "independent" or "seq" annotation get an
+       "auto" annotation; other annotations are preserved on the loop or
+       moved to the new surrounding parallel region.  Which annotations are
+       moved is determined by the constraints in the OpenACC spec; for
+       example, loops in the kernels region may have a gang clause, but
+       such annotations must now be moved to the new parallel region.
+   - Any sequences of other code (non-loops, non-OpenACC loops) are wrapped
+     in new "gang-single" parallel regions: Worker/vector annotations are
+     copied from the original kernels region if present, but num_gangs is
+     explicitly set to 1.
+   - Both points above only apply at the topmost level in the region, i.e.,
+     the transformation does not introduce new parallel regions inside
+     nested statement bodies.  In particular, this means that a
+     gang-parallelizable loop inside an if statement is "gang-serialized" by
+     the transformation.
+     The transformation visits loops inside such new gang-single-regions and
+     removes and warns about any gang annotations.
+   - In order to make the host wait only once for the whole region instead
+     of once per kernel launch, the new parallel and serial regions are
+     annotated async.  Unless the original kernels region was marked async,
+     the entire region ends with a wait construct.  If the original kernels
+     region was marked async, the generated async statements use the async
+     queue the kernels region was annotated with (possibly implicitly).  */
+
+/* Helper function for decompose_kernels_region_body.  If STMT contains a
+   "top-level" OMP_FOR statement, returns a pointer to that statement;
+   returns NULL otherwise.
+
+   A "top-level" OMP_FOR statement is one that is possibly accompanied by
+   small snippets of setup code.  Specifically, this function accepts an
+   OMP_FOR possibly wrapped in a singleton bind and a singleton try
+   statement to allow for a local loop variable, but not an OMP_FOR
+   statement nested in any other constructs.  Alternatively, it accepts a
+   non-singleton bind containing only assignments and then an OMP_FOR
+   statement at the very end.  The former style can be generated by the C
+   frontend, the latter by the Fortran frontend.  */
+
+static gimple *
+top_level_omp_for_in_stmt (gimple *stmt)
+{
+  if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+    return stmt;
+
+  if (gimple_code (stmt) == GIMPLE_BIND)
+    {
+      gimple_seq body = gimple_bind_body (as_a <gbind *> (stmt));
+      if (gimple_seq_singleton_p (body))
+        {
+          /* Accept an OMP_FOR statement, or a try statement containing only
+             a single OMP_FOR.  */
+          gimple *maybe_for_or_try = gimple_seq_first_stmt (body);
+          if (gimple_code (maybe_for_or_try) == GIMPLE_OMP_FOR)
+            return maybe_for_or_try;
+          else if (gimple_code (maybe_for_or_try) == GIMPLE_TRY)
+            {
+              gimple_seq try_body = gimple_try_eval (maybe_for_or_try);
+              if (!gimple_seq_singleton_p (try_body))
+                return NULL;
+              gimple *maybe_omp_for_stmt = gimple_seq_first_stmt (try_body);
+              if (gimple_code (maybe_omp_for_stmt) == GIMPLE_OMP_FOR)
+                return maybe_omp_for_stmt;
+            }
+        }
+      else
+        {
+          gimple_stmt_iterator gsi;
+          /* Accept only a block of optional assignments followed by an
+             OMP_FOR at the end.  No other kinds of statements allowed.  */
+          for (gsi = gsi_start (body); !gsi_end_p (gsi); gsi_next (&gsi))
+            {
+              gimple *body_stmt = gsi_stmt (gsi);
+              if (gimple_code (body_stmt) == GIMPLE_ASSIGN)
+                continue;
+              else if (gimple_code (body_stmt) == GIMPLE_OMP_FOR
+                        && gsi_one_before_end_p (gsi))
+                return body_stmt;
+              else
+                return NULL;
+            }
+        }
+    }
+
+  return NULL;
+}
+
+/* Helper for adjust_region_code: evaluate the statement at GSI_P.  */
+
+static tree
+adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
+				 bool *handled_ops_p,
+				 struct walk_stmt_info *wi)
+{
+  int *region_code = (int *) wi->info;
+
+  gimple *stmt = gsi_stmt (*gsi_p);
+  switch (gimple_code (stmt))
+    {
+    case GIMPLE_OMP_FOR:
+      {
+	tree clauses = gimple_omp_for_clauses (stmt);
+	if (omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT))
+	  {
+	    /* Explicit 'independent' clause.  */
+	    /* Keep going; recurse into loop body.  */
+	    break;
+	  }
+	else if (omp_find_clause (clauses, OMP_CLAUSE_SEQ))
+	  {
+	    /* Explicit 'seq' clause.  */
+	    /* We'll "parallelize" if at some level a loop construct has been
+	       marked up by the user as unparallelizable ('seq' clause; we'll
+	       respect that in the later processing).  Given that the user has
+	       explicitly marked it up, this loop construct cannot be
+	       performance-critical (and we thus don't have to "avoid
+	       offloading"), and in this case it's also fine to "parallelize"
+	       instead of "gang-single", because any outer or inner loops may
+	       still exploit the available parallelism.  */
+	    /* Keep going; recurse into loop body.  */
+	    break;
+	  }
+	else
+	  {
+	    /* Explicit or implicit 'auto' clause.  */
+	    /* The user would like this loop analyzed ('auto' clause) and
+	       typically parallelized, but we don't have available yet the
+	       compiler logic to analyze this, so can't parallelize it here, so
+	       we'd very likely be running into a performance problem if we
+	       were to execute this unparallelized, thus forward the whole loop
+	       nest to "parloops".  */
+	    *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+	    /* Terminate: final decision for this region.  */
+	    *handled_ops_p = true;
+	    return integer_zero_node;
+	  }
+	gcc_unreachable ();
+      }
+
+    case GIMPLE_COND:
+    case GIMPLE_GOTO:
+    case GIMPLE_SWITCH:
+    case GIMPLE_ASM:
+    case GIMPLE_TRANSACTION:
+    case GIMPLE_RETURN:
+      /* Statement that might constitute some looping/control flow pattern.  */
+      /* The user would like this code analyzed (implicit inside a 'kernels'
+	 region) and typically parallelized, but we don't have available yet
+	 the compiler logic to analyze this, so can't parallelize it here, so
+	 we'd very likely be running into a performance problem if we were to
+	 execute this unparallelized, thus forward the whole thing to
+	 "parloops".  */
+      *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
+      /* Terminate: final decision for this region.  */
+      *handled_ops_p = true;
+      return integer_zero_node;
+
+    default:
+      /* Keep going.  */
+      break;
+    }
+
+  return NULL;
+}
+
+/* Adjust the REGION_CODE for the region in GS.  */
+
+static void
+adjust_region_code (gimple_seq gs, int *region_code)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  wi.info = region_code;
+  walk_gimple_seq (gs, adjust_region_code_walk_stmt_fn, NULL, &wi);
+}
+
+/* Helper function for make_loops_gang_single for walking the tree.  If the
+   statement indicated by GSI_P is an OpenACC for loop with a gang clause,
+   issue a warning and remove the clause.  */
+
+static tree
+visit_loops_in_gang_single_region (gimple_stmt_iterator *gsi_p,
+                                   bool *handled_ops_p,
+                                   struct walk_stmt_info *)
+{
+  gimple *stmt = gsi_stmt (*gsi_p);
+  tree clauses = NULL, prev_clause = NULL;
+  *handled_ops_p = false;
+
+  switch (gimple_code (stmt))
+    {
+    case GIMPLE_OMP_FOR:
+      clauses = gimple_omp_for_clauses (stmt);
+      for (tree clause = clauses; clause; clause = OMP_CLAUSE_CHAIN (clause))
+        {
+          if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_GANG)
+            {
+              /* It makes no sense to have a gang clause in a gang-single
+                 region, so remove it and warn.  */
+              warning_at (gimple_location (stmt), 0,
+                          "conditionally executed loop in kernels region"
+                          " will be executed in a single gang;"
+                          " ignoring %<gang%> clause");
+              if (prev_clause != NULL)
+                OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause);
+              else
+                clauses = OMP_CLAUSE_CHAIN (clause);
+
+              break;
+            }
+          prev_clause = clause;
+        }
+      gimple_omp_for_set_clauses (stmt, clauses);
+      /* No need to recurse into nested statements; no loop nested inside
+         this loop can be gang-partitioned.  */
+      sorry ("'gang' loop in \"gang-single\" region");
+      *handled_ops_p = true;
+      break;
+
+    default:
+      break;
+    }
+
+  return NULL;
+}
+
+/* Visit all nested OpenACC loops in the sequence indicated by GS.  This
+   statement is expected to be inside a gang-single region.  Issue a warning
+   for any loops inside it that have gang clauses and remove the clauses.  */
+
+static void
+make_loops_gang_single (gimple_seq gs)
+{
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  walk_gimple_seq (gs, visit_loops_in_gang_single_region, NULL, &wi);
+}
+
+/* Construct a "gang-single" OpenACC parallel region at LOC containing the
+   STMTS.  The newly created region is annotated with CLAUSES, which must
+   not contain a num_gangs clause, and an additional "num_gangs(1)" clause
+   to force gang-single execution.  */
+
+static gimple *
+make_region_seq (location_t loc, gimple_seq stmts,
+		 tree num_gangs_clause,
+		 tree num_workers_clause,
+		 tree vector_length_clause,
+		 tree clauses)
+{
+  /* This correctly unshares the entire clause chain rooted here.  */
+  clauses = unshare_expr (clauses);
+
+  dump_user_location_t loc_stmts_first = gimple_seq_first (stmts);
+
+  /* Figure out the region code for this region.  */
+  /* Optimistic default: assume "setup code", no looping; thus not
+     performance-critical.  */
+  int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
+  adjust_region_code (stmts, &region_code);
+
+  if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+			 "beginning \"gang-single\" region in OpenACC"
+			 " 'kernels' construct\n");
+
+      /* Make a num_gangs(1) clause.  */
+      tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
+      OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
+      OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
+      clauses = gang_single_clause;
+
+      /* Remove and issue warnings about gang clauses on any OpenACC
+	 loops nested inside this sequentially executed statement.  */
+      make_loops_gang_single (stmts);
+    }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc_stmts_first,
+			 "beginning \"parloops\" region in OpenACC"
+			 " 'kernels' construct\n");
+
+      /* As we're transforming a "GF_OMP_TARGET_KIND_OACC_KERNELS" into another
+	 "GF_OMP_TARGET_KIND_OACC_KERNELS", this isn't doing any of the clauses
+	 mangling that "make_region_loop_nest" is doing.  */
+      /* Re-assemble the clauses stripped off earlier.  */
+      if (num_gangs_clause != NULL)
+	{
+	  tree c = unshare_expr (num_gangs_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+      if (num_workers_clause != NULL)
+	{
+	  tree c = unshare_expr (num_workers_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+      if (vector_length_clause != NULL)
+	{
+	  tree c = unshare_expr (vector_length_clause);
+	  OMP_CLAUSE_CHAIN (c) = clauses;
+	  clauses = c;
+	}
+    }
+  else
+    gcc_unreachable ();
+
+  /* Build the gang-single region.  */
+  gimple *single_region = gimple_build_omp_target (NULL, region_code, clauses);
+  gimple_set_location (single_region, loc);
+  gbind *single_body = gimple_build_bind (NULL, stmts, make_node (BLOCK));
+  gimple_omp_set_body (single_region, single_body);
+
+  return single_region;
+}
+
+/* Helper function for make_region_loop_nest.  Adds a num_gangs
+   (num_workers, vector_length) clause to the given CLAUSES, either the one
+   from the parent region (PARENT_CLAUSE) or a new one based on the loop's
+   own LOOP_CLAUSE ("gang(num: N)" or similar for workers or vectors) with
+   the given CLAUSE_CODE.  Does nothing if neither PARENT_CLAUSE nor
+   LOOP_CLAUSE exist.  Returns the new clauses.  */
+
+static tree
+add_parent_or_loop_num_clause (tree parent_clause, tree loop_clause,
+                               omp_clause_code clause_code, tree clauses)
+{
+  if (parent_clause != NULL)
+    {
+      tree num_clause = unshare_expr (parent_clause);
+      OMP_CLAUSE_CHAIN (num_clause) = clauses;
+      clauses = num_clause;
+    }
+  else if (loop_clause != NULL)
+    {
+      /* The kernels region does not have a "num_gangs" clause, but the loop
+         itself had a "gang(num: N)" clause.  Honor it by adding a
+         "num_gangs(N)" clause on the parallel region.  */
+      tree num = OMP_CLAUSE_OPERAND (loop_clause, 0);
+      tree new_num_clause
+        = build_omp_clause (OMP_CLAUSE_LOCATION (loop_clause), clause_code);
+      OMP_CLAUSE_OPERAND (new_num_clause, 0) = num;
+      OMP_CLAUSE_CHAIN (new_num_clause) = clauses;
+      clauses = new_num_clause;
+    }
+  return clauses;
+}
+
+/* Helper for make_region_loop_nest, looking for "worker(num: N)"
+   or "vector(length: N)" clauses in nested loops.  Removes the numeric
+   argument, transferring it to the enclosing parallel region (via
+   WI->INFO).  If numeric arguments within the same loop nest conflict,
+   emits a warning.
+
+   This function also decides whether to add an auto clause on each of these
+   nested loops.  It adds an auto clause unless there is already an
+   independent/seq/auto clause or a gang/worker/vector annotation.  */
+
+struct adjust_nested_loop_clauses_wi_info
+{
+  tree *loop_gang_clause_ptr;
+  tree *loop_worker_clause_ptr;
+  tree *loop_vector_clause_ptr;
+};
+
+static tree
+adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *,
+                            struct walk_stmt_info *wi)
+{
+  struct adjust_nested_loop_clauses_wi_info *wi_info
+    = (struct adjust_nested_loop_clauses_wi_info *) wi->info;
+  gimple *stmt = gsi_stmt (*gsi_p);
+
+  if (gimple_code (stmt) == GIMPLE_OMP_FOR)
+    {
+      bool add_auto_clause = true;
+      tree loop_clauses = gimple_omp_for_clauses (stmt);
+      tree loop_clause = loop_clauses;
+      for (; loop_clause; loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
+        {
+          tree *outer_clause_ptr = NULL;
+          switch (OMP_CLAUSE_CODE (loop_clause))
+            {
+              case OMP_CLAUSE_GANG:
+                outer_clause_ptr = wi_info->loop_gang_clause_ptr;
+                break;
+              case OMP_CLAUSE_WORKER:
+                outer_clause_ptr = wi_info->loop_worker_clause_ptr;
+                break;
+              case OMP_CLAUSE_VECTOR:
+                outer_clause_ptr = wi_info->loop_vector_clause_ptr;
+                break;
+              case OMP_CLAUSE_INDEPENDENT:
+              case OMP_CLAUSE_SEQ:
+              case OMP_CLAUSE_AUTO:
+                add_auto_clause = false;
+              default:
+                break;
+            }
+          if (outer_clause_ptr != NULL)
+            {
+              if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL
+                  && *outer_clause_ptr == NULL)
+                {
+                  /* Transfer the clause to the enclosing parallel region
+                     and remove the numerical argument from the loop.  */
+                  *outer_clause_ptr = unshare_expr (loop_clause);
+                  OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
+                }
+              else if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL &&
+                       OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0) != NULL)
+                {
+                  /* See if both of these are the same constant.  If they
+                     aren't, emit a warning.  */
+                  tree old_op = OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0);
+                  tree new_op = OMP_CLAUSE_OPERAND (loop_clause, 0);
+                  if (!(cst_and_fits_in_hwi (old_op) &&
+                        cst_and_fits_in_hwi (new_op) &&
+                        int_cst_value (old_op) == int_cst_value (new_op)))
+                    {
+                      const char *clause_name
+                        = omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
+                      error_at (gimple_location (stmt),
+                                "cannot honor conflicting %qs annotation",
+                                clause_name);
+                      inform (OMP_CLAUSE_LOCATION (*outer_clause_ptr),
+                              "location of the previous annotation "
+                              "in the same loop nest");
+                    }
+                  OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
+                }
+            }
+        }
+      if (add_auto_clause)
+        {
+          tree auto_clause
+            = build_omp_clause (gimple_location (stmt), OMP_CLAUSE_AUTO);
+          OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
+          gimple_omp_for_set_clauses (stmt, auto_clause);
+        }
+    }
+
+  return NULL;
+}
+
+/* Helper for make_region_loop_nest.  Transform OpenACC 'kernels'/'loop'
+   construct clauses into OpenACC 'parallel'/'loop' construct ones.  */
+
+static tree
+transform_kernels_loop_clauses (gimple *omp_for,
+				tree num_gangs_clause,
+				tree num_workers_clause,
+				tree vector_length_clause,
+				tree clauses)
+{
+  /* If this loop in a kernels region does not have an explicit
+     "independent", "seq", or "auto" clause, we must give it an explicit
+     "auto" clause.
+     We also check for "gang(num: N)" clauses.  These must not appear in
+     kernels regions that have their own "num_gangs" clause.  Otherwise, they
+     must be converted and put on the region; similarly for workers and
+     vectors.  */
+  bool add_auto_clause = true;
+  tree loop_gang_clause = NULL, loop_worker_clause = NULL,
+       loop_vector_clause = NULL;
+  tree loop_clauses = gimple_omp_for_clauses (omp_for);
+  for (tree loop_clause = loop_clauses;
+       loop_clause;
+       loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
+    {
+      /* Look for gang, worker, and vector clauses.  */
+      bool found_num_clause = false;
+      tree *clause_ptr, clause_to_check;
+      switch (OMP_CLAUSE_CODE (loop_clause))
+         {
+          case OMP_CLAUSE_GANG:
+            found_num_clause = true;
+            clause_ptr = &loop_gang_clause;
+            clause_to_check = num_gangs_clause;
+            break;
+          case OMP_CLAUSE_WORKER:
+            found_num_clause = true;
+            clause_ptr = &loop_worker_clause;
+            clause_to_check = num_workers_clause;
+            break;
+          case OMP_CLAUSE_VECTOR:
+            found_num_clause = true;
+            clause_ptr = &loop_vector_clause;
+            clause_to_check = vector_length_clause;
+            break;
+          case OMP_CLAUSE_INDEPENDENT:
+          case OMP_CLAUSE_SEQ:
+          case OMP_CLAUSE_AUTO:
+            add_auto_clause = false;
+          default:
+            break;
+        }
+      if (found_num_clause && OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL)
+        {
+          if (clause_to_check)
+            {
+              const char *clause_name
+                = omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
+              const char *parent_clause_name
+                = omp_clause_code_name[OMP_CLAUSE_CODE (clause_to_check)];
+              error_at (OMP_CLAUSE_LOCATION (loop_clause),
+                        "argument not permitted on %qs clause"
+                        " in OpenACC %<kernels%> region with a %qs clause",
+                        clause_name, parent_clause_name);
+              inform (OMP_CLAUSE_LOCATION (clause_to_check),
+                      "location of OpenACC %<kernels%> region");
+            }
+          /* Copy the gang(N)/worker(N)/vector(N) clause to the enclosing
+             parallel region.  */
+          *clause_ptr = unshare_expr (loop_clause);
+          OMP_CLAUSE_CHAIN (*clause_ptr) = NULL;
+          /* Leave a gang/worker/vector clause on the loop, but without a
+             numeric argument.  */
+          OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
+         }
+     }
+  if (add_auto_clause)
+    {
+      tree auto_clause = build_omp_clause (gimple_location (omp_for),
+                                           OMP_CLAUSE_AUTO);
+      OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
+      loop_clauses = auto_clause;
+    }
+  gimple_omp_for_set_clauses (omp_for, loop_clauses);
+  /* We must also recurse into the loop; it might contain nested loops
+     having their own "worker(num: W)" or "vector(length: V)" annotations.
+     Turn these into worker/vector annotations on the parallel region.  */
+  struct walk_stmt_info wi;
+  memset (&wi, 0, sizeof (wi));
+  struct adjust_nested_loop_clauses_wi_info wi_info;
+  wi_info.loop_gang_clause_ptr = &loop_gang_clause;
+  wi_info.loop_worker_clause_ptr = &loop_worker_clause;
+  wi_info.loop_vector_clause_ptr = &loop_vector_clause;
+  wi.info = &wi_info;
+  gimple *body = gimple_omp_body (omp_for);
+  walk_gimple_seq (body, adjust_nested_loop_clauses, NULL, &wi);
+  /* Check if there were conflicting numbers of workers or vector lanes.  */
+  if (loop_gang_clause != NULL &&
+      OMP_CLAUSE_OPERAND (loop_gang_clause, 0) == NULL)
+    loop_gang_clause = NULL;
+  if (loop_worker_clause != NULL &&
+      OMP_CLAUSE_OPERAND (loop_worker_clause, 0) == NULL)
+    loop_worker_clause = NULL;
+  if (loop_vector_clause != NULL &&
+      OMP_CLAUSE_OPERAND (loop_vector_clause, 0) == NULL)
+    vector_length_clause = NULL;
+
+  /* If the kernels region had num_gangs, num_worker, vector_length clauses,
+     add these to this new parallel region.  */
+  clauses
+    = add_parent_or_loop_num_clause (num_gangs_clause, loop_gang_clause,
+				     OMP_CLAUSE_NUM_GANGS, clauses);
+  clauses
+    = add_parent_or_loop_num_clause (num_workers_clause, loop_worker_clause,
+				     OMP_CLAUSE_NUM_WORKERS, clauses);
+  clauses
+    = add_parent_or_loop_num_clause (vector_length_clause, loop_vector_clause,
+				     OMP_CLAUSE_VECTOR_LENGTH, clauses);
+
+  return clauses;
+}
+
+/* Construct a possibly gang-parallel OpenACC parallel region containing the
+   STMT, which must be identical to, or a bind containing, the loop OMP_FOR
+   with OpenACC loop annotations.
+
+   The NUM_GANGS_CLAUSE, NUM_WORKERS_CLAUSE, and VECTOR_LENGTH_CLAUSE are
+   optional clauses from the original kernels region and must not be
+   contained in the other CLAUSES. The newly created region is annotated
+   with the optional NUM_GANGS_CLAUSE as well as the other CLAUSES. If there
+   is no NUM_GANGS_CLAUSE but the loop has a "gang(num: N)" clause, that is
+   converted to a "num_gangs(N)" clause on the new region, and similarly for
+   workers and vectors.
+
+   The outermost loop gets an auto clause unless there already is an
+   independent/seq/auto clause or a gang/worker/vector annotation.  Nested
+   loops inside OMP_FOR are treated similarly by the
+   adjust_nested_loop_clauses function.  */
+
+static gimple *
+make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
+		       tree num_gangs_clause,
+		       tree num_workers_clause,
+		       tree vector_length_clause,
+		       tree clauses)
+{
+  /* This correctly unshares the entire clause chain rooted here.  */
+  clauses = unshare_expr (clauses);
+
+  /* Figure out the region code for this region.  */
+  /* Optimistic default: assume that the loop nest is parallelizable
+     (essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause,
+     and no un-annotated loops).  */
+  int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
+  adjust_region_code (stmts, &region_code);
+
+  if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+			 "parallelized loop nest in OpenACC 'kernels'"
+			 " construct\n");
+
+      clauses = transform_kernels_loop_clauses (omp_for,
+						num_gangs_clause,
+						num_workers_clause,
+						vector_length_clause,
+						clauses);
+    }
+  else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+			 "forwarded loop nest in OpenACC 'kernels' construct"
+			 " to \"parloops\" for analysis\n");
+
+      /* We're transforming one "GF_OMP_TARGET_KIND_OACC_KERNELS" into another
+	 "GF_OMP_TARGET_KIND_OACC_KERNELS", so don't have to
+	 "transform_kernels_loop_clauses".  */
+      /* Re-assemble the clauses stripped off earlier.  */
+      clauses
+	= add_parent_or_loop_num_clause (num_gangs_clause, NULL,
+					 OMP_CLAUSE_NUM_GANGS, clauses);
+      clauses
+	= add_parent_or_loop_num_clause (num_workers_clause, NULL,
+					 OMP_CLAUSE_NUM_WORKERS, clauses);
+      clauses
+	= add_parent_or_loop_num_clause (vector_length_clause, NULL,
+					 OMP_CLAUSE_VECTOR_LENGTH, clauses);
+    }
+  else
+    gcc_unreachable ();
+
+  /* Now build the parallel region containing this loop.  */
+  gimple *parallel_body_bind
+    = gimple_build_bind (NULL, stmts, make_node (BLOCK));
+  gimple *parallel_region
+    = gimple_build_omp_target (parallel_body_bind, region_code, clauses);
+  gimple_set_location (parallel_region, gimple_location (omp_for));
+
+  return parallel_region;
+}
+
+/* Eliminate any binds directly inside BIND by adding their statements to
+   BIND (i.e., modifying it in place), excluding binds that hold only an
+   OMP_FOR loop and associated setup/cleanup code.  Recurse into binds but
+   not other statements.  Return a chain of the local variables of eliminated
+   binds, i.e., the local variables found in nested binds.  If
+   INCLUDE_TOPLEVEL_VARS is true, this also includes the variables belonging
+   to BIND itself. */
+
+static tree
+flatten_binds (gbind *bind, bool include_toplevel_vars = false)
+{
+  tree vars = NULL, last_var = NULL;
+
+  if (include_toplevel_vars)
+    {
+      vars = gimple_bind_vars (bind);
+      last_var = vars;
+    }
+
+  gimple_seq new_body = NULL;
+  gimple_seq body_sequence = gimple_bind_body (bind);
+  gimple_stmt_iterator gsi, gsi_n;
+  for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n)
+    {
+      /* Advance the iterator here because otherwise it would be invalidated
+         by moving statements below.  */
+      gsi_n = gsi;
+      gsi_next (&gsi_n);
+
+      gimple *stmt = gsi_stmt (gsi);
+      /* Flatten bind statements, except the ones that contain only an
+         OpenACC for loop.  */
+      if (gimple_code (stmt) == GIMPLE_BIND
+          && !top_level_omp_for_in_stmt (stmt))
+        {
+          gbind *inner_bind = as_a <gbind *> (stmt);
+          /* Flatten recursively, and collect all variables.  */
+          tree inner_vars = flatten_binds (inner_bind, true);
+          gimple_seq inner_sequence = gimple_bind_body (inner_bind);
+          gcc_assert (gimple_code (inner_sequence) != GIMPLE_BIND
+                      || top_level_omp_for_in_stmt (inner_sequence));
+          gimple_seq_add_seq (&new_body, inner_sequence);
+          /* Find the last variable; we will append others to it.  */
+          while (last_var != NULL && TREE_CHAIN (last_var) != NULL)
+            last_var = TREE_CHAIN (last_var);
+          if (last_var != NULL)
+            {
+              TREE_CHAIN (last_var) = inner_vars;
+              last_var = inner_vars;
+            }
+          else
+            {
+              vars = inner_vars;
+              last_var = vars;
+            }
+        }
+      else
+        gimple_seq_add_stmt (&new_body, stmt);
+    }
+
+  /* Put the possibly transformed body back into the bind.  */
+  gimple_bind_set_body (bind, new_body);
+  return vars;
+}
+
+/* Recursively search BODY_SEQUENCE for 'for' loops, and record their loop
+   indices in IDX_VARS.  */
+
+static void
+find_omp_for_index_vars_1 (gimple_seq body_sequence, hash_set<tree> *idx_vars)
+{
+  gimple_stmt_iterator gsi;
+
+  for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      gimple *for_stmt = top_level_omp_for_in_stmt (stmt);
+
+      if (for_stmt)
+        {
+	  tree idx = gimple_omp_for_index (for_stmt, 0);
+	  idx_vars->add (idx);
+	  find_omp_for_index_vars_1 (gimple_omp_body (for_stmt), idx_vars);
+	}
+      else if (gimple_code (stmt) == GIMPLE_BIND)
+	find_omp_for_index_vars_1 (gimple_bind_body (as_a <gbind *> (stmt)),
+				   idx_vars);
+    }
+}
+
+/* Find all loop index variables in a bind.  */
+
+static hash_set<tree>
+find_omp_for_index_vars (gbind *bind)
+{
+  hash_set<tree> idx_vars;
+
+  find_omp_for_index_vars_1 (gimple_bind_body (bind), &idx_vars);
+
+  return idx_vars;
+}
+
+/* Helper function for places where we construct data regions.  Wraps the BODY
+   inside a try-finally construct at LOC that calls __builtin_GOACC_data_end
+   in its cleanup block.  Returns this try statement.  */
+
+static gimple *
+make_data_region_try_statement (location_t loc, gimple *body)
+{
+  tree data_end_fn = builtin_decl_explicit (BUILT_IN_GOACC_DATA_END);
+  gimple *call = gimple_build_call (data_end_fn, 0);
+  gimple_seq cleanup = NULL;
+  gimple_seq_add_stmt (&cleanup, call);
+  gimple *try_stmt = gimple_build_try (body, cleanup, GIMPLE_TRY_FINALLY);
+  gimple_set_location (body, loc);
+  return try_stmt;
+}
+
+/* If INNER_BIND_VARS holds variables, build an OpenACC data region with
+   location LOC containing BODY and having "create(var)" clauses for each
+   variable (such variables are also made addressable as a side effect).  If
+   INNER_CLEANUP is present, add a try-finally statement with this cleanup
+   code in the finally block.  Return the new data region, or the original
+   BODY if no data region was needed.  */
+
+static gimple *
+maybe_build_inner_data_region (location_t loc, gimple *body,
+			       tree inner_bind_vars, gimple *inner_cleanup,
+			       hash_set<tree> *idx_vars)
+{
+  /* Build data "create(var)" clauses for these local variables.
+     Below we will add these to a data region enclosing the entire body
+     of the decomposed kernels region.  */
+  tree prev_mapped_var = NULL, next = NULL, artificial_vars = NULL,
+       inner_data_clauses = NULL;
+  for (tree v = inner_bind_vars; v; v = next)
+    {
+      next = TREE_CHAIN (v);
+      if (DECL_ARTIFICIAL (v)
+          || TREE_CODE (v) == CONST_DECL
+          || (DECL_LANG_SPECIFIC (current_function_decl)
+              && DECL_TEMPLATE_INSTANTIATION (current_function_decl)))
+        {
+          /* If this is an artificial temporary, it need not be mapped.  We
+             move its declaration into the bind inside the data region.
+             Also avoid mapping variables if we are inside a template
+             instantiation; the code does not contain all the copies to
+             temporaries that would make this legal.  */
+          TREE_CHAIN (v) = artificial_vars;
+          artificial_vars = v;
+          if (prev_mapped_var != NULL)
+            TREE_CHAIN (prev_mapped_var) = next;
+          else
+            inner_bind_vars = next;
+        }
+      else if (!idx_vars->contains (v))
+        {
+          /* Otherwise, build the map clause.  */
+          tree new_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
+          OMP_CLAUSE_SET_MAP_KIND (new_clause, GOMP_MAP_ALLOC);
+          OMP_CLAUSE_DECL (new_clause) = v;
+          OMP_CLAUSE_SIZE (new_clause) = DECL_SIZE_UNIT (v);
+          OMP_CLAUSE_CHAIN (new_clause) = inner_data_clauses;
+	  TREE_ADDRESSABLE (v) = 1;
+          inner_data_clauses = new_clause;
+
+          prev_mapped_var = v;
+        }
+    }
+
+  if (artificial_vars)
+    body = gimple_build_bind (artificial_vars, body, make_node (BLOCK));
+
+  /* If we determined above that there are variables that need to be created
+     on the device, construct a data region for them and wrap the body
+     inside that.  */
+  if (inner_data_clauses != NULL)
+    {
+      gcc_assert (inner_bind_vars != NULL);
+      gimple *inner_data_region
+        = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
+                                   inner_data_clauses);
+      gimple_set_location (inner_data_region, loc);
+      /* Make sure __builtin_GOACC_data_end is called at the end.  */
+      gimple *try_stmt = make_data_region_try_statement (loc, body);
+      gimple_omp_set_body (inner_data_region, try_stmt);
+      gimple *bind_body;
+      if (inner_cleanup != NULL)
+          /* Clobber all the inner variables that need to be clobbered.  */
+          bind_body = gimple_build_try (inner_data_region, inner_cleanup,
+                                        GIMPLE_TRY_FINALLY);
+      else
+          bind_body = inner_data_region;
+      body = gimple_build_bind (inner_bind_vars, bind_body, make_node (BLOCK));
+    }
+
+  return body;
+}
+
+static void
+add_wait (location_t loc, gimple_seq *region_body)
+{
+  /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
+  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+  gimple *wait_call = gimple_build_call (wait_fn, 2,
+                                         sync_arg, integer_zero_node);
+  gimple_set_location (wait_call, loc);
+  gimple_seq_add_stmt (region_body, wait_call);
+}
+
+/* Helper function of decompose_kernels_region_body.  The statements in
+   REGION_BODY are expected to be decomposed parallel regions; add an
+   "async" clause to each.  Also add a "wait" pragma at the end of the
+   sequence.  */
+
+static void
+add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
+{
+  tree default_async_queue
+    = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+  for (gimple_stmt_iterator gsi = gsi_start (*region_body);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      tree target_clauses = gimple_omp_target_clauses (stmt);
+      tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC);
+      OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue;
+      OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses;
+      target_clauses = new_async_clause;
+      gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
+                                     target_clauses);
+    }
+  add_wait (loc, region_body);
+}
+
+/* Auxiliary analysis of the body of a kernels region, to determine for each
+   OpenACC loop whether it is control-dependent (i.e., not necessarily
+   executed every time the kernels region is entered) or not.
+   We say that a loop is control-dependent if there is some cond, switch, or
+   goto statement that jumps over it, forwards or backwards.  For example,
+   if the loop is controlled by an if statement, then a jump to the true
+   block, the false block, or from one of those blocks to the control flow
+   join point will necessarily jump over the loop.
+   This analysis implements an ad-hoc union-find data structure classifying
+   statements into "control-flow regions" as follows: Most statements are in
+   the same region as their predecessor, except that each OpenACC loop is in
+   a region of its own, and each OpenACC loop's successor starts a new
+   region.  We then unite the regions of any statements linked by jumps,
+   placing any cond, switch, or goto statement in the same region as its
+   target label(s).
+   In the end, control dependence of OpenACC loops can be determined by
+   comparing their immediate predecessor and successor statements' regions.
+   A jump crosses the loop if and only if the predecessor and successor are
+   in the same region.  (If there is no predecessor or successor, the loop
+   is executed unconditionally.)
+   The methods in this class identify statements by their index in the
+   kernels region's body.  */
+
+class control_flow_regions
+{
+  public:
+    /* Initialize an instance and pre-compute the control-flow region
+       information for the statement sequence SEQ.  */
+    control_flow_regions (gimple_seq seq);
+
+    /* Return true if the statement with the given index IDX in the analyzed
+       statement sequence is an unconditionally executed OpenACC loop.  */
+    bool is_unconditional_oacc_for_loop (size_t idx);
+
+  private:
+    /* Find the region representative for the statement identified by index
+       STMT_IDX.  */
+    size_t find_rep (size_t stmt_idx);
+
+    /* Union the regions containing the statements represented by
+       representatives A and B.  */
+    void union_reps (size_t a, size_t b);
+
+    /* Helper for the constructor.  Performs the actual computation of the
+       control-flow regions in the statement sequence SEQ.  */
+    void compute_regions (gimple_seq seq);
+
+    /* The mapping from statement indices to region representatives.  */
+    vec <size_t> representatives;
+
+    /* A cache mapping statement indices to a flag indicating whether the
+       statement is a top level OpenACC for loop.  */
+    vec <bool> omp_for_loops;
+};
+
+control_flow_regions::control_flow_regions (gimple_seq seq)
+{
+  representatives.create (1);
+  omp_for_loops.create (1);
+  compute_regions (seq);
+}
+
+bool
+control_flow_regions::is_unconditional_oacc_for_loop (size_t idx)
+{
+  if (idx == 0 || idx == representatives.length () - 1)
+    /* The first or last statement in the kernels region.  This means that
+       there is no room before or after it for a jump or a label.  Thus
+       there cannot be a jump across it, so it is unconditional.  */
+    return true;
+  /* Otherwise, the loop is unconditional if the statements before and after
+     it are in different control flow regions.  Scan forward and backward,
+     skipping over neighboring OpenACC for loops, to find these preceding
+     statements.  */
+  size_t prev_index = idx - 1;
+  while (prev_index > 0 && omp_for_loops [prev_index] == true)
+    prev_index--;
+  /* If all preceding statements are also OpenACC loops, all of these are
+     unconditional.  */
+  if (prev_index == 0)
+    return true;
+  size_t succ_index = idx + 1;
+  while (succ_index < omp_for_loops.length ()
+         && omp_for_loops [succ_index] == true)
+    succ_index++;
+  /* If all following statements are also OpenACC loops, all of these are
+     unconditional.  */
+  if (succ_index == omp_for_loops.length ())
+    return true;
+  return (find_rep (prev_index) != find_rep (succ_index));
+}
+
+size_t
+control_flow_regions::find_rep (size_t stmt_idx)
+{
+  size_t rep = stmt_idx, aux = stmt_idx;
+  /* Find the root representative of this statement.  */
+  while (representatives[rep] != rep)
+    rep = representatives[rep];
+  /* Compress the path from the original statement to the representative.  */
+  while (representatives[aux] != rep)
+    {
+      size_t tmp = representatives[aux];
+      representatives[aux] = rep;
+      aux = tmp;
+    }
+  return rep;
+}
+
+void
+control_flow_regions::union_reps (size_t a, size_t b)
+{
+  a = find_rep (a);
+  b = find_rep (b);
+  representatives[b] = a;
+}
+
+void
+control_flow_regions::compute_regions (gimple_seq seq)
+{
+  hash_map <gimple *, size_t> control_flow_reps;
+  hash_map <tree, size_t> label_reps;
+  size_t current_region = 0, idx = 0;
+
+  /* In a first pass, assign an initial region to each statement.  Except in
+     the case of OpenACC loops, each statement simply gets the same region
+     representative as its predecessor.  */
+  for (gimple_stmt_iterator gsi = gsi_start (seq);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      gimple *omp_for = top_level_omp_for_in_stmt (stmt);
+      omp_for_loops.safe_push (omp_for != NULL);
+      if (omp_for != NULL)
+        {
+          /* Assign a new region to this loop and to its successor.  */
+          current_region = idx;
+          representatives.safe_push (current_region);
+          current_region++;
+        }
+      else
+        {
+          representatives.safe_push (current_region);
+          /* Remember any jumps and labels for the second pass below.  */
+          if (gimple_code (stmt) == GIMPLE_COND
+              || gimple_code (stmt) == GIMPLE_SWITCH
+              || gimple_code (stmt) == GIMPLE_GOTO)
+            control_flow_reps.put (stmt, current_region);
+          else if (gimple_code (stmt) == GIMPLE_LABEL)
+            label_reps.put (gimple_label_label (as_a <glabel *> (stmt)),
+                            current_region);
+        }
+      idx++;
+    }
+  gcc_assert (representatives.length () == omp_for_loops.length ());
+
+  /* Revisit all the control flow statements and union the region of each
+     cond, switch, or goto statement with the target labels' regions.  */
+  for (hash_map <gimple *, size_t>::iterator it = control_flow_reps.begin ();
+       it != control_flow_reps.end ();
+       ++it)
+    {
+      gimple *stmt = (*it).first;
+      size_t stmt_rep = (*it).second;
+      switch (gimple_code (stmt))
+        {
+          tree label;
+          unsigned int n;
+
+        case GIMPLE_COND:
+          label = gimple_cond_true_label (as_a <gcond *> (stmt));
+          union_reps (stmt_rep, *label_reps.get (label));
+          label = gimple_cond_false_label (as_a <gcond *> (stmt));
+          union_reps (stmt_rep, *label_reps.get (label));
+          break;
+
+        case GIMPLE_SWITCH:
+          n = gimple_switch_num_labels (as_a <gswitch *> (stmt));
+          for (unsigned int i = 0; i < n; i++)
+            {
+              tree switch_case
+                = gimple_switch_label (as_a <gswitch *> (stmt), i);
+              label = CASE_LABEL (switch_case);
+              union_reps (stmt_rep, *label_reps.get (label));
+            }
+          break;
+
+        case GIMPLE_GOTO:
+          label = gimple_goto_dest (stmt);
+          union_reps (stmt_rep, *label_reps.get (label));
+          break;
+
+        default:
+          gcc_unreachable ();
+        }
+    }
+}
+
+/* Decompose the body of the KERNELS_REGION, which was originally annotated
+   with the KERNELS_CLAUSES, into a series of regions.  */
+
+static gimple *
+decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
+{
+  location_t loc = gimple_location (kernels_region);
+
+  /* The kernels clauses will be propagated to the child clauses unmodified,
+     except that the num_gangs, num_workers, and vector_length clauses will
+     only be added to loop regions.  The other regions are "gang-single" and
+     get an explicit num_gangs(1) clause.  So separate out the num_gangs,
+     num_workers, and vector_length clauses here.
+     Also check for the presence of an async clause but do not remove it
+     from the kernels clauses.  */
+  tree num_gangs_clause = NULL, num_workers_clause = NULL,
+       vector_length_clause = NULL;
+  tree prev_clause = NULL, next_clause = NULL, async_clause = NULL;
+  tree parallel_clauses = kernels_clauses;
+  for (tree c = parallel_clauses; c; c = next_clause)
+    {
+      /* Preserve this here, as we might NULL it later.  */
+      next_clause = OMP_CLAUSE_CHAIN (c);
+
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS
+          || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS
+          || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
+        {
+          /* Cut this clause out of the chain.  */
+          if (prev_clause != NULL)
+            OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (c);
+          else
+            kernels_clauses = OMP_CLAUSE_CHAIN (c);
+          OMP_CLAUSE_CHAIN (c) = NULL;
+          switch (OMP_CLAUSE_CODE (c))
+            {
+              case OMP_CLAUSE_NUM_GANGS:
+                num_gangs_clause = c;
+                break;
+              case OMP_CLAUSE_NUM_WORKERS:
+                num_workers_clause = c;
+                break;
+              case OMP_CLAUSE_VECTOR_LENGTH:
+                vector_length_clause = c;
+                break;
+              default:
+                gcc_unreachable ();
+            }
+        }
+      else
+        prev_clause = c;
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+        async_clause = c;
+    }
+
+  gimple *kernels_body = gimple_omp_body (kernels_region);
+  gbind *kernels_bind = as_a <gbind *> (kernels_body);
+
+  /* The body of the region may contain other nested binds declaring inner
+     local variables.  Collapse all these binds into one to ensure that we
+     have a single sequence of statements to iterate over; also, collect all
+     inner variables.  */
+  tree inner_bind_vars = flatten_binds (kernels_bind);
+  gimple_seq body_sequence = gimple_bind_body (kernels_bind);
+
+  hash_set<tree> idx_vars = find_omp_for_index_vars (kernels_bind);
+
+  /* All these inner variables will get allocated on the device (below, by
+     calling maybe_build_inner_data_region).  Here we create "present"
+     clauses for them and add these clauses to the list of clauses to be
+     attached to each inner parallel region.  */
+  tree present_clauses = kernels_clauses;
+  for (tree var = inner_bind_vars; var; var = TREE_CHAIN (var))
+    {
+      if (!DECL_ARTIFICIAL (var)
+	  && TREE_CODE (var) != CONST_DECL
+	  && !idx_vars.contains (var))
+        {
+          tree present_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
+          OMP_CLAUSE_SET_MAP_KIND (present_clause, GOMP_MAP_FORCE_PRESENT);
+          OMP_CLAUSE_DECL (present_clause) = var;
+          OMP_CLAUSE_SIZE (present_clause) = DECL_SIZE_UNIT (var);
+          OMP_CLAUSE_CHAIN (present_clause) = present_clauses;
+          present_clauses = present_clause;
+        }
+    }
+  kernels_clauses = present_clauses;
+
+  /* In addition to nested binds, the "real" body of the region may be
+     nested inside a try-finally block.  Find its cleanup block, which
+     contains code to clobber the local variables that must be clobbered.  */
+  gimple *inner_cleanup = NULL;
+  if (body_sequence != NULL && gimple_code (body_sequence) == GIMPLE_TRY)
+    {
+      if (gimple_seq_singleton_p (body_sequence))
+        {
+          /* The try statement is the only thing inside the bind.  */
+          inner_cleanup = gimple_try_cleanup (body_sequence);
+          body_sequence = gimple_try_eval (body_sequence);
+        }
+      else
+        {
+          /* The bind's body starts with a try statement, but it is followed
+             by other things.  */
+          gimple_stmt_iterator gsi = gsi_start (body_sequence);
+          gimple *try_stmt = gsi_stmt (gsi);
+          inner_cleanup = gimple_try_cleanup (try_stmt);
+          gimple *try_body = gimple_try_eval (try_stmt);
+
+          gsi_remove (&gsi, false);
+          /* Now gsi indicates the sequence of statements after the try
+             statement in the bind.  Append the statement in the try body and
+             the trailing statements from gsi.  */
+          gsi_insert_seq_before (&gsi, try_body, GSI_CONTINUE_LINKING);
+          body_sequence = gsi_stmt (gsi);
+        }
+    }
+
+  /* This sequence will collect all the top-level statements in the body of
+     the data region we are about to construct.  */
+  gimple_seq region_body = NULL;
+  /* This sequence will collect consecutive statements to be put into a
+     gang-single region.  */
+  gimple_seq gang_single_seq = NULL;
+  /* Flag recording whether the gang_single_seq only contains copies to
+     local variables.  These may be loop setup code that should not be
+     separated from the loop.  */
+  bool only_simple_assignments = true;
+
+  /* Precompute the control flow region information to determine whether an
+     OpenACC loop is executed conditionally or unconditionally.  */
+  control_flow_regions cf_regions (body_sequence);
+
+  /* Iterate over the statements in the kernels region's body.  */
+  size_t idx = 0;
+  gimple_stmt_iterator gsi, gsi_n;
+  for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n, idx++)
+    {
+      /* Advance the iterator here because otherwise it would be invalidated
+         by moving statements below.  */
+      gsi_n = gsi;
+      gsi_next (&gsi_n);
+
+      gimple *stmt = gsi_stmt (gsi);
+      gimple *omp_for = top_level_omp_for_in_stmt (stmt);
+      bool is_unconditional_oacc_for_loop = false;
+      if (omp_for != NULL)
+	is_unconditional_oacc_for_loop
+	  = cf_regions.is_unconditional_oacc_for_loop (idx);
+      if (omp_for != NULL
+          && is_unconditional_oacc_for_loop)
+        {
+          /* This is an OMP for statement, put it into a separate region.
+             But first, construct a gang-single region containing any
+             complex sequential statements we may have seen.  */
+          if (gang_single_seq != NULL && !only_simple_assignments)
+            {
+              gimple *single_region
+                = make_region_seq (loc, gang_single_seq,
+				   num_gangs_clause,
+				   num_workers_clause,
+				   vector_length_clause,
+				   kernels_clauses);
+              gimple_seq_add_stmt (&region_body, single_region);
+            }
+          else if (gang_single_seq != NULL && only_simple_assignments)
+            {
+              /* There is a sequence of sequential statements preceding this
+                 loop, but they are all simple assignments.  This is
+                 probably setup code for the loop; in particular, Fortran DO
+                 loops are preceded by code to copy the loop limit variable
+                 to a temporary.  Group this code together with the loop
+                 itself.  */
+              gimple_seq_add_stmt (&gang_single_seq, stmt);
+              stmt = gimple_build_bind (NULL, gang_single_seq,
+                                        make_node (BLOCK));
+            }
+          gang_single_seq = NULL;
+          only_simple_assignments = true;
+
+	  gimple_seq parallel_seq = NULL;
+	  gimple_seq_add_stmt (&parallel_seq, stmt);
+          gimple *parallel_region
+	    = make_region_loop_nest (omp_for, parallel_seq,
+                                              num_gangs_clause,
+                                              num_workers_clause,
+                                              vector_length_clause,
+                                              kernels_clauses);
+          gimple_seq_add_stmt (&region_body, parallel_region);
+        }
+      else
+        {
+	  if (omp_for != NULL)
+	    {
+	      gcc_checking_assert (!is_unconditional_oacc_for_loop);
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, omp_for,
+				 "unparallelized loop nest in OpenACC"
+				 " 'kernels' region: it's executed"
+				 " conditionally\n");
+	    }
+
+          /* This is not an unconditional OMP for statement, so it will be
+             put into a gang-single region.  */
+          gimple_seq_add_stmt (&gang_single_seq, stmt);
+          /* Is this a simple assignment? We call it simple if it is an
+             assignment to an artificial local variable.  This captures
+             Fortran loop setup code computing loop bounds and offsets.  */
+          bool is_simple_assignment
+            = (gimple_code (stmt) == GIMPLE_ASSIGN
+                && TREE_CODE (gimple_assign_lhs (stmt)) == VAR_DECL
+                && DECL_ARTIFICIAL (gimple_assign_lhs (stmt)));
+          if (!is_simple_assignment)
+            only_simple_assignments = false;
+        }
+    }
+
+  /* If we did not emit a new region, and are not going to emit one now
+     (that is, the original region was empty), prepare to emit a dummy so as
+     to preserve the original construct, which other processing (at least
+     test cases) depend on.  */
+  if (region_body == NULL && gang_single_seq == NULL)
+    {
+      gimple *stmt = gimple_build_nop ();
+      gimple_set_location (stmt, loc);
+      gimple_seq_add_stmt (&gang_single_seq, stmt);
+    }
+
+  /* Gather up any remaining gang-single statements.  */
+  if (gang_single_seq != NULL)
+    {
+      gimple *single_region
+        = make_region_seq (loc, gang_single_seq,
+			   num_gangs_clause,
+			   num_workers_clause,
+			   vector_length_clause,
+			   kernels_clauses);
+      gimple_seq_add_stmt (&region_body, single_region);
+    }
+
+  /* We want to launch these kernels asynchronously.  If the original
+     kernels region had an async clause, this is done automatically because
+     that async clause was copied to the individual regions we created.
+     Otherwise, add an async clause to each newly created region, as well as
+     a wait directive at the end.  */
+  if (async_clause == NULL)
+    add_async_clauses_and_wait (loc, &region_body);
+  else
+    /* !!! If we have asynchronous parallel blocks inside a (synchronous) data
+       region, then target memory will get unmapped at the point the data
+       region ends, even if the inner asynchronous parallels have not yet
+       completed.  For kernels marked "async", we might want to use "enter data
+       async(...)" and "exit data async(...)" instead.
+       For now, insert a (synchronous) wait at the end of the block.  */
+    add_wait (loc, &region_body);
+
+  tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
+  gimple *body = gimple_build_bind (kernels_locals, region_body,
+                                    make_node (BLOCK));
+
+  /* If we found variables declared in nested scopes, build a data region to
+     map them to the device.  */
+  body = maybe_build_inner_data_region (loc, body, inner_bind_vars,
+                                        inner_cleanup, &idx_vars);
+
+  return body;
+}
+
+/* Transform KERNELS_REGION, which is an OpenACC kernels region, into a data
+   region containing the original kernels region's body cut up into a
+   sequence of parallel regions.  */
+
+static gimple *
+transform_kernels_region (gimple *kernels_region)
+{
+  gcc_checking_assert (gimple_omp_target_kind (kernels_region)
+                        == GF_OMP_TARGET_KIND_OACC_KERNELS);
+  location_t loc = gimple_location (kernels_region);
+
+  /* Collect the kernels region's data clauses and create the new data
+     region with those clauses.  */
+  tree kernels_clauses = gimple_omp_target_clauses (kernels_region);
+  tree data_clauses = NULL;
+  for (tree c = kernels_clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    {
+      /* Certain map clauses are copied to the enclosing data region.  Any
+         non-data clause remains on the kernels region.  */
+      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP)
+        {
+          tree decl = OMP_CLAUSE_DECL (c);
+          HOST_WIDE_INT kind = OMP_CLAUSE_MAP_KIND (c);
+          switch (kind)
+            {
+            default:
+              if (kind == GOMP_MAP_ALLOC &&
+                  integer_zerop (OMP_CLAUSE_SIZE (c)))
+                /* ??? This is an alloc clause for mapping a pointer whose
+                   target is already mapped.  We leave these on the inner
+                   parallel regions because moving them to the outer data
+                   region causes runtime errors.  */
+                break;
+
+              /* For non-artificial variables, and for non-declaration
+                 expressions like A[0:n], copy the clause to the data
+                 region.  */
+              if ((DECL_P (decl) && !DECL_ARTIFICIAL (decl))
+                  || !DECL_P (decl))
+                {
+                  tree new_clause = build_omp_clause (OMP_CLAUSE_LOCATION (c),
+                                                      OMP_CLAUSE_MAP);
+                  OMP_CLAUSE_SET_MAP_KIND (new_clause, kind);
+                  /* This must be unshared here to avoid "incorrect sharing
+                     of tree nodes" errors from verify_gimple.  */
+                  OMP_CLAUSE_DECL (new_clause) = unshare_expr (decl);
+                  OMP_CLAUSE_SIZE (new_clause) = OMP_CLAUSE_SIZE (c);
+                  OMP_CLAUSE_CHAIN (new_clause) = data_clauses;
+                  data_clauses = new_clause;
+
+                  /* Now that this data is mapped, the inner data clause on
+                     the kernels region can become a present clause.  */
+                  OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_PRESENT);
+                }
+              break;
+
+            case GOMP_MAP_POINTER:
+            case GOMP_MAP_TO_PSET:
+            case GOMP_MAP_FORCE_TOFROM:
+            case GOMP_MAP_FIRSTPRIVATE_POINTER:
+            case GOMP_MAP_FIRSTPRIVATE_REFERENCE:
+              /* ??? Copying these map kinds leads to internal compiler
+                 errors in later passes.  */
+              break;
+            }
+        }
+      else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_IF)
+        {
+          /* If there is an if clause, it must also be present on the
+             enclosing data region.  Temporarily remove the if clause's
+             chain to avoid copying it.  */
+          tree saved_chain = OMP_CLAUSE_CHAIN (c);
+          OMP_CLAUSE_CHAIN (c) = NULL;
+          tree new_if_clause = unshare_expr (c);
+          OMP_CLAUSE_CHAIN (c) = saved_chain;
+          OMP_CLAUSE_CHAIN (new_if_clause) = data_clauses;
+          data_clauses = new_if_clause;
+        }
+    }
+  /* Restore the original order of the clauses.  */
+  data_clauses = nreverse (data_clauses);
+
+  gimple *data_region
+    = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
+                               data_clauses);
+  gimple_set_location (data_region, loc);
+
+  /* Transform the body of the kernels region into a sequence of parallel
+     regions.  */
+  gimple *body = decompose_kernels_region_body (kernels_region,
+                                                kernels_clauses);
+
+  /* Put the transformed pieces together.  The entire body of the region is
+     wrapped in a try-finally statement that calls __builtin_GOACC_data_end
+     for cleanup.  */
+  gimple *try_stmt = make_data_region_try_statement (loc, body);
+  gimple_omp_set_body (data_region, try_stmt);
+
+  return data_region;
+}
+
+/* Helper function of convert_oacc_kernels for walking the tree, calling
+   transform_kernels_region on each kernels region found.  */
+
+static tree
+scan_kernels (gimple_stmt_iterator *gsi_p, bool *handled_ops_p,
+              struct walk_stmt_info *)
+{
+  gimple *stmt = gsi_stmt (*gsi_p);
+  *handled_ops_p = false;
+
+  int kind;
+  switch (gimple_code (stmt))
+    {
+    case GIMPLE_OMP_TARGET:
+      kind = gimple_omp_target_kind (stmt);
+      if (kind == GF_OMP_TARGET_KIND_OACC_KERNELS)
+        {
+          gimple *new_region = transform_kernels_region (stmt);
+          gsi_replace (gsi_p, new_region, false);
+          *handled_ops_p = true;
+        }
+      break;
+
+    default:
+      break;
+    }
+
+  return NULL;
+}
+
+/* Find and transform OpenACC kernels regions in the current function.  */
+
+static unsigned int
+convert_oacc_kernels (void)
+{
+  struct walk_stmt_info wi;
+  gimple_seq body = gimple_body (current_function_decl);
+
+  memset (&wi, 0, sizeof (wi));
+  walk_gimple_seq_mod (&body, scan_kernels, NULL, &wi);
+
+  gimple_set_body (current_function_decl, body);
+
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_convert_oacc_kernels =
+{
+  GIMPLE_PASS, /* type */
+  "convert_oacc_kernels", /* name */
+  OPTGROUP_OMP, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_gimple_any, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_convert_oacc_kernels : public gimple_opt_pass
+{
+public:
+  pass_convert_oacc_kernels (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_convert_oacc_kernels, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+  {
+    return (flag_openacc
+	    && flag_openacc_kernels == OPENACC_KERNELS_SPLIT);
+  }
+  virtual unsigned int execute (function *)
+  {
+    return convert_oacc_kernels ();
+  }
+
+}; // class pass_convert_oacc_kernels
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_convert_oacc_kernels (gcc::context *ctxt)
+{
+  return new pass_convert_oacc_kernels (ctxt);
+}
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 97ae47b..e489ad3 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -52,6 +52,8 @@
 #include "stringpool.h"
 #include "attribs.h"
 #include "cfgloop.h"
+#include "omp-sese.h"
+#include "convert.h"
 
 /* Describe the OpenACC looping structure of a function.  The entire
    function is held in a 'NULL' loop.  */
@@ -588,8 +590,9 @@
 }
 
 /* Parse the default dimension parameter.  This is a set of
-   :-separated optional compute dimensions.  Each specified dimension
-   is a positive integer.  When device type support is added, it is
+   :-separated optional compute dimensions.  Each dimension is either
+   a positive integer, or '-' for a dynamic value computed at
+   runtime.  When device type support is added, it is
    planned to be a comma separated list of such compute dimensions,
    with all but the first prefixed by the colon-terminated device
    type.  */
@@ -624,14 +627,20 @@
 
 	  if (*pos != ':')
 	    {
-	      long val;
-	      const char *eptr;
+	      long val = 0;
 
-	      errno = 0;
-	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
-	      if (errno || val <= 0 || (int) val != val)
-		goto malformed;
-	      pos = eptr;
+	      if (*pos == '-')
+		pos++;
+	      else
+		{
+		  const char *eptr;
+
+		  errno = 0;
+		  val = strtol (pos, CONST_CAST (char **, &eptr), 10);
+		  if (errno || val <= 0 || (int) val != val)
+		    goto malformed;
+		  pos = eptr;
+		}
 	      oacc_default_dims[ix] = (int) val;
 	    }
 	}
@@ -673,6 +682,34 @@
       pos = TREE_CHAIN (pos);
     }
 
+  bool check = true;
+#ifdef ACCEL_COMPILER
+  check = false;
+#endif
+  if (check
+      && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
+    {
+      static char const *const axes[] =
+      /* Must be kept in sync with GOMP_DIM enumeration.  */
+	{ "gang", "worker", "vector" };
+      for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
+	if (dims[ix] < 0)
+	  ; /* Defaulting axis.  */
+	else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
+	  /* There is partitioned execution, but the user requested a
+	     dimension size of 1.  They're probably confused.  */
+	  warning_at (DECL_SOURCE_LOCATION (fn), 0,
+		      "region contains %s partitioned code but"
+		      " is not %s partitioned", axes[ix], axes[ix]);
+	else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
+	  /* The dimension is explicitly partitioned to non-unity, but
+	     no use is made within the region.  */
+	  warning_at (DECL_SOURCE_LOCATION (fn), 0,
+		      "region is %s partitioned but"
+		      " does not contain %s partitioned code",
+		      axes[ix], axes[ix]);
+    }
+
   bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
 
   /* Default anything left to 1 or a partitioned default.  */
@@ -1073,7 +1110,9 @@
 	    = ((enum ifn_unique_kind)
 	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
 
-	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
+	  if (k == IFN_UNIQUE_OACC_FORK
+	      || k == IFN_UNIQUE_OACC_JOIN
+	      || k == IFN_UNIQUE_OACC_PRIVATE)
 	    *gimple_call_arg_ptr (stmt, 2) = replacement;
 	  else if (k == kind && stmt != from)
 	    break;
@@ -1081,6 +1120,8 @@
       else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
 	*gimple_call_arg_ptr (stmt, 3) = replacement;
 
+      update_stmt (stmt);
+
       gsi_next (&gsi);
       while (gsi_end_p (gsi))
 	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
@@ -1105,25 +1146,28 @@
       gcall *call;
       
       for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
-	switch (gimple_call_internal_fn (call))
-	  {
-	  case IFN_GOACC_LOOP:
+        {
+	  switch (gimple_call_internal_fn (call))
 	    {
-	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
-	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
-	      if (!is_e)
-		gimple_call_set_arg (call, 4, chunk_arg);
+	    case IFN_GOACC_LOOP:
+	      {
+		bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
+		gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
+		if (!is_e)
+		  gimple_call_set_arg (call, 4, chunk_arg);
+	      }
+	      break;
+
+	    case IFN_GOACC_TILE:
+	      gimple_call_set_arg (call, 3, mask_arg);
+	      gimple_call_set_arg (call, 4, e_mask_arg);
+	      break;
+
+	    default:
+	      gcc_unreachable ();
 	    }
-	    break;
-
-	  case IFN_GOACC_TILE:
-	    gimple_call_set_arg (call, 3, mask_arg);
-	    gimple_call_set_arg (call, 4, e_mask_arg);
-	    break;
-
-	  default:
-	    gcc_unreachable ();
-	  }
+	  update_stmt (call);
+	}
 
       unsigned dim = GOMP_DIM_GANG;
       unsigned mask = loop->mask | loop->e_mask;
@@ -1301,7 +1345,7 @@
 
 static unsigned
 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
-			   bool outer_assign)
+			   bool outer_assign, bool is_oacc_gang_single)
 {
   bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
   bool noisy = true;
@@ -1319,6 +1363,17 @@
 	 non-innermost available level.  */
       unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
 
+      /* Gang partitioning is not available in a gang-single region.  */
+      if (is_oacc_gang_single)
+        this_mask = GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
+      /* Orphan reductions cannot have gang partitioning.  */
+      if ((loop->flags & OLF_REDUCTION)
+	  && oacc_get_fn_attrib (current_function_decl)
+	  && !lookup_attribute ("omp target entrypoint",
+				DECL_ATTRIBUTES (current_function_decl)))
+	this_mask = GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
       /* Find the first outermost available partition. */
       while (this_mask <= outer_mask)
 	this_mask <<= 1;
@@ -1348,7 +1403,8 @@
     {
       unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
       loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
-					       outer_assign | assign);
+					       outer_assign | assign,
+					       is_oacc_gang_single);
     }
 
   if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
@@ -1407,7 +1463,8 @@
 
   if (loop->sibling)
     inner_mask |= oacc_loop_auto_partitions (loop->sibling,
-					     outer_mask, outer_assign);
+					     outer_mask, outer_assign,
+					     is_oacc_gang_single);
 
   inner_mask |= loop->inner | loop->mask | loop->e_mask;
 
@@ -1418,14 +1475,16 @@
    axes.  Return mask of partitioning.  */
 
 static unsigned
-oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
+oacc_loop_partition (oacc_loop *loop, unsigned outer_mask,
+                     bool is_oacc_gang_single)
 {
   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
 
   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
     {
       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
-      mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
+      mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false,
+                                             is_oacc_gang_single);
     }
   return mask_all;
 }
@@ -1470,6 +1529,17 @@
 
       if (!integer_zerop (ref_to_res))
 	{
+	  /* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data
+	     mappings gets retyped to (void *).  Adjust the type of ref_to_res
+	     as appropriate.  */
+	  if (TREE_TYPE (TREE_TYPE (ref_to_res)) != TREE_TYPE (var))
+	    {
+	      tree ptype = build_pointer_type (TREE_TYPE (var));
+	      tree t = make_ssa_name (ptype);
+	      tree expr = fold_build1 (NOP_EXPR, ptype, ref_to_res);
+	      gimple_seq_add_stmt (&seq, gimple_build_assign (t, expr));
+	      ref_to_res = t;
+	    }
 	  tree dst = build_simple_mem_ref (ref_to_res);
 	  tree src = var;
 
@@ -1490,19 +1560,132 @@
   gsi_replace_with_seq (&gsi, seq, true);
 }
 
+/* Determine whether DECL should be discarded in this offload
+   compilation.  */
+
+static bool
+maybe_discard_oacc_function (tree decl)
+{
+  tree attr = lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl));
+
+  if (!attr)
+    return false;
+
+  enum omp_clause_code kind = OMP_CLAUSE_NOHOST;
+
+  if (omp_find_clause (TREE_VALUE (attr), kind))
+    return true;
+
+  return false;
+}
+
+struct addr_expr_rewrite_info
+{
+  gimple *stmt;
+  hash_set<tree> *adjusted_vars;
+  bool avoid_pointer_conversion;
+  bool modified;
+};
+
+static tree
+rewrite_addr_expr (tree *tp, int *walk_subtrees, void *data)
+{
+  walk_stmt_info *wi = (walk_stmt_info *) data;
+  addr_expr_rewrite_info *info = (addr_expr_rewrite_info *) wi->info;
+
+  if (TREE_CODE (*tp) == ADDR_EXPR)
+    {
+      tree arg = TREE_OPERAND (*tp, 0);
+
+      if (info->adjusted_vars->contains (arg))
+	{
+	  if (info->avoid_pointer_conversion)
+	    {
+	      *tp = build_fold_addr_expr (arg);
+	      info->modified = true;
+	      *walk_subtrees = 0;
+	    }
+	  else
+	    {
+	      gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
+	      tree repl = build_fold_addr_expr (arg);
+	      gimple *stmt1
+		= gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
+	      tree conv = convert_to_pointer (TREE_TYPE (*tp),
+					      gimple_assign_lhs (stmt1));
+	      gimple *stmt2
+		= gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
+	      gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
+	      gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
+	      *tp = gimple_assign_lhs (stmt2);
+	      info->modified = true;
+	      *walk_subtrees = 0;
+	    }
+	}
+    }
+
+  return NULL_TREE;
+}
+
+/* Return TRUE if CALL is a call to a builtin atomic/sync operation.  */
+
+static bool
+is_sync_builtin_call (gcall *call)
+{
+  tree callee = gimple_call_fndecl (call);
+
+  if (callee != NULL_TREE
+      && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
+    switch (DECL_FUNCTION_CODE (callee))
+      {
+#undef DEF_SYNC_BUILTIN
+#define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
+#include "sync-builtins.def"
+#undef DEF_SYNC_BUILTIN
+	return true;
+
+      default:
+	;
+      }
+
+  return false;
+}
+
+/* Default implementation creates a temporary variable of type RECORD_TYPE if
+   SENDER is true, else a pointer to RECORD_TYPE if SENDER is false.  */
+
+tree
+default_goacc_create_propagation_record (tree record_type, bool sender,
+					 const char *name)
+{
+  tree type = record_type;
+
+  if (!sender)
+    type = build_pointer_type (type);
+
+  return create_tmp_var (type, name);
+}
+
 /* Main entry point for oacc transformations which run on the device
    compiler after LTO, so we know what the target device is at this
    point (including the host fallback).  */
 
 static unsigned int
-execute_oacc_device_lower ()
+execute_oacc_loop_designation ()
 {
-  tree attrs = oacc_get_fn_attrib (current_function_decl);
-
-  if (!attrs)
+  tree attr = oacc_get_fn_attrib (current_function_decl);
+  if (!attr)
     /* Not an offloaded function.  */
     return 0;
 
+  if (maybe_discard_oacc_function (current_function_decl))
+    {
+      if (dump_file)
+	fprintf (dump_file, "Discarding function\n");
+      TREE_ASM_WRITTEN (current_function_decl) = 1;
+      return TODO_discard_function;
+    }
+
   /* Parse the default dim argument exactly once.  */
   if ((const void *)flag_openacc_dims != &flag_openacc_dims)
     {
@@ -1516,6 +1699,20 @@
   bool is_oacc_kernels_parallelized
     = (lookup_attribute ("oacc kernels parallelized",
 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  if (is_oacc_kernels_parallelized)
+    gcc_checking_assert (is_oacc_kernels);
+  bool is_oacc_parallel_kernels_parallelized
+    = (lookup_attribute ("oacc parallel_kernels_parallelized",
+			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  if (is_oacc_parallel_kernels_parallelized)
+    gcc_checking_assert (!is_oacc_kernels);
+  bool is_oacc_parallel_kernels_gang_single
+    = (lookup_attribute ("oacc parallel_kernels_gang_single",
+			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  if (is_oacc_parallel_kernels_gang_single)
+    gcc_checking_assert (!is_oacc_kernels);
+  gcc_checking_assert (!(is_oacc_parallel_kernels_parallelized
+			 && is_oacc_parallel_kernels_gang_single));
 
   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
      kernels, so remove the parallelism dimensions function attributes
@@ -1523,12 +1720,12 @@
   if (is_oacc_kernels && !is_oacc_kernels_parallelized)
     {
       oacc_set_fn_attrib (current_function_decl, NULL, NULL);
-      attrs = oacc_get_fn_attrib (current_function_decl);
+      attr = oacc_get_fn_attrib (current_function_decl);
     }
 
   /* Discover, partition and process the loops.  */
   oacc_loop *loops = oacc_loop_discovery ();
-  int fn_level = oacc_fn_attrib_level (attrs);
+  int fn_level = oacc_fn_attrib_level (attr);
 
   if (dump_file)
     {
@@ -1539,12 +1736,20 @@
 	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
 		 (is_oacc_kernels_parallelized
 		  ? "parallelized" : "unparallelized"));
+      else if (is_oacc_parallel_kernels_parallelized)
+	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
+		 "parallel_kernels_parallelized");
+      else if (is_oacc_parallel_kernels_gang_single)
+	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
+		 "parallel_kernels_gang_single");
       else
 	fprintf (dump_file, "Function is OpenACC parallel offload\n");
     }
 
   unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
-  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
+  unsigned used_mask = oacc_loop_partition (loops, outer_mask,
+                                            is_oacc_parallel_kernels_gang_single);
+
   /* OpenACC kernels constructs are special: they currently don't use the
      generic oacc_loop infrastructure and attribute/dimension processing.  */
   if (is_oacc_kernels && is_oacc_kernels_parallelized)
@@ -1555,7 +1760,7 @@
     }
 
   int dims[GOMP_DIM_MAX];
-  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
+  oacc_validate_dims (current_function_decl, attr, dims, fn_level, used_mask);
 
   if (dump_file)
     {
@@ -1595,10 +1800,38 @@
 	free_oacc_loop (l);
     }
 
+  free_oacc_loop (loops);
+
   /* Offloaded targets may introduce new basic blocks, which require
      dominance information to update SSA.  */
   calculate_dominance_info (CDI_DOMINATORS);
 
+  return 0;
+}
+
+int
+execute_oacc_gimple_workers (void)
+{
+  oacc_do_neutering ();
+  calculate_dominance_info (CDI_DOMINATORS);
+  return 0;
+}
+
+static unsigned int
+execute_oacc_device_lower ()
+{
+  int dims[GOMP_DIM_MAX];
+  tree attr = oacc_get_fn_attrib (current_function_decl);
+
+  if (!attr)
+    /* Not an offloaded function.  */
+    return 0;
+
+  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
+    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
+
+  hash_set<tree> adjusted_vars;
+
   /* Now lower internal loop functions to target-specific code
      sequences.  */
   basic_block bb;
@@ -1675,6 +1908,43 @@
 		case IFN_UNIQUE_OACC_TAIL_MARK:
 		  remove = true;
 		  break;
+
+		case IFN_UNIQUE_OACC_PRIVATE:
+		  {
+		    HOST_WIDE_INT level
+		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+		    if (level == -1)
+		      break;
+		    for (unsigned i = 3;
+			 i < gimple_call_num_args (call);
+			 i++)
+		      {
+			tree arg = gimple_call_arg (call, i);
+			gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+			tree decl = TREE_OPERAND (arg, 0);
+			if (dump_file && (dump_flags & TDF_DETAILS))
+			  {
+			    static char const *const axes[] =
+			      /* Must be kept in sync with GOMP_DIM
+				 enumeration.  */
+			      { "gang", "worker", "vector" };
+			    fprintf (dump_file, "Decl UID %u has %s "
+				     "partitioning:", DECL_UID (decl),
+				     axes[level]);
+			    print_generic_decl (dump_file, decl, TDF_SLIM);
+			    fputc ('\n', dump_file);
+			  }
+			if (targetm.goacc.adjust_private_decl)
+			  {
+			    tree oldtype = TREE_TYPE (decl);
+			    targetm.goacc.adjust_private_decl (decl, level);
+			    if (TREE_TYPE (decl) != oldtype)
+			      adjusted_vars.add (decl);
+			  }
+		      }
+		    remove = true;
+		  }
+		  break;
 		}
 	      break;
 	    }
@@ -1706,7 +1976,54 @@
 	  gsi_next (&gsi);
       }
 
-  free_oacc_loop (loops);
+  /* Make adjustments to gang-private local variables if required by the
+     target, e.g. forcing them into a particular address space.  Afterwards,
+     ADDR_EXPR nodes which have adjusted variables as their argument need to
+     be modified in one of two ways:
+
+       1. They can be recreated, making a pointer to the variable in the new
+	  address space, or
+
+       2. The address of the variable in the new address space can be taken,
+	  converted to the default (original) address space, and the result of
+	  that conversion subsituted in place of the original ADDR_EXPR node.
+
+     Which of these is done depends on the gimple statement being processed.
+     At present atomic operations and inline asms use (1), and everything else
+     uses (2).  At least on AMD GCN, there are atomic operations that work
+     directly in the LDS address space.  */
+
+  if (targetm.goacc.adjust_private_decl)
+    {
+      tree var;
+      unsigned i;
+
+      FOR_ALL_BB_FN (bb, cfun)
+	for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+	     !gsi_end_p (gsi);
+	     gsi_next (&gsi))
+	  {
+	    gimple *stmt = gsi_stmt (gsi);
+	    walk_stmt_info wi;
+	    addr_expr_rewrite_info info;
+
+	    info.avoid_pointer_conversion
+	      = (is_gimple_call (stmt)
+		 && is_sync_builtin_call (as_a <gcall *> (stmt)))
+		|| gimple_code (stmt) == GIMPLE_ASM;
+	    info.stmt = stmt;
+	    info.modified = false;
+	    info.adjusted_vars = &adjusted_vars;
+
+	    memset (&wi, 0, sizeof (wi));
+	    wi.info = &info;
+
+	    walk_gimple_op (stmt, rewrite_addr_expr, &wi);
+
+	    if (info.modified)
+	      update_stmt (stmt);
+	  }
+    }
 
   return 0;
 }
@@ -1748,6 +2065,70 @@
 
 namespace {
 
+const pass_data pass_data_oacc_loop_designation =
+{
+  GIMPLE_PASS, /* type */
+  "oaccloops", /* name */
+  OPTGROUP_OMP, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_cfg, /* properties_required */
+  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa | TODO_cleanup_cfg
+  | TODO_rebuild_alias, /* todo_flags_finish */
+};
+
+class pass_oacc_loop_designation : public gimple_opt_pass
+{
+public:
+  pass_oacc_loop_designation (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *) { return flag_openacc; };
+
+  virtual unsigned int execute (function *)
+    {
+      return execute_oacc_loop_designation ();
+    }
+
+}; // class pass_oacc_loop_designation
+
+const pass_data pass_data_oacc_gimple_workers =
+{
+  GIMPLE_PASS, /* type */
+  "oaccworkers", /* name */
+  OPTGROUP_OMP, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  PROP_cfg, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
+};
+
+class pass_oacc_gimple_workers : public gimple_opt_pass
+{
+public:
+  pass_oacc_gimple_workers (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_oacc_gimple_workers, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+  {
+    return flag_openacc && targetm.goacc.worker_partitioning;
+  };
+
+  virtual unsigned int execute (function *)
+    {
+      return execute_oacc_gimple_workers ();
+    }
+
+}; // class pass_oacc_gimple_workers
+
 const pass_data pass_data_oacc_device_lower =
 {
   GIMPLE_PASS, /* type */
@@ -1781,6 +2162,18 @@
 } // anon namespace
 
 gimple_opt_pass *
+make_pass_oacc_loop_designation (gcc::context *ctxt)
+{
+  return new pass_oacc_loop_designation (ctxt);
+}
+
+gimple_opt_pass *
+make_pass_oacc_gimple_workers (gcc::context *ctxt)
+{
+  return new pass_oacc_gimple_workers (ctxt);
+}
+
+gimple_opt_pass *
 make_pass_oacc_device_lower (gcc::context *ctxt)
 {
   return new pass_oacc_device_lower (ctxt);
diff --git a/gcc/omp-offload.h b/gcc/omp-offload.h
index 21c9236..b441854 100644
--- a/gcc/omp-offload.h
+++ b/gcc/omp-offload.h
@@ -29,6 +29,7 @@
 extern GTY(()) vec<tree, va_gc> *offload_funcs;
 extern GTY(()) vec<tree, va_gc> *offload_vars;
 
+extern int oacc_fn_attrib_level (tree attr);
 extern void omp_finish_file (void);
 
 #endif /* GCC_OMP_DEVICE_H */
diff --git a/gcc/omp-sese.c b/gcc/omp-sese.c
new file mode 100644
index 0000000..13d803f
--- /dev/null
+++ b/gcc/omp-sese.c
@@ -0,0 +1,2077 @@
+/* Find single-entry, single-exit regions for OpenACC.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "ssa.h"
+#include "cgraph.h"
+#include "pretty-print.h"
+#include "fold-const.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "tree-inline.h"
+#include "langhooks.h"
+#include "omp-general.h"
+#include "omp-low.h"
+#include "omp-grid.h"
+#include "gimple-pretty-print.h"
+#include "cfghooks.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "internal-fn.h"
+#include "bitmap.h"
+#include "tree-nested.h"
+#include "stor-layout.h"
+#include "tree-ssa-threadupdate.h"
+#include "tree-into-ssa.h"
+#include "splay-tree.h"
+#include "target.h"
+#include "cfgloop.h"
+#include "tree-cfg.h"
+#include "omp-offload.h"
+#include "attribs.h"
+#include "omp-sese.h"
+
+/* Loop structure of the function.  The entire function is described as
+   a NULL loop.  */
+
+struct parallel_g
+{
+  /* Parent parallel.  */
+  parallel_g *parent;
+
+  /* Next sibling parallel.  */
+  parallel_g *next;
+
+  /* First child parallel.  */
+  parallel_g *inner;
+
+  /* Partitioning mask of the parallel.  */
+  unsigned mask;
+
+  /* Partitioning used within inner parallels. */
+  unsigned inner_mask;
+
+  /* Location of parallel forked and join.  The forked is the first
+     block in the parallel and the join is the first block after of
+     the partition.  */
+  basic_block forked_block;
+  basic_block join_block;
+
+  gimple *forked_stmt;
+  gimple *join_stmt;
+
+  gimple *fork_stmt;
+  gimple *joining_stmt;
+
+  /* Basic blocks in this parallel, but not in child parallels.  The
+     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
+     blocks are not.  */
+  auto_vec<basic_block> blocks;
+
+  tree record_type;
+  tree sender_decl;
+  tree receiver_decl;
+
+public:
+  parallel_g (parallel_g *parent, unsigned mode);
+  ~parallel_g ();
+};
+
+/* Constructor links the new parallel into it's parent's chain of
+   children.  */
+
+parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
+  :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
+{
+  forked_block = join_block = 0;
+  forked_stmt = join_stmt = NULL;
+  fork_stmt = joining_stmt = NULL;
+
+  record_type = NULL_TREE;
+  sender_decl = NULL_TREE;
+  receiver_decl = NULL_TREE;
+
+  if (parent)
+    {
+      next = parent->inner;
+      parent->inner = this;
+    }
+}
+
+parallel_g::~parallel_g ()
+{
+  delete inner;
+  delete next;
+}
+
+static bool
+local_var_based_p (tree decl)
+{
+  switch (TREE_CODE (decl))
+    {
+    case VAR_DECL:
+      return !is_global_var (decl);
+
+    case COMPONENT_REF:
+    case BIT_FIELD_REF:
+    case ARRAY_REF:
+      return local_var_based_p (TREE_OPERAND (decl, 0));
+
+    default:
+      return false;
+    }
+}
+
+/* Map of basic blocks to gimple stmts.  */
+typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
+
+/* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
+   the routine likely contains partitioned loops (else will do its own
+   neutering and variable propagation). Return TRUE if a function call CALL
+   should be made in (worker) single mode instead, rather than redundant
+   mode.  */
+
+static bool
+omp_sese_active_worker_call (gcall *call)
+{
+#define GOMP_DIM_SEQ GOMP_DIM_MAX
+  tree fndecl = gimple_call_fndecl (call);
+
+  if (!fndecl)
+    return true;
+
+  tree attrs = oacc_get_fn_attrib (fndecl);
+
+  if (!attrs)
+    return true;
+
+  int level = oacc_fn_attrib_level (attrs);
+
+  /* Neither regular functions nor "seq" routines should be run by all threads
+     in worker-single mode.  */
+  return level == -1 || level == GOMP_DIM_SEQ;
+#undef GOMP_DIM_SEQ
+}
+
+/* Split basic blocks such that each forked and join unspecs are at
+   the start of their basic blocks.  Thus afterwards each block will
+   have a single partitioning mode.  We also do the same for return
+   insns, as they are executed by every thread.  Return the
+   partitioning mode of the function as a whole.  Populate MAP with
+   head and tail blocks.  We also clear the BB visited flag, which is
+   used when finding partitions.  */
+
+static void
+omp_sese_split_blocks (bb_stmt_map_t *map)
+{
+  auto_vec<gimple *> worklist;
+  basic_block block;
+
+  /* Locate all the reorg instructions of interest.  */
+  FOR_ALL_BB_FN (block, cfun)
+    {
+      /* Clear visited flag, for use by parallel locator  */
+      block->flags &= ~BB_VISITED;
+
+      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+	   !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+
+	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
+	    {
+	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
+		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
+
+	      if (k == IFN_UNIQUE_OACC_JOIN)
+	        worklist.safe_push (stmt);
+	      else if (k == IFN_UNIQUE_OACC_FORK)
+	        {
+		  gcc_assert (gsi_one_before_end_p (gsi));
+		  basic_block forked_block = single_succ (block);
+		  gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);
+
+		  /* We push a NOP as a placeholder for the "forked" stmt.
+		     This is then recognized in omp_sese_find_par.  */
+		  gimple *nop = gimple_build_nop ();
+		  gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
+
+		  worklist.safe_push (nop);
+		}
+	    }
+	  else if (gimple_code (stmt) == GIMPLE_RETURN
+		   || gimple_code (stmt) == GIMPLE_COND
+		   || gimple_code (stmt) == GIMPLE_SWITCH
+		   || (gimple_code (stmt) == GIMPLE_CALL
+		       && !gimple_call_internal_p (stmt)
+		       && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
+	    worklist.safe_push (stmt);
+	  else if (is_gimple_assign (stmt))
+	    {
+	      tree lhs = gimple_assign_lhs (stmt);
+
+	      /* Force assignments to components/fields/elements of local
+		 aggregates into fully-partitioned (redundant) mode.  This
+		 avoids having to broadcast the whole aggregate.  The RHS of
+		 the assignment will be propagated using the normal
+		 mechanism.  */
+
+	      switch (TREE_CODE (lhs))
+	        {
+		case COMPONENT_REF:
+		case BIT_FIELD_REF:
+		case ARRAY_REF:
+		  {
+		    tree aggr = TREE_OPERAND (lhs, 0);
+
+		    if (local_var_based_p (aggr))
+		      worklist.safe_push (stmt);
+		  }
+		  break;
+
+		default:
+		  ;
+		}
+	    }
+	}
+    }
+
+  /* Split blocks on the worklist.  */
+  unsigned ix;
+  gimple *stmt;
+
+  for (ix = 0; worklist.iterate (ix, &stmt); ix++)
+    {
+      basic_block block = gimple_bb (stmt);
+
+      if (gimple_code (stmt) == GIMPLE_COND)
+        {
+	  gcond *orig_cond = as_a <gcond *> (stmt);
+	  tree_code code = gimple_expr_code (orig_cond);
+	  tree pred = make_ssa_name (boolean_type_node);
+	  gimple *asgn = gimple_build_assign (pred, code,
+			   gimple_cond_lhs (orig_cond),
+			   gimple_cond_rhs (orig_cond));
+	  gcond *new_cond
+	    = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
+				 gimple_cond_true_label (orig_cond),
+				 gimple_cond_false_label (orig_cond));
+
+	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+	  gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
+	  gsi_replace (&gsi, new_cond, true);
+
+	  edge e = split_block (block, asgn);
+	  block = e->dest;
+	  map->get_or_insert (block) = new_cond;
+	}
+      else if ((gimple_code (stmt) == GIMPLE_CALL
+		&& !gimple_call_internal_p (stmt))
+	       || is_gimple_assign (stmt))
+        {
+	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+	  gsi_prev (&gsi);
+
+	  edge call = split_block (block, gsi_stmt (gsi));
+
+	  gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));
+
+	  edge call_to_ret = split_block (call->dest, call_stmt);
+
+	  map->get_or_insert (call_to_ret->src) = call_stmt;
+	}
+      else
+	{
+	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+	  gsi_prev (&gsi);
+
+	  if (gsi_end_p (gsi))
+	    map->get_or_insert (block) = stmt;
+	  else
+	    {
+	      /* Split block before insn. The insn is in the new block.  */
+	      edge e = split_block (block, gsi_stmt (gsi));
+
+	      block = e->dest;
+	      map->get_or_insert (block) = stmt;
+	    }
+	}
+    }
+}
+
+static const char *
+mask_name (unsigned mask)
+{
+  switch (mask)
+    {
+    case 0: return "gang redundant";
+    case 1: return "gang partitioned";
+    case 2: return "worker partitioned";
+    case 3: return "gang+worker partitioned";
+    case 4: return "vector partitioned";
+    case 5: return "gang+vector partitioned";
+    case 6: return "worker+vector partitioned";
+    case 7: return "fully partitioned";
+    default: return "<illegal>";
+    }
+}
+
+/* Dump this parallel and all its inner parallels.  */
+
+static void
+omp_sese_dump_pars (parallel_g *par, unsigned depth)
+{
+  fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
+	   depth, par->mask, mask_name (par->mask),
+	   par->forked_block ? par->forked_block->index : -1,
+	   par->join_block ? par->join_block->index : -1);
+
+  fprintf (dump_file, "    blocks:");
+
+  basic_block block;
+  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
+    fprintf (dump_file, " %d", block->index);
+  fprintf (dump_file, "\n");
+  if (par->inner)
+    omp_sese_dump_pars (par->inner, depth + 1);
+
+  if (par->next)
+    omp_sese_dump_pars (par->next, depth);
+}
+
+/* If BLOCK contains a fork/join marker, process it to create or
+   terminate a loop structure.  Add this block to the current loop,
+   and then walk successor blocks.   */
+
+static parallel_g *
+omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
+{
+  if (block->flags & BB_VISITED)
+    return par;
+  block->flags |= BB_VISITED;
+
+  if (gimple **stmtp = map->get (block))
+    {
+      gimple *stmt = *stmtp;
+
+      if (gimple_code (stmt) == GIMPLE_COND
+	  || gimple_code (stmt) == GIMPLE_SWITCH
+	  || gimple_code (stmt) == GIMPLE_RETURN
+	  || (gimple_code (stmt) == GIMPLE_CALL
+	      && !gimple_call_internal_p (stmt))
+	  || is_gimple_assign (stmt))
+	{
+	  /* A single block that is forced to be at the maximum partition
+	     level.  Make a singleton par for it.  */
+	  par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
+				   | GOMP_DIM_MASK (GOMP_DIM_WORKER)
+				   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+	  par->forked_block = block;
+	  par->forked_stmt = stmt;
+	  par->blocks.safe_push (block);
+	  par = par->parent;
+	  goto walk_successors;
+	}
+      else if (gimple_nop_p (stmt))
+	{
+	  basic_block pred = single_pred (block);
+	  gcc_assert (pred);
+	  gimple_stmt_iterator gsi = gsi_last_bb (pred);
+	  gimple *final_stmt = gsi_stmt (gsi);
+
+	  if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
+	    {
+	      gcall *call = as_a <gcall *> (final_stmt);
+	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
+	        TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
+
+	      if (k == IFN_UNIQUE_OACC_FORK)
+	        {
+		  HOST_WIDE_INT dim
+		    = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+		  unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
+
+		  par = new parallel_g (par, mask);
+		  par->forked_block = block;
+		  par->forked_stmt = final_stmt;
+		  par->fork_stmt = stmt;
+		}
+	      else
+	        gcc_unreachable ();
+	    }
+	  else
+	    gcc_unreachable ();
+	}
+      else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
+        {
+	  gcall *call = as_a <gcall *> (stmt);
+	  enum ifn_unique_kind k = ((enum ifn_unique_kind)
+	    TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
+	  if (k == IFN_UNIQUE_OACC_JOIN)
+	    {
+	      HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
+	      unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
+
+	      gcc_assert (par->mask == mask);
+	      par->join_block = block;
+	      par->join_stmt = stmt;
+	      par = par->parent;
+	    }
+	  else
+	    gcc_unreachable ();
+	}
+      else
+        gcc_unreachable ();
+    }
+
+  if (par)
+    /* Add this block onto the current loop's list of blocks.  */
+    par->blocks.safe_push (block);
+  else
+    /* This must be the entry block.  Create a NULL parallel.  */
+    par = new parallel_g (0, 0);
+
+walk_successors:
+  /* Walk successor blocks.  */
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, block->succs)
+    omp_sese_find_par (map, par, e->dest);
+
+  return par;
+}
+
+/* DFS walk the CFG looking for fork & join markers.  Construct
+   loop structures as we go.  MAP is a mapping of basic blocks
+   to head & tail markers, discovered when splitting blocks.  This
+   speeds up the discovery.  We rely on the BB visited flag having
+   been cleared when splitting blocks.  */
+
+static parallel_g *
+omp_sese_discover_pars (bb_stmt_map_t *map)
+{
+  basic_block block;
+
+  /* Mark exit blocks as visited.  */
+  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+  block->flags |= BB_VISITED;
+
+  /* And entry block as not.  */
+  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+  block->flags &= ~BB_VISITED;
+
+  parallel_g *par = omp_sese_find_par (map, 0, block);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\nLoops\n");
+      omp_sese_dump_pars (par, 0);
+      fprintf (dump_file, "\n");
+    }
+
+  return par;
+}
+
+static void
+populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
+			      bitmap vector_single, unsigned outer_mask,
+			      int depth)
+{
+  unsigned mask = outer_mask | par->mask;
+
+  basic_block block;
+
+  for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
+    {
+      if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
+        bitmap_set_bit (worker_single, block->index);
+
+      if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
+        bitmap_set_bit (vector_single, block->index);
+    }
+
+  if (par->inner)
+    populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
+				  mask, depth + 1);
+  if (par->next)
+    populate_single_mode_bitmaps (par->next, worker_single, vector_single,
+				  outer_mask, depth);
+}
+
+/* A map from SSA names or var decls to record fields.  */
+
+typedef hash_map<tree, tree> field_map_t;
+
+/* For each propagation record type, this is a map from SSA names or var decls
+   to propagate, to the field in the record type that should be used for
+   transmission and reception.  */
+
+typedef hash_map<tree, field_map_t *> record_field_map_t;
+
+static GTY(()) record_field_map_t *field_map;
+
+static void
+install_var_field (tree var, tree record_type)
+{
+  field_map_t *fields = *field_map->get (record_type);
+  tree name;
+  char tmp[20];
+
+  if (TREE_CODE (var) == SSA_NAME)
+    name = SSA_NAME_IDENTIFIER (var);
+  else if (TREE_CODE (var) == VAR_DECL)
+    name = DECL_NAME (var);
+  else
+    gcc_unreachable ();
+
+  gcc_assert (!fields->get (var));
+
+  if (!name)
+    {
+      sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
+      name = get_identifier (tmp);
+    }
+
+  tree type = TREE_TYPE (var);
+
+  if (POINTER_TYPE_P (type)
+      && TYPE_RESTRICT (type))
+    type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);
+
+  tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);
+
+  if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
+    {
+      SET_DECL_ALIGN (field, DECL_ALIGN (var));
+      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
+      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
+    }
+  else
+    SET_DECL_ALIGN (field, TYPE_ALIGN (type));
+
+  fields->put (var, field);
+
+  insert_field_into_struct (record_type, field);
+}
+
+/* Sets of SSA_NAMES or VAR_DECLs to propagate.  */
+typedef hash_set<tree> propagation_set;
+
+static void
+find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
+			     bitmap worker_single, bitmap vector_single,
+			     vec<propagation_set *> *prop_set)
+{
+  unsigned mask = outer_mask | par->mask;
+
+  if (par->inner)
+    find_ssa_names_to_propagate (par->inner, mask, worker_single,
+				 vector_single, prop_set);
+  if (par->next)
+    find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
+				 vector_single, prop_set);
+
+  if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+    {
+      basic_block block;
+      int ix;
+
+      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
+	{
+	  for (gphi_iterator psi = gsi_start_phis (block);
+	       !gsi_end_p (psi); gsi_next (&psi))
+	    {
+	      gphi *phi = psi.phi ();
+	      use_operand_p use;
+	      ssa_op_iter iter;
+
+	      FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
+	        {
+		  tree var = USE_FROM_PTR (use);
+
+		  if (TREE_CODE (var) != SSA_NAME)
+		    continue;
+
+		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+		  if (gimple_nop_p (def_stmt))
+		    continue;
+
+		  basic_block def_bb = gimple_bb (def_stmt);
+
+		  if (bitmap_bit_p (worker_single, def_bb->index))
+		    {
+		      if (!(*prop_set)[def_bb->index])
+		        (*prop_set)[def_bb->index] = new propagation_set;
+
+		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
+
+		      ws_prop->add (var);
+		    }
+		}
+	    }
+
+	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+	       !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      use_operand_p use;
+	      ssa_op_iter iter;
+	      gimple *stmt = gsi_stmt (gsi);
+
+	      FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
+	        {
+		  tree var = USE_FROM_PTR (use);
+
+		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+		  if (gimple_nop_p (def_stmt))
+		    continue;
+
+		  basic_block def_bb = gimple_bb (def_stmt);
+
+		  if (bitmap_bit_p (worker_single, def_bb->index))
+		    {
+		      if (!(*prop_set)[def_bb->index])
+		        (*prop_set)[def_bb->index] = new propagation_set;
+
+		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
+
+		      ws_prop->add (var);
+		    }
+		}
+	    }
+	}
+    }
+}
+
+/* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
+   statement.  */
+
+static tree
+find_partitioned_var_uses_1 (tree *node, int *, void *data)
+{
+  walk_stmt_info *wi = (walk_stmt_info *) data;
+  hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
+
+  if (!wi->is_lhs && VAR_P (*node))
+    partitioned_var_uses->add (*node);
+
+  return NULL_TREE;
+}
+
+static void
+find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
+			   hash_set<tree> *partitioned_var_uses)
+{
+  unsigned mask = outer_mask | par->mask;
+
+  if (par->inner)
+    find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
+  if (par->next)
+    find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
+
+  if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+    {
+      basic_block block;
+      int ix;
+
+      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
+	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+	     !gsi_end_p (gsi); gsi_next (&gsi))
+	  {
+	    walk_stmt_info wi;
+	    memset (&wi, 0, sizeof (wi));
+	    wi.info = (void *) partitioned_var_uses;
+	    walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
+	  }
+    }
+}
+
+/* Gang-private variables (typically placed in a GPU's shared memory) do not
+   need to be processed by the worker-propagation mechanism.  Populate the
+   GANGPRIVATE_VARS set with any such variables found in the current
+   function.  */
+
+static void
+find_gangprivate_vars (hash_set<tree> *gangprivate_vars)
+{
+  basic_block block;
+
+  FOR_EACH_BB_FN (block, cfun)
+    {
+      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+	   !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+
+	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
+	    {
+	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
+		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
+	      if (k == IFN_UNIQUE_OACC_PRIVATE)
+		{
+		  HOST_WIDE_INT level
+		    = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
+		  if (level != GOMP_DIM_GANG)
+		    continue;
+		  for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
+		    {
+		      tree arg = gimple_call_arg (stmt, i);
+		      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+		      tree decl = TREE_OPERAND (arg, 0);
+		      gangprivate_vars->add (decl);
+		    }
+		}
+	    }
+	}
+    }
+}
+
+static void
+find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
+			      hash_set<tree> *partitioned_var_uses,
+			      hash_set<tree> *gangprivate_vars,
+			      vec<propagation_set *> *prop_set)
+{
+  unsigned mask = outer_mask | par->mask;
+
+  if (par->inner)
+    find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
+				  gangprivate_vars, prop_set);
+  if (par->next)
+    find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
+				  gangprivate_vars, prop_set);
+
+  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+    {
+      basic_block block;
+      int ix;
+
+      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
+	{
+	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+	       !gsi_end_p (gsi); gsi_next (&gsi))
+	    {
+	      gimple *stmt = gsi_stmt (gsi);
+	      tree var;
+	      unsigned i;
+
+	      FOR_EACH_LOCAL_DECL (cfun, i, var)
+		{
+		  if (!VAR_P (var)
+		      || is_global_var (var)
+		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
+		      || !partitioned_var_uses->contains (var)
+		      || gangprivate_vars->contains (var))
+		    continue;
+
+		  if (stmt_may_clobber_ref_p (stmt, var))
+		    {
+		      if (dump_file)
+			{
+			  fprintf (dump_file, "bb %u: local variable may be "
+				   "clobbered in %s mode: ", block->index,
+				   mask_name (mask));
+			  print_generic_expr (dump_file, var, TDF_SLIM);
+			  fprintf (dump_file, "\n");
+			}
+
+		      if (!(*prop_set)[block->index])
+			(*prop_set)[block->index] = new propagation_set;
+
+		      propagation_set *ws_prop
+			= (*prop_set)[block->index];
+
+		      ws_prop->add (var);
+		    }
+		}
+	    }
+	}
+    }
+}
+
+/* Transform basic blocks FROM, TO (which may be the same block) into:
+   if (GOACC_single_start ())
+     BLOCK;
+   GOACC_barrier ();
+			      \  |  /
+			      +----+
+			      |    |        (new) predicate block
+			      +----+--
+   \  |  /   \  |  /	        |t    \
+   +----+    +----+	      +----+  |
+   |	|    |    |	===>  |    |  | f   (old) from block
+   +----+    +----+	      +----+  |
+     |       t/  \f	        |    /
+			      +----+/
+  (split  (split before       |    |        skip block
+  at end)   condition)	      +----+
+			      t/  \f
+*/
+
+static void
+worker_single_simple (basic_block from, basic_block to,
+		      hash_set<tree> *def_escapes_block)
+{
+  gimple *call, *cond;
+  tree lhs, decl;
+  basic_block skip_block;
+
+  gimple_stmt_iterator gsi = gsi_last_bb (to);
+  if (EDGE_COUNT (to->succs) > 1)
+    {
+      gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
+      gsi_prev (&gsi);
+    }
+  edge e = split_block (to, gsi_stmt (gsi));
+  skip_block = e->dest;
+
+  gimple_stmt_iterator start = gsi_after_labels (from);
+
+  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
+  lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
+  call = gimple_build_call (decl, 0);
+  gimple_call_set_lhs (call, lhs);
+  gsi_insert_before (&start, call, GSI_NEW_STMT);
+  update_stmt (call);
+
+  cond = gimple_build_cond (EQ_EXPR, lhs,
+			    fold_convert_loc (UNKNOWN_LOCATION,
+					      TREE_TYPE (lhs),
+					      boolean_true_node),
+			    NULL_TREE, NULL_TREE);
+  gsi_insert_after (&start, cond, GSI_NEW_STMT);
+  update_stmt (cond);
+
+  edge et = split_block (from, cond);
+  et->flags &= ~EDGE_FALLTHRU;
+  et->flags |= EDGE_TRUE_VALUE;
+  /* Make the active worker the more probable path so we prefer fallthrough
+     (letting the idle workers jump around more).  */
+  et->probability = profile_probability::likely ();
+
+  edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
+  ef->probability = et->probability.invert ();
+
+  basic_block neutered = split_edge (ef);
+  gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);
+
+  for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      ssa_op_iter iter;
+      tree var;
+
+      FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
+        {
+	  if (def_escapes_block->contains (var))
+	    {
+	      gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
+	      create_new_def_for (var, join_phi,
+				  gimple_phi_result_ptr (join_phi));
+	      add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);
+
+	      tree neutered_def = copy_ssa_name (var, NULL);
+	      /* We really want "don't care" or some value representing
+		 undefined here, but optimizers will probably get rid of the
+		 zero-assignments anyway.  */
+	      gassign *zero = gimple_build_assign (neutered_def,
+				build_zero_cst (TREE_TYPE (neutered_def)));
+
+	      gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
+	      update_stmt (zero);
+
+	      add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
+			   UNKNOWN_LOCATION);
+	      update_stmt (join_phi);
+	    }
+	}
+    }
+
+  gsi = gsi_start_bb (skip_block);
+
+  decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+  gimple *acc_bar = gimple_build_call (decl, 0);
+
+  gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
+  update_stmt (acc_bar);
+}
+
+/* This is a copied and renamed omp-low.c:omp_build_component_ref.  */
+
+static tree
+oacc_build_component_ref (tree obj, tree field)
+{
+  tree ret = build3 (COMPONENT_REF, TREE_TYPE (field), obj, field, NULL);
+  if (TREE_THIS_VOLATILE (field))
+    TREE_THIS_VOLATILE (ret) |= 1;
+  if (TREE_READONLY (field))
+    TREE_READONLY (ret) |= 1;
+  return ret;
+}
+
+static tree
+build_receiver_ref (tree record_type, tree var, tree receiver_decl)
+{
+  field_map_t *fields = *field_map->get (record_type);
+  tree x = build_simple_mem_ref (receiver_decl);
+  tree field = *fields->get (var);
+  TREE_THIS_NOTRAP (x) = 1;
+  x = oacc_build_component_ref (x, field);
+  return x;
+}
+
+static tree
+build_sender_ref (tree record_type, tree var, tree sender_decl)
+{
+  field_map_t *fields = *field_map->get (record_type);
+  tree field = *fields->get (var);
+  return oacc_build_component_ref (sender_decl, field);
+}
+
+static int
+sort_by_ssa_version_or_uid (const void *p1, const void *p2)
+{
+  const tree t1 = *(const tree *)p1;
+  const tree t2 = *(const tree *)p2;
+
+  if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
+    return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
+  else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
+    return -1;
+  else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
+    return 1;
+  else
+    return DECL_UID (t1) - DECL_UID (t2);
+}
+
+static int
+sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
+{
+  const tree t1 = *(const tree *)p1;
+  const tree t2 = *(const tree *)p2;
+  unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
+  unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
+  if (s1 != s2)
+    return s2 - s1;
+  else
+    return sort_by_ssa_version_or_uid (p1, p2);
+}
+
+static void
+worker_single_copy (basic_block from, basic_block to,
+		    hash_set<tree> *def_escapes_block,
+		    hash_set<tree> *worker_partitioned_uses,
+		    tree record_type)
+{
+  /* If we only have virtual defs, we'll have no record type, but we still want
+     to emit single_copy_start and (particularly) single_copy_end to act as
+     a vdef source on the neutered edge representing memory writes on the
+     non-neutered edge.  */
+  if (!record_type)
+    record_type = char_type_node;
+
+  tree sender_decl
+    = targetm.goacc.create_propagation_record (record_type, true,
+					       ".oacc_worker_o");
+  tree receiver_decl
+    = targetm.goacc.create_propagation_record (record_type, false,
+					       ".oacc_worker_i");
+
+  gimple_stmt_iterator gsi = gsi_last_bb (to);
+  if (EDGE_COUNT (to->succs) > 1)
+    gsi_prev (&gsi);
+  edge e = split_block (to, gsi_stmt (gsi));
+  basic_block barrier_block = e->dest;
+
+  gimple_stmt_iterator start = gsi_after_labels (from);
+
+  tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);
+
+  tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
+
+  gimple *call = gimple_build_call (decl, 1,
+				    build_fold_addr_expr (sender_decl));
+  gimple_call_set_lhs (call, lhs);
+  gsi_insert_before (&start, call, GSI_NEW_STMT);
+  update_stmt (call);
+
+  tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
+
+  gimple *conv = gimple_build_assign (conv_tmp,
+				      fold_convert (TREE_TYPE (receiver_decl),
+						    lhs));
+  update_stmt (conv);
+  gsi_insert_after (&start, conv, GSI_NEW_STMT);
+  gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
+  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
+  update_stmt (asgn);
+
+  tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);
+
+  tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
+  asgn = gimple_build_assign (recv_tmp, receiver_decl);
+  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
+  update_stmt (asgn);
+
+  gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
+				    NULL_TREE);
+  update_stmt (cond);
+
+  gsi_insert_after (&start, cond, GSI_NEW_STMT);
+
+  edge et = split_block (from, cond);
+  et->flags &= ~EDGE_FALLTHRU;
+  et->flags |= EDGE_TRUE_VALUE;
+  /* Make the active worker the more probable path so we prefer fallthrough
+     (letting the idle workers jump around more).  */
+  et->probability = profile_probability::likely ();
+
+  basic_block body = et->dest;
+
+  edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
+  ef->probability = et->probability.invert ();
+
+  decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+  gimple *acc_bar = gimple_build_call (decl, 0);
+
+  gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
+  gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
+
+  cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
+  gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
+
+  edge et2 = split_block (barrier_block, cond);
+  et2->flags &= ~EDGE_FALLTHRU;
+  et2->flags |= EDGE_TRUE_VALUE;
+  et2->probability = profile_probability::unlikely ();
+
+  basic_block exit_block = et2->dest;
+
+  basic_block copyout_block = split_edge (et2);
+  edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
+  ef2->probability = et2->probability.invert ();
+
+  gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);
+
+  edge copyout_to_exit = single_succ_edge (copyout_block);
+
+  gimple_seq sender_seq = NULL;
+
+  /* Make sure we iterate over definitions in a stable order.  */
+  auto_vec<tree> escape_vec (def_escapes_block->elements ());
+  for (hash_set<tree>::iterator it = def_escapes_block->begin ();
+       it != def_escapes_block->end (); ++it)
+    escape_vec.quick_push (*it);
+  escape_vec.qsort (sort_by_ssa_version_or_uid);
+
+  for (unsigned i = 0; i < escape_vec.length (); i++)
+    {
+      tree var = escape_vec[i];
+
+      if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
+        continue;
+
+      tree barrier_def = 0;
+
+      if (TREE_CODE (var) == SSA_NAME)
+        {
+	  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+	  if (gimple_nop_p (def_stmt))
+            continue;
+
+	  /* The barrier phi takes one result from the actual work of the
+	     block we're neutering, and the other result is constant zero of
+	     the same type.  */
+
+	  gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
+	  barrier_def = create_new_def_for (var, barrier_phi,
+			  gimple_phi_result_ptr (barrier_phi));
+
+	  add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
+	  add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
+		       UNKNOWN_LOCATION);
+
+	  update_stmt (barrier_phi);
+	}
+      else
+        gcc_assert (TREE_CODE (var) == VAR_DECL);
+
+      /* If we had no record type, we will have no fields map.  */
+      field_map_t **fields_p = field_map->get (record_type);
+      field_map_t *fields = fields_p ? *fields_p : NULL;
+
+      if (worker_partitioned_uses->contains (var)
+	  && fields
+	  && fields->get (var))
+	{
+	  tree neutered_def = make_ssa_name (TREE_TYPE (var));
+
+	  /* Receive definition from shared memory block.  */
+
+	  tree receiver_ref = build_receiver_ref (record_type, var,
+						  receiver_decl);
+	  gassign *recv = gimple_build_assign (neutered_def,
+					       receiver_ref);
+	  gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
+	  update_stmt (recv);
+
+	  if (TREE_CODE (var) == VAR_DECL)
+	    {
+	      /* If it's a VAR_DECL, we only copied to an SSA temporary.  Copy
+		 to the final location now.  */
+	      gassign *asgn = gimple_build_assign (var, neutered_def);
+	      gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
+	      update_stmt (asgn);
+	    }
+	  else
+	    {
+	      /* If it's an SSA name, create a new phi at the join node to
+	         represent either the output from the active worker (the
+		 barrier) or the inactive workers (the copyout block).  */
+	      gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
+	      create_new_def_for (barrier_def, join_phi,
+				  gimple_phi_result_ptr (join_phi));
+	      add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
+	      add_phi_arg (join_phi, neutered_def, copyout_to_exit,
+			   UNKNOWN_LOCATION);
+	      update_stmt (join_phi);
+	    }
+
+	  /* Send definition to shared memory block.  */
+
+	  tree sender_ref = build_sender_ref (record_type, var, sender_decl);
+
+	  if (TREE_CODE (var) == SSA_NAME)
+	    {
+	      gassign *send = gimple_build_assign (sender_ref, var);
+	      gimple_seq_add_stmt (&sender_seq, send);
+	      update_stmt (send);
+	    }
+	  else if (TREE_CODE (var) == VAR_DECL)
+	    {
+	      tree tmp = make_ssa_name (TREE_TYPE (var));
+	      gassign *send = gimple_build_assign (tmp, var);
+	      gimple_seq_add_stmt (&sender_seq, send);
+	      update_stmt (send);
+	      send = gimple_build_assign (sender_ref, tmp);
+	      gimple_seq_add_stmt (&sender_seq, send);
+	      update_stmt (send);
+	    }
+	  else
+	    gcc_unreachable ();
+	}
+    }
+
+  /* It's possible for the ET->DEST block (the work done by the active thread)
+     to finish with a control-flow insn, e.g. a UNIQUE function call.  Split
+     the block and add SENDER_SEQ in the latter part to avoid having control
+     flow in the middle of a BB.  */
+
+  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
+  call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
+  gimple_seq_add_stmt (&sender_seq, call);
+
+  gsi = gsi_last_bb (body);
+  gimple *last = gsi_stmt (gsi);
+  basic_block sender_block = split_block (body, last)->dest;
+  gsi = gsi_last_bb (sender_block);
+  gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
+}
+
+static void
+neuter_worker_single (parallel_g *par, unsigned outer_mask,
+		      bitmap worker_single, bitmap vector_single,
+		      vec<propagation_set *> *prop_set,
+		      hash_set<tree> *partitioned_var_uses)
+{
+  unsigned mask = outer_mask | par->mask;
+
+  if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
+    {
+      basic_block block;
+
+      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
+	{
+	  bool has_defs = false;
+	  hash_set<tree> def_escapes_block;
+	  hash_set<tree> worker_partitioned_uses;
+	  unsigned j;
+	  tree var;
+
+	  FOR_EACH_SSA_NAME (j, var, cfun)
+            {
+	      if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
+		{
+		  has_defs = true;
+		  continue;
+		}
+
+	      gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+	      if (gimple_nop_p (def_stmt))
+		continue;
+
+	      if (gimple_bb (def_stmt)->index != block->index)
+	        continue;
+
+	      gimple *use_stmt;
+	      imm_use_iterator use_iter;
+	      bool uses_outside_block = false;
+	      bool worker_partitioned_use = false;
+
+	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
+	        {
+		  int blocknum = gimple_bb (use_stmt)->index;
+
+		  /* Don't propagate SSA names that are only used in the
+		     current block, unless the usage is in a phi node: that
+		     means the name left the block, then came back in at the
+		     top.  */
+		  if (blocknum != block->index
+		      || gimple_code (use_stmt) == GIMPLE_PHI)
+		    uses_outside_block = true;
+		  if (!bitmap_bit_p (worker_single, blocknum))
+		    worker_partitioned_use = true;
+		}
+
+	      if (uses_outside_block)
+		def_escapes_block.add (var);
+
+	      if (worker_partitioned_use)
+		{
+		  worker_partitioned_uses.add (var);
+		  has_defs = true;
+		}
+	    }
+
+	  propagation_set *ws_prop = (*prop_set)[block->index];
+
+	  if (ws_prop)
+	    {
+	      for (propagation_set::iterator it = ws_prop->begin ();
+		   it != ws_prop->end ();
+		   ++it)
+		{
+	          tree var = *it;
+		  if (TREE_CODE (var) == VAR_DECL)
+		    {
+		      def_escapes_block.add (var);
+		      if (partitioned_var_uses->contains (var))
+			{
+			  worker_partitioned_uses.add (var);
+			  has_defs = true;
+			}
+		    }
+		}
+
+	      delete ws_prop;
+	      (*prop_set)[block->index] = 0;
+	    }
+
+	  tree record_type = (tree) block->aux;
+
+	  if (has_defs)
+	    worker_single_copy (block, block, &def_escapes_block,
+				&worker_partitioned_uses, record_type);
+	  else
+	    worker_single_simple (block, block, &def_escapes_block);
+	}
+    }
+
+  if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
+    {
+      basic_block block;
+
+      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
+	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+	     !gsi_end_p (gsi);
+	     gsi_next (&gsi))
+	  {
+	    gimple *stmt = gsi_stmt (gsi);
+
+	    if (gimple_code (stmt) == GIMPLE_CALL
+		&& !gimple_call_internal_p (stmt)
+		&& !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
+	      {
+		/* If we have an OpenACC routine call in worker-single mode,
+		   place barriers before and afterwards to prevent
+		   clobbering re-used shared memory regions (as are used
+		   for AMDGCN at present, for example).  */
+		tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+		gsi_insert_before (&gsi, gimple_build_call (decl, 0),
+				   GSI_SAME_STMT);
+		gsi_insert_after (&gsi, gimple_build_call (decl, 0),
+				  GSI_NEW_STMT);
+	      }
+	  }
+    }
+
+  if (par->inner)
+    neuter_worker_single (par->inner, mask, worker_single, vector_single,
+			  prop_set, partitioned_var_uses);
+  if (par->next)
+    neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
+			  prop_set, partitioned_var_uses);
+}
+
+
+void
+oacc_do_neutering (void)
+{
+  bb_stmt_map_t bb_stmt_map;
+  auto_bitmap worker_single, vector_single;
+
+  omp_sese_split_blocks (&bb_stmt_map);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\n\nAfter splitting:\n\n");
+      dump_function_to_file (current_function_decl, dump_file, dump_flags);
+    }
+
+  unsigned mask = 0;
+
+  /* If this is a routine, calculate MASK as if the outer levels are already
+     partitioned.  */
+  tree attr = oacc_get_fn_attrib (current_function_decl);
+  if (attr)
+    {
+      tree dims = TREE_VALUE (attr);
+      unsigned ix;
+      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+	{
+	  tree allowed = TREE_PURPOSE (dims);
+	  if (allowed && integer_zerop (allowed))
+	    mask |= GOMP_DIM_MASK (ix);
+	}
+    }
+
+  parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
+  populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
+
+  basic_block bb;
+  FOR_ALL_BB_FN (bb, cfun)
+    bb->aux = NULL;
+
+  field_map = record_field_map_t::create_ggc (40);
+
+  vec<propagation_set *> prop_set;
+  prop_set.create (last_basic_block_for_fn (cfun));
+
+  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+    prop_set.quick_push (0);
+
+  find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
+			       &prop_set);
+
+  hash_set<tree> partitioned_var_uses;
+  hash_set<tree> gangprivate_vars;
+
+  find_gangprivate_vars (&gangprivate_vars);
+  find_partitioned_var_uses (par, mask, &partitioned_var_uses);
+  find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
+				&gangprivate_vars, &prop_set);
+
+  FOR_ALL_BB_FN (bb, cfun)
+    {
+      propagation_set *ws_prop = prop_set[bb->index];
+      if (ws_prop)
+	{
+	  tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
+	  tree name = create_tmp_var_name (".oacc_ws_data_s");
+	  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
+	  DECL_ARTIFICIAL (name) = 1;
+	  DECL_NAMELESS (name) = 1;
+	  TYPE_NAME (record_type) = name;
+	  TYPE_ARTIFICIAL (record_type) = 1;
+
+	  auto_vec<tree> field_vec (ws_prop->elements ());
+	  for (hash_set<tree>::iterator it = ws_prop->begin ();
+	       it != ws_prop->end (); ++it)
+	    field_vec.quick_push (*it);
+
+	  field_vec.qsort (sort_by_size_then_ssa_version_or_uid);
+
+	  field_map->put (record_type, field_map_t::create_ggc (17));
+
+	  /* Insert var fields in reverse order, so the last inserted element
+	     is the first in the structure.  */
+	  for (int i = field_vec.length () - 1; i >= 0; i--)
+	    install_var_field (field_vec[i], record_type);
+
+	  layout_type (record_type);
+
+	  bb->aux = (tree) record_type;
+	}
+    }
+
+  neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
+			&partitioned_var_uses);
+
+  prop_set.release ();
+
+  /* This doesn't seem to make a difference.  */
+  loops_state_clear (LOOP_CLOSED_SSA);
+
+  /* Neutering worker-single neutered blocks will invalidate dominance info.
+     It may be possible to incrementally update just the affected blocks, but
+     obliterate everything for now.  */
+  free_dominance_info (CDI_DOMINATORS);
+  free_dominance_info (CDI_POST_DOMINATORS);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "\n\nAfter neutering:\n\n");
+      dump_function_to_file (current_function_decl, dump_file, dump_flags);
+    }
+}
+
+/* Analyse a group of BBs within a partitioned region and create N
+   Single-Entry-Single-Exit regions.  Some of those regions will be
+   trivial ones consisting of a single BB.  The blocks of a
+   partitioned region might form a set of disjoint graphs -- because
+   the region encloses a differently partitoned sub region.
+
+   We use the linear time algorithm described in 'Finding Regions Fast:
+   Single Entry Single Exit and control Regions in Linear Time'
+   Johnson, Pearson & Pingali.  That algorithm deals with complete
+   CFGs, where a back edge is inserted from END to START, and thus the
+   problem becomes one of finding equivalent loops.
+
+   In this case we have a partial CFG.  We complete it by redirecting
+   any incoming edge to the graph to be from an arbitrary external BB,
+   and similarly redirecting any outgoing edge to be to  that BB.
+   Thus we end up with a closed graph.
+
+   The algorithm works by building a spanning tree of an undirected
+   graph and keeping track of back edges from nodes further from the
+   root in the tree to nodes nearer to the root in the tree.  In the
+   description below, the root is up and the tree grows downwards.
+
+   We avoid having to deal with degenerate back-edges to the same
+   block, by splitting each BB into 3 -- one for input edges, one for
+   the node itself and one for the output edges.  Such back edges are
+   referred to as 'Brackets'.  Cycle equivalent nodes will have the
+   same set of brackets.
+
+   Determining bracket equivalency is done by maintaining a list of
+   brackets in such a manner that the list length and final bracket
+   uniquely identify the set.
+
+   We use coloring to mark all BBs with cycle equivalency with the
+   same color.  This is the output of the 'Finding Regions Fast'
+   algorithm.  Notice it doesn't actually find the set of nodes within
+   a particular region, just unorderd sets of nodes that are the
+   entries and exits of SESE regions.
+
+   After determining cycle equivalency, we need to find the minimal
+   set of SESE regions.  Do this with a DFS coloring walk of the
+   complete graph.  We're either 'looking' or 'coloring'.  When
+   looking, and we're in the subgraph, we start coloring the color of
+   the current node, and remember that node as the start of the
+   current color's SESE region.  Every time we go to a new node, we
+   decrement the count of nodes with thet color.  If it reaches zero,
+   we remember that node as the end of the current color's SESE region
+   and return to 'looking'.  Otherwise we color the node the current
+   color.
+
+   This way we end up with coloring the inside of non-trivial SESE
+   regions with the color of that region.  */
+
+/* A node in the undirected CFG.  The discriminator SECOND indicates just
+   above or just below the BB idicated by FIRST.  */
+typedef std::pair<basic_block, int> pseudo_node_t;
+
+/* A bracket indicates an edge towards the root of the spanning tree of the
+   undirected graph.  Each bracket has a color, determined
+   from the currrent set of brackets.  */
+struct bracket
+{
+  pseudo_node_t back; /* Back target */
+
+  /* Current color and size of set.  */
+  unsigned color;
+  unsigned size;
+
+  bracket (pseudo_node_t back_)
+  : back (back_), color (~0u), size (~0u)
+  {
+  }
+
+  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
+  {
+    if (length != size)
+      {
+	size = length;
+	color = color_counts.length ();
+	color_counts.quick_push (0);
+      }
+    color_counts[color]++;
+    return color;
+  }
+};
+
+typedef auto_vec<bracket> bracket_vec_t;
+
+/* Basic block info for finding SESE regions.    */
+
+struct bb_sese
+{
+  int node;  /* Node number in spanning tree.  */
+  int parent; /* Parent node number.  */
+
+  /* The algorithm splits each node A into Ai, A', Ao. The incoming
+     edges arrive at pseudo-node Ai and the outgoing edges leave at
+     pseudo-node Ao.  We have to remember which way we arrived at a
+     particular node when generating the spanning tree.  dir > 0 means
+     we arrived at Ai, dir < 0 means we arrived at Ao.  */
+  int dir;
+
+  /* Lowest numbered pseudo-node reached via a backedge from thsis
+     node, or any descendant.  */
+  pseudo_node_t high;
+
+  int color;  /* Cycle-equivalence color  */
+
+  /* Stack of brackets for this node.  */
+  bracket_vec_t brackets;
+
+  bb_sese (unsigned node_, unsigned p, int dir_)
+  :node (node_), parent (p), dir (dir_)
+  {
+  }
+  ~bb_sese ();
+
+  /* Push a bracket ending at BACK.  */
+  void push (const pseudo_node_t &back)
+  {
+    if (dump_file)
+      fprintf (dump_file, "Pushing backedge %d:%+d\n",
+	       back.first ? back.first->index : 0, back.second);
+    brackets.safe_push (bracket (back));
+  }
+
+  void append (bb_sese *child);
+  void remove (const pseudo_node_t &);
+
+  /* Set node's color.  */
+  void set_color (auto_vec<unsigned> &color_counts)
+  {
+    color = brackets.last ().get_color (color_counts, brackets.length ());
+  }
+};
+
+bb_sese::~bb_sese ()
+{
+}
+
+/* Destructively append CHILD's brackets.  */
+
+void
+bb_sese::append (bb_sese *child)
+{
+  if (int len = child->brackets.length ())
+    {
+      int ix;
+
+      if (dump_file)
+	{
+	  for (ix = 0; ix < len; ix++)
+	    {
+	      const pseudo_node_t &pseudo = child->brackets[ix].back;
+	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
+		       child->node, pseudo.first ? pseudo.first->index : 0,
+		       pseudo.second);
+	    }
+	}
+      if (!brackets.length ())
+	std::swap (brackets, child->brackets);
+      else
+	{
+	  brackets.reserve (len);
+	  for (ix = 0; ix < len; ix++)
+	    brackets.quick_push (child->brackets[ix]);
+	}
+    }
+}
+
+/* Remove brackets that terminate at PSEUDO.  */
+
+void
+bb_sese::remove (const pseudo_node_t &pseudo)
+{
+  unsigned removed = 0;
+  int len = brackets.length ();
+
+  for (int ix = 0; ix < len; ix++)
+    {
+      if (brackets[ix].back == pseudo)
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "Removing backedge %d:%+d\n",
+		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
+	  removed++;
+	}
+      else if (removed)
+	brackets[ix-removed] = brackets[ix];
+    }
+  while (removed--)
+    brackets.pop ();
+}
+
+/* Accessors for BB's aux pointer.  */
+#define BB_SET_SESE(B, S) ((B)->aux = (S))
+#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
+
+/* DFS walk creating SESE data structures.  Only cover nodes with
+   BB_VISITED set.  Append discovered blocks to LIST.  We number in
+   increments of 3 so that the above and below pseudo nodes can be
+   implicitly numbered too.  */
+
+static int
+omp_sese_number (int n, int p, int dir, basic_block b,
+		   auto_vec<basic_block> *list)
+{
+  if (BB_GET_SESE (b))
+    return n;
+
+  if (dump_file)
+    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
+	     b->index, n, p, dir);
+
+  BB_SET_SESE (b, new bb_sese (n, p, dir));
+  p = n;
+
+  n += 3;
+  list->quick_push (b);
+
+  /* First walk the nodes on the 'other side' of this node, then walk
+     the nodes on the same side.  */
+  for (unsigned ix = 2; ix; ix--)
+    {
+      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
+      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
+		       : offsetof (edge_def, src));
+      edge e;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, edges)
+	{
+	  basic_block target = *(basic_block *)((char *)e + offset);
+
+	  if (target->flags & BB_VISITED)
+	    n = omp_sese_number (n, p, dir, target, list);
+	}
+      dir = -dir;
+    }
+  return n;
+}
+
+/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
+   EDGES are the outgoing edges and OFFSET is the offset to the src
+   or dst block on the edges.   */
+
+static void
+omp_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
+		   vec<edge, va_gc> *edges, size_t offset)
+{
+  edge e;
+  edge_iterator ei;
+  int hi_back = depth;
+  pseudo_node_t node_back (0, depth);
+  int hi_child = depth;
+  pseudo_node_t node_child (0, depth);
+  basic_block child = NULL;
+  unsigned num_children = 0;
+  int usd = -dir * sese->dir;
+
+  if (dump_file)
+    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
+	     me->index, sese->node, dir);
+
+  if (dir < 0)
+    {
+      /* This is the above pseudo-child.  It has the BB itself as an
+	 additional child node.  */
+      node_child = sese->high;
+      hi_child = node_child.second;
+      if (node_child.first)
+	hi_child += BB_GET_SESE (node_child.first)->node;
+      num_children++;
+    }
+
+  /* Examine each edge.
+     - if it is a child (a) append its bracket list and (b) record
+          whether it is the child with the highest reaching bracket.
+     - if it is an edge to ancestor, record whether it's the highest
+          reaching backlink.  */
+  FOR_EACH_EDGE (e, ei, edges)
+    {
+      basic_block target = *(basic_block *)((char *)e + offset);
+
+      if (bb_sese *t_sese = BB_GET_SESE (target))
+	{
+	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
+	    {
+	      /* Child node.  Append its bracket list. */
+	      num_children++;
+	      sese->append (t_sese);
+
+	      /* Compare it's hi value.  */
+	      int t_hi = t_sese->high.second;
+
+	      if (basic_block child_hi_block = t_sese->high.first)
+		t_hi += BB_GET_SESE (child_hi_block)->node;
+
+	      if (hi_child > t_hi)
+		{
+		  hi_child = t_hi;
+		  node_child = t_sese->high;
+		  child = target;
+		}
+	    }
+	  else if (t_sese->node < sese->node + dir
+		   && !(dir < 0 && sese->parent == t_sese->node))
+	    {
+	      /* Non-parental ancestor node -- a backlink.  */
+	      int d = usd * t_sese->dir;
+	      int back = t_sese->node + d;
+
+	      if (hi_back > back)
+		{
+		  hi_back = back;
+		  node_back = pseudo_node_t (target, d);
+		}
+	    }
+	}
+      else
+	{ /* Fallen off graph, backlink to entry node.  */
+	  hi_back = 0;
+	  node_back = pseudo_node_t (0, 0);
+	}
+    }
+
+  /* Remove any brackets that terminate at this pseudo node.  */
+  sese->remove (pseudo_node_t (me, dir));
+
+  /* Now push any backlinks from this pseudo node.  */
+  FOR_EACH_EDGE (e, ei, edges)
+    {
+      basic_block target = *(basic_block *)((char *)e + offset);
+      if (bb_sese *t_sese = BB_GET_SESE (target))
+	{
+	  if (t_sese->node < sese->node + dir
+	      && !(dir < 0 && sese->parent == t_sese->node))
+	    /* Non-parental ancestor node - backedge from me.  */
+	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
+	}
+      else
+	{
+	  /* back edge to entry node */
+	  sese->push (pseudo_node_t (0, 0));
+	}
+    }
+
+  /* If this node leads directly or indirectly to a no-return region of
+     the graph, then fake a backedge to entry node.  */
+  if (!sese->brackets.length () || !edges || !edges->length ())
+    {
+      hi_back = 0;
+      node_back = pseudo_node_t (0, 0);
+      sese->push (node_back);
+    }
+
+  /* Record the highest reaching backedge from us or a descendant.  */
+  sese->high = hi_back < hi_child ? node_back : node_child;
+
+  if (num_children > 1)
+    {
+      /* There is more than one child -- this is a Y shaped piece of
+	 spanning tree.  We have to insert a fake backedge from this
+	 node to the highest ancestor reached by not-the-highest
+	 reaching child.  Note that there may be multiple children
+	 with backedges to the same highest node.  That's ok and we
+	 insert the edge to that highest node.  */
+      hi_child = depth;
+      if (dir < 0 && child)
+	{
+	  node_child = sese->high;
+	  hi_child = node_child.second;
+	  if (node_child.first)
+	    hi_child += BB_GET_SESE (node_child.first)->node;
+	}
+
+      FOR_EACH_EDGE (e, ei, edges)
+	{
+	  basic_block target = *(basic_block *)((char *)e + offset);
+
+	  if (target == child)
+	    /* Ignore the highest child. */
+	    continue;
+
+	  bb_sese *t_sese = BB_GET_SESE (target);
+	  if (!t_sese)
+	    continue;
+	  if (t_sese->parent != sese->node)
+	    /* Not a child. */
+	    continue;
+
+	  /* Compare its hi value.  */
+	  int t_hi = t_sese->high.second;
+
+	  if (basic_block child_hi_block = t_sese->high.first)
+	    t_hi += BB_GET_SESE (child_hi_block)->node;
+
+	  if (hi_child > t_hi)
+	    {
+	      hi_child = t_hi;
+	      node_child = t_sese->high;
+	    }
+	}
+
+      sese->push (node_child);
+    }
+}
+
+
+/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
+   proceed to successors.  Set SESE entry and exit nodes of
+   REGIONS.  */
+
+static void
+omp_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
+		basic_block block, int coloring)
+{
+  bb_sese *sese = BB_GET_SESE (block);
+
+  if (block->flags & BB_VISITED)
+    {
+      /* If we've already encountered this block, either we must not
+	 be coloring, or it must have been colored the current color.  */
+      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
+      return;
+    }
+
+  block->flags |= BB_VISITED;
+
+  if (sese)
+    {
+      if (coloring < 0)
+	{
+	  /* Start coloring a region.  */
+	  regions[sese->color].first = block;
+	  coloring = sese->color;
+	}
+
+      if (!--color_counts[sese->color] && sese->color == coloring)
+	{
+	  /* Found final block of SESE region.  */
+	  regions[sese->color].second = block;
+	  coloring = -1;
+	}
+      else
+	/* Color the node, so we can assert on revisiting the node
+	   that the graph is indeed SESE.  */
+	sese->color = coloring;
+    }
+  else
+    /* Fallen off the subgraph, we cannot be coloring.  */
+    gcc_assert (coloring < 0);
+
+  /* Walk each successor block.  */
+  if (block->succs && block->succs->length ())
+    {
+      edge e;
+      edge_iterator ei;
+
+      FOR_EACH_EDGE (e, ei, block->succs)
+	omp_sese_color (color_counts, regions, e->dest, coloring);
+    }
+  else
+    gcc_assert (coloring < 0);
+}
+
+/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
+   end up with NULL entries in it.  */
+
+void
+omp_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
+{
+  basic_block block;
+  int ix;
+
+  /* First clear each BB of the whole function.  */
+  FOR_EACH_BB_FN (block, cfun)
+    {
+      block->flags &= ~BB_VISITED;
+      BB_SET_SESE (block, 0);
+    }
+  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+  block->flags &= ~BB_VISITED;
+  BB_SET_SESE (block, 0);
+  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+  block->flags &= ~BB_VISITED;
+  BB_SET_SESE (block, 0);
+
+  /* Mark blocks in the function that are in this graph.  */
+  for (ix = 0; blocks.iterate (ix, &block); ix++)
+    block->flags |= BB_VISITED;
+
+  /* Counts of nodes assigned to each color.  There cannot be more
+     colors than blocks (and hopefully there will be fewer).  */
+  auto_vec<unsigned> color_counts;
+  color_counts.reserve (blocks.length ());
+
+  /* Worklist of nodes in the spanning tree.  Again, there cannot be
+     more nodes in the tree than blocks (there will be fewer if the
+     CFG of blocks is disjoint).  */
+  auto_vec<basic_block> spanlist;
+  spanlist.reserve (blocks.length ());
+
+  /* Make sure every block has its cycle class determined.  */
+  for (ix = 0; blocks.iterate (ix, &block); ix++)
+    {
+      if (BB_GET_SESE (block))
+	/* We already met this block in an earlier graph solve.  */
+	continue;
+
+      if (dump_file)
+	fprintf (dump_file, "Searching graph starting at %d\n", block->index);
+
+      /* Number the nodes reachable from block initial DFS order.  */
+      int depth = omp_sese_number (2, 0, +1, block, &spanlist);
+
+      /* Now walk in reverse DFS order to find cycle equivalents.  */
+      while (spanlist.length ())
+	{
+	  block = spanlist.pop ();
+	  bb_sese *sese = BB_GET_SESE (block);
+
+	  /* Do the pseudo node below.  */
+	  omp_sese_pseudo (block, sese, depth, +1,
+			     sese->dir > 0 ? block->succs : block->preds,
+			     (sese->dir > 0 ? offsetof (edge_def, dest)
+			      : offsetof (edge_def, src)));
+	  sese->set_color (color_counts);
+	  /* Do the pseudo node above.  */
+	  omp_sese_pseudo (block, sese, depth, -1,
+			     sese->dir < 0 ? block->succs : block->preds,
+			     (sese->dir < 0 ? offsetof (edge_def, dest)
+			      : offsetof (edge_def, src)));
+	}
+      if (dump_file)
+	fprintf (dump_file, "\n");
+    }
+
+  if (dump_file)
+    {
+      unsigned count;
+      const char *comma = "";
+
+      fprintf (dump_file, "Found %d cycle equivalents\n",
+	       color_counts.length ());
+      for (ix = 0; color_counts.iterate (ix, &count); ix++)
+	{
+	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
+
+	  comma = "";
+	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
+	    if (BB_GET_SESE (block)->color == ix)
+	      {
+		block->flags |= BB_VISITED;
+		fprintf (dump_file, "%s%d", comma, block->index);
+		comma=",";
+	      }
+	  fprintf (dump_file, "}");
+	  comma = ", ";
+	}
+      fprintf (dump_file, "\n");
+   }
+
+  /* Now we've colored every block in the subgraph.  We now need to
+     determine the minimal set of SESE regions that cover that
+     subgraph.  Do this with a DFS walk of the complete function.
+     During the walk we're either 'looking' or 'coloring'.  When we
+     reach the last node of a particular color, we stop coloring and
+     return to looking.  */
+
+  /* There cannot be more SESE regions than colors.  */
+  regions.reserve (color_counts.length ());
+  for (ix = color_counts.length (); ix--;)
+    regions.quick_push (bb_pair_t (0, 0));
+
+  for (ix = 0; blocks.iterate (ix, &block); ix++)
+    block->flags &= ~BB_VISITED;
+
+  omp_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
+
+  if (dump_file)
+    {
+      const char *comma = "";
+      int len = regions.length ();
+
+      fprintf (dump_file, "SESE regions:");
+      for (ix = 0; ix != len; ix++)
+	{
+	  basic_block from = regions[ix].first;
+	  basic_block to = regions[ix].second;
+
+	  if (from)
+	    {
+	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
+	      if (to != from)
+		fprintf (dump_file, "->%d", to->index);
+
+	      int color = BB_GET_SESE (from)->color;
+
+	      /* Print the blocks within the region (excluding ends).  */
+	      FOR_EACH_BB_FN (block, cfun)
+		{
+		  bb_sese *sese = BB_GET_SESE (block);
+
+		  if (sese && sese->color == color
+		      && block != from && block != to)
+		    fprintf (dump_file, ".%d", block->index);
+		}
+	      fprintf (dump_file, "}");
+	    }
+	  comma = ",";
+	}
+      fprintf (dump_file, "\n\n");
+    }
+
+  for (ix = 0; blocks.iterate (ix, &block); ix++)
+    delete BB_GET_SESE (block);
+}
+
+#undef BB_SET_SESE
+#undef BB_GET_SESE
diff --git a/gcc/omp-sese.h b/gcc/omp-sese.h
new file mode 100644
index 0000000..0b82a24
--- /dev/null
+++ b/gcc/omp-sese.h
@@ -0,0 +1,32 @@
+/* Find single-entry, single-exit regions for OpenACC.
+
+   Copyright (C) 2005-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_OMP_SESE_H
+#define GCC_OMP_SESE_H
+
+/* A pair of BBs.  We use this to represent SESE regions.  */
+typedef std::pair<basic_block, basic_block> bb_pair_t;
+typedef auto_vec<bb_pair_t> bb_pair_vec_t;
+
+extern void omp_find_sese (auto_vec<basic_block> &blocks,
+			   bb_pair_vec_t &regions);
+extern void oacc_do_neutering (void);
+
+#endif
diff --git a/gcc/passes.def b/gcc/passes.def
index 9a85907..7b2ebff 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -34,6 +34,7 @@
   NEXT_PASS (pass_warn_unused_result);
   NEXT_PASS (pass_diagnose_omp_blocks);
   NEXT_PASS (pass_diagnose_tm_blocks);
+  NEXT_PASS (pass_convert_oacc_kernels);
   NEXT_PASS (pass_lower_omp);
   NEXT_PASS (pass_lower_cf);
   NEXT_PASS (pass_lower_tm);
@@ -177,6 +178,8 @@
   INSERT_PASSES_AFTER (all_passes)
   NEXT_PASS (pass_fixup_cfg);
   NEXT_PASS (pass_lower_eh_dispatch);
+  NEXT_PASS (pass_oacc_loop_designation);
+  NEXT_PASS (pass_oacc_gimple_workers);
   NEXT_PASS (pass_oacc_device_lower);
   NEXT_PASS (pass_omp_device_lower);
   NEXT_PASS (pass_omp_target_link);
diff --git a/gcc/target.def b/gcc/target.def
index 66cee07..d490138 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1719,6 +1719,43 @@
 void, (gcall *call),
 default_goacc_reduction)
 
+DEFHOOK
+(expand_accel_var,
+"This hook, if defined, is used by accelerator target back-ends to expand\n\
+specially handled kinds of VAR_DECL expressions.  A particular use is to\n\
+place variables with specific attributes inside special accelarator\n\
+memories.  A return value of NULL indicates that the target does not\n\
+handle this VAR_DECL, and normal RTL expanding is resumed.",
+rtx, (tree var),
+NULL)
+
+DEFHOOK
+(adjust_private_decl,
+"Tweak variable declaration for a private variable at the specified\n\
+parallelism level.",
+void, (tree var, int),
+NULL)
+
+DEFHOOK
+(create_propagation_record,
+"Create a record used to propagate local-variable state from an active\n\
+worker to other workers.  A possible implementation might adjust the type\n\
+of REC to place the new variable in shared GPU memory.",
+tree, (tree rec, bool sender, const char *name),
+default_goacc_create_propagation_record)
+
+DEFHOOK
+(explode_args,
+"Define this hook to TRUE if arguments to offload regions should be\n\
+exploded, i.e. passed as true arguments rather than in an argument array.",
+bool, (void),
+hook_bool_void_false)
+
+DEFHOOKPOD
+(worker_partitioning,
+"Use gimple transformation for worker neutering/broadcasting.",
+bool, false)
+
 HOOK_VECTOR_END (goacc)
 
 /* Functions relating to vectorization.  */
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 5943627..29b0258 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -125,6 +125,7 @@
 extern int default_goacc_dim_limit (int);
 extern bool default_goacc_fork_join (gcall *, const int [], bool);
 extern void default_goacc_reduction (gcall *);
+extern tree default_goacc_create_propagation_record (tree, bool, const char *);
 
 /* These are here, and not in hooks.[ch], because not all users of
    hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS.  */
diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp
new file mode 100644
index 0000000..1b51698
--- /dev/null
+++ b/gcc/testsuite/ChangeLog.omp
@@ -0,0 +1,417 @@
+2019-10-09  Tobias Burnus  <tobias@codesourcery.com>
+
+	* gfortran.dg/gomp/use_device_ptr1.f90: New.
+	* gfortran.dg/gomp/use_device_ptr2.f90: New.
+	* gfortran.dg/gomp/use_device_ptr3.f90: New.
+
+2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	* testsuite/libgomp.fortran/target-simd.f90: New.
+
+2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-10-08  Tobias Burnus  <tobias@codesourcery.com>
+
+	* gfortran.dg/goacc/continuation-free-form.f95: Update dg-error.
+
+2019-10-02  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-10-02  Tobias Burnus  <tobias@codesourcery.com>
+
+	* gfortran.dg/goacc/asyncwait-1.f95: Handle new error message.
+	* gfortran.dg/goacc/asyncwait-2.f95: Likewise.
+	* gfortran.dg/goacc/asyncwait-3.f95: Likewise.
+	* gfortran.dg/goacc/asyncwait-4.f95: Likewise.
+	* gfortran.dg/goacc/default-2.f: Likewise.
+	* gfortran.dg/goacc/enter-exit-data.f95: Likewise.
+	* gfortran.dg/goacc/if.f95: Likewise.
+	* gfortran.dg/goacc/list.f95: Likewise.
+	* gfortran.dg/goacc/literal.f95: Likewise.
+	* gfortran.dg/goacc/loop-2-kernels-tile.f95: Likewise.
+	* gfortran.dg/goacc/loop-2-parallel-tile.f95: Likewise.
+	* gfortran.dg/goacc/loop-7.f95: Likewise.
+	* gfortran.dg/goacc/parallel-kernels-clauses.f95: Likewise.
+	* gfortran.dg/goacc/routine-6.f90: Likewise.
+	* gfortran.dg/goacc/several-directives.f95: Likewise.
+	* gfortran.dg/goacc/sie.f95: Likewise.
+	* gfortran.dg/goacc/tile-1.f90: Likewise.
+	* gfortran.dg/goacc/update-if_present-2.f90: Likewise.
+	* gfortran.dg/gomp/declare-simd-1.f90: Likewise.
+	* gfortran.dg/gomp/pr29759.f90: Likewise.
+
+2019-09-20  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from mainline
+	2019-09-20  Tobias Burnus  <tobias@codesourcery.com>
+
+	* gfortran.dg/goacc/parameter.f95: Change
+	dg-error as it is now detected earlier.
+	* gfortran.dg/goacc/pr85701.f90: Modify to
+	use a separate result variable.
+	* gfortran.dg/goacc/pr78260.f90: New.
+	* gfortran.dg/goacc/pr78260-2.f90: New.
+	* gfortran.dg/gomp/pr78260.f90: New.
+	* gfortran.dg/gomp/pr78260-2.f90: New.
+	* gfortran.dg/gomp/pr78260-3.f90: New.
+
+2019-09-19  Tobias Burnus  <tobias@codesourcery.com>
+
+	* gfortran.dg/goacc/classify-kernels-unparallelized.f95: Add
+	one dg-message for additional -fopt-info-optimized-omp output.
+	* gfortran.dg/goacc/classify-kernels.f95: Likewise.
+	* gfortran.dg/goacc/kernels-decompose-1.f95: Change 'note' to
+	'optimized' in dg-message.
+
+2019-09-17  Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/goacc/classify-kernels-unparallelized.c: Update expected
+	message/warning/error output.
+	* c-c++-common/goacc/classify-kernels.c: Likewise.
+	* c-c++-common/goacc/kernels-decompose.c: Likewise.
+	* c-c++-common/goacc/note-parallelism-1-kernels-loop-auto.c: Likewise.
+	* c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-loop-auto.c: Likewise.
+	* c-c++-common/goacc/routine-1.c: Likewise.
+	* c-c++-common/goacc/routine-4-extern.c: Likewise.
+
+2019-09-05  Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/goacc/classify-kernels-unparallelized.c,
+	c-c++-common/goacc/classify-kernels.c,
+	c-c++-common/goacc/classify-parallel.c,
+	c-c++-common/goacc/classify-routine.c,
+	gfortran.dg/goacc/classify-kernels-unparallelized.f95,
+	gfortran.dg/goacc/classify-kernels.f95,
+	gfortran.dg/goacc/classify-parallel.f95,
+	gfortran.dg/goacc/classify-routine.f95: Scan oaccloops dump instead of
+	oaccdevlow pass.
+
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/goacc/mdc-1.c: Update clause matching patterns.
+
+2019-07-09  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+	2019-07-08  Jakub Jelinek  <jakub@redhat.com>
+
+	PR c++/91110
+	* g++.dg/gomp/pr91110.C: New test.
+
+2019-07-04  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+	2019-07-04  Andrew Stubbs  <ams@codesourcery.com>
+
+	* g++.dg/gomp/unmappable-1.C: New file.
+
+2019-05-31  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-decompose-1.c: Change 'note:' to
+	'optimized:'.  Fix typo.
+	* c-c++-common/goacc/note-parallelism-1-kernels-conditional-loop-independent_seq.c:
+	Change 'note:' to 'optimized:'.
+	* c-c++-common/goacc/note-parallelism-1-kernels-loop-auto.c: Likewise.
+	* c-c++-common/goacc/note-parallelism-1-kernels-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-1-kernels-loops.c: Likewise.
+	* c-c++-common/goacc/note-parallelism-1-kernels-straight-line.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-combined-kernels-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-conditional-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-loop-auto.c: Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-loops.c: Likewise.
+
+2019-01-09  Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/cpp/openacc-define-3.c: Update expected value for
+	_OPENACC define.
+	* gfortran.dg/openacc-define-3.f90: Likewise.
+
+2019-01-23  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* c-c++-common/goacc/note-parallelism-1-kernels-conditional-loop-independent_seq.c:
+	New file.
+	* c-c++-common/goacc/note-parallelism-1-kernels-loop-auto.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-1-kernels-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-1-kernels-loops.c: Likewise.
+	* c-c++-common/goacc/note-parallelism-1-kernels-straight-line.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-combined-kernels-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-conditional-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-loop-auto.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-loop-independent_seq.c:
+	Likewise.
+	* c-c++-common/goacc/note-parallelism-kernels-loops.c: Likewise.
+	* c-c++-common/goacc/classify-kernels-unparallelized.c: Update.
+	* c-c++-common/goacc/classify-kernels.c: Likewise.
+	* c-c++-common/goacc/classify-parallel.c: Likewise.
+	* c-c++-common/goacc/classify-routine.c: Likewise.
+	* c-c++-common/goacc/dtype-1.c: Likewise.
+	* c-c++-common/goacc/if-clause-2.c: Likewise.
+	* c-c++-common/goacc/kernels-conversion.c: Likewise.
+	* c-c++-common/goacc/kernels-decompose-1.c: Likewise.
+	* c-c++-common/goacc/loop-2-kernels.c: Likewise.
+	* c-c++-common/goacc/note-parallelism.c: Likewise.
+	* c-c++-common/goacc/routine-1.c: Likewise.
+	* c-c++-common/goacc/uninit-dim-clause.c: Likewise.
+	* gfortran.dg/goacc/dtype-1.f95: Likewise.
+	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
+	* gfortran.dg/goacc/kernels-decompose-1.f95: Likewise.
+	* gfortran.dg/goacc/kernels-tree.f95: Likewise.
+
+2019-01-24  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-conversion.c: Adjust test.
+	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
+	* c-c++-common/goacc/kernels-decompose-1.c: New file.
+	* gfortran.dg/goacc/kernels-decompose-1.f95: Likewise.
+
+2019-01-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-conversion.c: Test automatically generated
+	async clauses.
+	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
+
+2019-01-23  Gergö Barany  <gergo@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-conversion.c: Add test for conditionally
+	executed code.
+	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
+
+2019-01-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-conversion.c: Test for a gang-single
+	region.
+	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
+
+2019-01-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-conversion.c: New test.
+	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
+	* c-c++-common/goacc/if-clause-2.c: Update.
+	* gfortran.dg/goacc/kernels-tree.f95: Likewise.
+
+2019-01-30  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-1.c: Add
+	"-fopenacc-kernels=parloops".
+	* c-c++-common/goacc/kernels-acc-loop-reduction.c: Likewise.
+	* c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-2.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-3.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-4.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-5.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-6.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-7.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-8.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-ipa-pta-2.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-ipa-pta-3.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-ipa-pta-4.c: Likewise.
+	* c-c++-common/goacc/kernels-alias-ipa-pta.c: Likewise.
+	* c-c++-common/goacc/kernels-alias.c: Likewise.
+	* c-c++-common/goacc/kernels-counter-var-redundant-load.c:
+	Likewise.
+	* c-c++-common/goacc/kernels-counter-vars-function-scope.c:
+	Likewise.
+	* c-c++-common/goacc/kernels-double-reduction-n.c: Likewise.
+	* c-c++-common/goacc/kernels-double-reduction.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-2-acc-loop.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-2.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-3-acc-loop.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-3.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-acc-loop.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data-2.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data-enter-exit-2.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data-enter-exit.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data-update.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-data.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-g.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-mod-not-zero.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-n-acc-loop.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-n.c: Likewise.
+	* c-c++-common/goacc/kernels-loop-nest.c: Likewise.
+	* c-c++-common/goacc/kernels-loop.c: Likewise.
+	* c-c++-common/goacc/kernels-one-counter-var.c: Likewise.
+	* c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c:
+	Likewise.
+	* c-c++-common/goacc/kernels-reduction.c: Likewise.
+	* gfortran.dg/goacc/kernels-alias-2.f95: Likewise.
+	* gfortran.dg/goacc/kernels-alias-3.f95: Likewise.
+	* gfortran.dg/goacc/kernels-alias-4.f95: Likewise.
+	* gfortran.dg/goacc/kernels-alias.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-2.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-data-2.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-data-enter-exit.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-data-update.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-data.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-inner.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop-n.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loop.f95: Likewise.
+	* gfortran.dg/goacc/kernels-loops-adjacent.f95: Likewise.
+	* gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95:
+	Likewise.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* c-c++-common/goacc/serial-dims.c: New test.
+
+2017-12-21  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-c++-common/goacc/large_array.c: New test.
+
+2018-12-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* c-c++-common/goacc/host_data-1.c: Add tests of if and if_present
+	clauses on host_data.
+	* gfortran.dg/goacc/host_data-tree.f95: Likewise.
+
+2018-12-20  Gergö Barany  <gergo@codesourcery.com>
+
+	* c-c++-common/goacc/nested-reductions-fail.c: New test.
+	* c-c++-common/goacc/nested-reductions.c: New test.
+	* c-c++-common/goacc/reduction-6.c: Adjust.
+
+2018-10-02  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* gfortran.dg/goacc/pr72741.f90: Update test for current diagnostics.
+	* gfortran.dg/goacc/pr72741-2.f: New test.
+	* gfortran.dg/goacc/pr72741-intrinsic-1.f: New test.
+	* gfortran.dg/goacc/pr72741-intrinsic-2.f: New test.
+
+2018-09-05  Cesar Philippidis  <cesar@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* c-c++-common/goacc/reduction-8.c: New test.
+
+2018-09-20  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-c++-common/goacc/loop-auto-1.c: Adjust test case to conform to
+	the new behavior of the auto clause in OpenACC 2.5.
+	* c-c++-common/goacc/loop-auto-2.c: Likewise.
+	* gcc.dg/goacc/loop-processing-1.c: Likewise.
+	* c-c++-common/goacc/loop-auto-3.c: New test.
+	* gfortran.dg/goacc/loop-auto-1.f90: New test.
+
+2018-10-04  Cesar Philippidis  <cesar@codesourcery.com>
+            Julian Brown  <julian@codesourcery.com>
+
+	* gfortran.dg/goacc/declare-allocatable-1.f90: New test.
+
+2018-08-28  Julian Brown  <julian@codesourcery.com>
+            Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-c++-common/goacc/acc-data-chain.c: New test.
+	* gfortran.dg/goacc/pr70828.f90: New test.
+	* gfortran.dg/goacc/pr70828-2.f90: New test.
+
+2018-10-05  Nathan Sidwell  <nathan@acm.org>
+	    Tom de Vries  <tdevries@suse.de>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/goacc/acc-icf.c: Update.
+	* c-c++-common/goacc/parallel-dims-1.c: Likewise.
+	* c-c++-common/goacc/parallel-reduction.c: Likewise.
+	* c-c++-common/goacc/pr70688.c: Likewise.
+	* c-c++-common/goacc/routine-1.c: Likewise.
+	* c-c++-common/goacc/uninit-dim-clause.c: Likewise.
+	* gfortran.dg/goacc/parallel-tree.f95: Likewise.
+	* gfortran.dg/goacc/routine-4.f90: Likewise.
+	* gfortran.dg/goacc/routine-level-of-parallelism-1.f90: Likewise.
+	* gfortran.dg/goacc/uninit-dim-clause.f95: Likewise.
+
+2019-09-20  Chung-Lin Tang <cltang@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
+	* c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
+	* c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
+	* c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
+	* c-c++-common/goacc/kernels-acc-loop-reduction.c: New test.
+	* c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test.
+
+2018-10-22  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* g++.dg/goacc/loop-1.c: New test.
+	* g++.dg/goacc/loop-2.c: New test.
+	* g++.dg/goacc/loop-3.c: New test.
+
+2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
+            Nathan Sidwell  <nathan@acm.org>
+            Julian Brown  <julian@codesourcery.com>
+
+        * c-c++-common/goacc/orphan-reductions-1.c: New test.
+        * c-c++-common/goacc/reduction-7.c: New test.
+        * c-c++-common/goacc/routine-4.c: Update.
+        * g++.dg/goacc/reductions-1.C: New test.
+        * gcc.dg/goacc/loop-processing-1.c: Update.
+        * gfortran.dg/goacc/orphan-reductions-1.f90: New test.
+
+2018-06-29  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+
+	* c-c++-common/goacc/deviceptr-4.c: Update.
+	* gfortran.dg/goacc/common-block-1.f90: New test.
+	* gfortran.dg/goacc/common-block-2.f90: New test.
+	* gfortran.dg/goacc/loop-2.f95: Update.
+	* gfortran.dg/goacc/loop-3-2.f95: Update.
+	* gfortran.dg/goacc/loop-3.f95: Update.
+	* gfortran.dg/goacc/loop-5.f95: Update.
+	* gfortran.dg/goacc/pr72715.f90: New test.
+	* gfortran.dg/goacc/sie.f95: Update.
+	* gfortran.dg/goacc/tile-1.f90: Update.
+	* gfortran.dg/gomp/pr77516.f90: Update.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-c++-common/goacc/routine-level-of-parallelism-1.c: New test.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* c-c++-common/goacc/classify-routine.c: Adjust test.
+	* c-c++-common/goacc/routine-1.c: Likewise.
+	* c-c++-common/goacc/routine-2.c: Likewise.
+	* c-c++-common/goacc/routine-nohost-1.c: New test.
+	* g++.dg/goacc/routine-2.C: Adjust test.
+	* gfortran.dg/goacc/pr72741.f90: New test.
+
+2019-02-05  Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/goacc/deep-copy-multidim.c: Add test.
+
+2019-01-31  Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/goacc/deep-copy-arrayofstruct.c: New test.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* c-c++-common/goacc/mdc-1.c: New test.
+	* c-c++-common/goacc/mdc-2.c: New test.
+	* gcc.dg/goacc/mdc.C: New test.
+	* gfortran.dg/goacc/data-clauses.f95: New test.
+	* gfortran.dg/goacc/derived-types.f90: New test.
+	* gfortran.dg/goacc/enter-exit-data.f95: New test.
+
diff --git a/gcc/testsuite/c-c++-common/cpp/openacc-define-3.c b/gcc/testsuite/c-c++-common/cpp/openacc-define-3.c
index ccedcd9..f2122f5 100644
--- a/gcc/testsuite/c-c++-common/cpp/openacc-define-3.c
+++ b/gcc/testsuite/c-c++-common/cpp/openacc-define-3.c
@@ -6,6 +6,6 @@
 # error _OPENACC not defined
 #endif
 
-#if _OPENACC != 201306
+#if _OPENACC != 201711
 # error _OPENACC defined to wrong value
 #endif
diff --git a/gcc/testsuite/c-c++-common/goacc/acc-data-chain.c b/gcc/testsuite/c-c++-common/goacc/acc-data-chain.c
new file mode 100644
index 0000000..8a039be
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/acc-data-chain.c
@@ -0,0 +1,24 @@
+/* Ensure that the gimplifier does not remove any existing clauses as
+   it inserts new implicit data clauses.  */
+
+/* { dg-additional-options "-fdump-tree-gimple" }  */
+
+#define N 100
+static int a[N], b[N];
+
+int main(int argc, char *argv[])
+{
+  int i;
+
+#pragma acc data copyin(a[0:N]) copyout (b[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      b[i] = a[i];
+  }
+
+ return 0;
+}
+
+// { dg-final { scan-tree-dump-times "omp target oacc_data map\\(from:b\\\[0\\\] \\\[len: 400\\\]\\) map\\(to:a\\\[0\\\] \\\[len: 400\\\]\\)" 1 "gimple" } }
+// { dg-final { scan-tree-dump-times "omp target oacc_parallel map\\(force_present:b\\\[0\\\] \\\[len: 400\\\]\\) map.alloc:b \\\[pointer assign, bias: 0\\\]\\) map\\(force_present:a\\\[0\\\] \\\[len: 400\\\]\\) map\\(alloc:a \\\[pointer assign, bias: 0\\\]\\)" 1 "gimple" } }
diff --git a/gcc/testsuite/c-c++-common/goacc/acc-icf.c b/gcc/testsuite/c-c++-common/goacc/acc-icf.c
index ecfe3f2..fb2c791 100644
--- a/gcc/testsuite/c-c++-common/goacc/acc-icf.c
+++ b/gcc/testsuite/c-c++-common/goacc/acc-icf.c
@@ -4,7 +4,7 @@
 
 #pragma acc routine gang
 int
-routine1 (int n)
+routine1 (int n) /* { dg-bogus "region is worker partitioned but does not contain worker partitioned code" "" { xfail *-*-* } } */
 {
   int i;
 
@@ -17,7 +17,7 @@
 
 #pragma acc routine gang
 int
-routine2 (int n)
+routine2 (int n) /* { dg-bogus "region is worker partitioned but does not contain worker partitioned code" "" { xfail *-*-* } } */
 {
   int i;
 
diff --git a/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized.c b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized.c
index d4c4b2c..32af533 100644
--- a/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized.c
+++ b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized.c
@@ -1,11 +1,11 @@
 /* Check offloaded function's attributes and classification for unparallelized
-   OpenACC kernels.  */
+   OpenACC 'kernels'.  */
 
 /* { dg-additional-options "-O2" }
    { dg-additional-options "-fopt-info-optimized-omp" }
    { dg-additional-options "-fdump-tree-ompexp" }
    { dg-additional-options "-fdump-tree-parloops1-all" }
-   { dg-additional-options "-fdump-tree-oaccdevlow" } */
+   { dg-additional-options "-fdump-tree-oaccloops" } */
 
 #define N 1024
 
@@ -13,14 +13,15 @@
 extern unsigned int *__restrict b;
 extern unsigned int *__restrict c;
 
-/* An "extern"al mapping of loop iterations/array indices makes the loop
-   unparallelizable.  */
 extern unsigned int f (unsigned int);
+#pragma acc routine (f) seq
 
 void KERNELS ()
 {
 #pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
-  for (unsigned int i = 0; i < N; i++)
+  for (unsigned int i = 0; i < N; i++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    /* An "extern"al mapping of loop iterations/array indices makes the loop
+       unparallelizable.  */
     c[i] = a[f (i)] + b[f (i)];
 }
 
@@ -35,6 +36,6 @@
 
 /* Check the offloaded function's classification and compute dimensions (will
    always be 1 x 1 x 1 for non-offloading compilation).
-   { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccdevlow" } } */
+   { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/classify-kernels.c b/gcc/testsuite/c-c++-common/goacc/classify-kernels.c
index 16e9b9e..93e6406 100644
--- a/gcc/testsuite/c-c++-common/goacc/classify-kernels.c
+++ b/gcc/testsuite/c-c++-common/goacc/classify-kernels.c
@@ -1,11 +1,11 @@
 /* Check offloaded function's attributes and classification for OpenACC
-   kernels.  */
+   'kernels'.  */
 
 /* { dg-additional-options "-O2" }
    { dg-additional-options "-fopt-info-optimized-omp" }
    { dg-additional-options "-fdump-tree-ompexp" }
    { dg-additional-options "-fdump-tree-parloops1-all" }
-   { dg-additional-options "-fdump-tree-oaccdevlow" } */
+   { dg-additional-options "-fdump-tree-oaccloops" } */
 
 #define N 1024
 
@@ -16,7 +16,7 @@
 void KERNELS ()
 {
 #pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
-  for (unsigned int i = 0; i < N; i++)
+  for (unsigned int i = 0; i < N; i++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
     c[i] = a[i] + b[i];
 }
 
@@ -31,6 +31,6 @@
 
 /* Check the offloaded function's classification and compute dimensions (will
    always be 1 x 1 x 1 for non-offloading compilation).
-   { dg-final { scan-tree-dump-times "(?n)Function is parallelized OpenACC kernels offload" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccdevlow" } } */
+   { dg-final { scan-tree-dump-times "(?n)Function is parallelized OpenACC kernels offload" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/classify-parallel.c b/gcc/testsuite/c-c++-common/goacc/classify-parallel.c
index 66a6d13..fa2e476 100644
--- a/gcc/testsuite/c-c++-common/goacc/classify-parallel.c
+++ b/gcc/testsuite/c-c++-common/goacc/classify-parallel.c
@@ -1,10 +1,10 @@
 /* Check offloaded function's attributes and classification for OpenACC
-   parallel.  */
+   'parallel'.  */
 
 /* { dg-additional-options "-O2" }
    { dg-additional-options "-fopt-info-optimized-omp" }
    { dg-additional-options "-fdump-tree-ompexp" }
-   { dg-additional-options "-fdump-tree-oaccdevlow" } */
+   { dg-additional-options "-fdump-tree-oaccloops" } */
 
 #define N 1024
 
@@ -24,6 +24,6 @@
 
 /* Check the offloaded function's classification and compute dimensions (will
    always be 1 x 1 x 1 for non-offloading compilation).
-   { dg-final { scan-tree-dump-times "(?n)Function is OpenACC parallel offload" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), omp target entrypoint\\)\\)" 1 "oaccdevlow" } } */
+   { dg-final { scan-tree-dump-times "(?n)Function is OpenACC parallel offload" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), omp target entrypoint\\)\\)" 1 "oaccloops" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/classify-routine.c b/gcc/testsuite/c-c++-common/goacc/classify-routine.c
index a723d2c..af0069a 100644
--- a/gcc/testsuite/c-c++-common/goacc/classify-routine.c
+++ b/gcc/testsuite/c-c++-common/goacc/classify-routine.c
@@ -1,10 +1,10 @@
 /* Check offloaded function's attributes and classification for OpenACC
-   routine.  */
+   'routine'.  */
 
 /* { dg-additional-options "-O2" }
    { dg-additional-options "-fopt-info-optimized-omp" }
    { dg-additional-options "-fdump-tree-ompexp" }
-   { dg-additional-options "-fdump-tree-oaccdevlow" } */
+   { dg-additional-options "-fdump-tree-oaccloops" } */
 
 #define N 1024
 
@@ -22,10 +22,10 @@
 }
 
 /* Check the offloaded function's attributes.
-   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(omp declare target, oacc function \\(0 1, 1 0, 1 0\\)\\)\\)" 1 "ompexp" } } */
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(omp declare target \\(worker\\), oacc function \\(0 1, 1 0, 1 0\\)\\)\\)" 1 "ompexp" } } */
 
 /* Check the offloaded function's classification and compute dimensions (will
    always be 1 x 1 x 1 for non-offloading compilation).
-   { dg-final { scan-tree-dump-times "(?n)Function is OpenACC routine level 1" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(0 1, 1 1, 1 1\\), omp declare target, oacc function \\(0 1, 1 0, 1 0\\)\\)\\)" 1 "oaccdevlow" } } */
+   { dg-final { scan-tree-dump-times "(?n)Function is OpenACC routine level 1" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(0 1, 1 1, 1 1\\), omp declare target \\(worker\\), oacc function \\(0 1, 1 0, 1 0\\)\\)\\)" 1 "oaccloops" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/deep-copy-arrayofstruct.c b/gcc/testsuite/c-c++-common/goacc/deep-copy-arrayofstruct.c
new file mode 100644
index 0000000..d411bcf
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/deep-copy-arrayofstruct.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+typedef struct {
+  int *a;
+  int *b;
+  int *c;
+} mystruct;
+
+int main(int argc, char* argv[])
+{
+  const int N = 1024;
+  const int S = 32;
+  mystruct *m = (mystruct *) calloc (S, sizeof (*m));
+  int i, j;
+
+  for (i = 0; i < S; i++)
+    {
+      m[i].a = (int *) malloc (N * sizeof (int));
+      m[i].b = (int *) malloc (N * sizeof (int));
+      m[i].c = (int *) malloc (N * sizeof (int));
+    }
+
+  for (j = 0; j < S; j++)
+    for (i = 0; i < N; i++)
+      {
+	m[j].a[i] = 0;
+	m[j].b[i] = 0;
+	m[j].c[i] = 0;
+      }
+
+#pragma acc enter data copyin(m[0:1])
+
+  for (int i = 0; i < 99; i++)
+    {
+      int j, k;
+      for (k = 0; k < S; k++)
+#pragma acc parallel loop copy(m[k].a[0:N]) /* { dg-error "expected .\\\). before .\\\.. token" } */
+        for (j = 0; j < N; j++)
+          m[k].a[j]++;
+
+      for (k = 0; k < S; k++)
+#pragma acc parallel loop copy(m[k].b[0:N], m[k].c[5:N-10]) /* { dg-error "expected .\\\). before .\\\.. token" } */
+	/* { dg-error ".m. appears more than once in data clauses" "" { target c++ } .-1 } */
+	for (j = 0; j < N; j++)
+	  {
+	    m[k].b[j]++;
+	    if (j > 5 && j < N - 5)
+	      m[k].c[j]++;
+	}
+    }
+
+#pragma acc exit data copyout(m[0:1])
+
+  for (j = 0; j < S; j++)
+    {
+      for (i = 0; i < N; i++)
+	{
+	  if (m[j].a[i] != 99)
+	    abort ();
+	  if (m[j].b[i] != 99)
+	    abort ();
+	  if (i > 5 && i < N-5)
+	    {
+	      if (m[j].c[i] != 99)
+		abort ();
+	    }
+	  else
+	    {
+	      if (m[j].c[i] != 0)
+		abort ();
+	    }
+	}
+
+      free (m[j].a);
+      free (m[j].b);
+      free (m[j].c);
+    }
+  free (m);
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/deep-copy-multidim.c b/gcc/testsuite/c-c++-common/goacc/deep-copy-multidim.c
new file mode 100644
index 0000000..1696f0c
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/deep-copy-multidim.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+
+#include <stdlib.h>
+#include <assert.h>
+
+struct dc
+{
+  int a;
+  int **b;
+};
+
+int
+main ()
+{
+  int n = 100, i, j;
+  struct dc v = { .a = 3 };
+
+  v.b = (int **) malloc (sizeof (int *) * n);
+  for (i = 0; i < n; i++)
+    v.b[i] = (int *) malloc (sizeof (int) * n);
+
+#pragma acc parallel loop copy(v.a, v.b[:n][:n]) /* { dg-error "dynamic arrays cannot be used within structs" } */
+  for (i = 0; i < n; i++)
+    for (j = 0; j < n; j++)
+      v.b[i][j] = v.a + i + j;
+
+  for (i = 0; i < n; i++)
+    for (j = 0; j < n; j++)
+      assert (v.b[i][j] == v.a + i + j);
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c b/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c
index db1b916..79a5162 100644
--- a/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c
+++ b/gcc/testsuite/c-c++-common/goacc/deviceptr-4.c
@@ -8,4 +8,4 @@
   a[0] += 1.0;
 }
 
-/* { dg-final { scan-tree-dump-times "#pragma omp target oacc_parallel.*map\\(tofrom:a" 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "#pragma omp target oacc_parallel.*map\\(force_deviceptr:a" 1 "gimple" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/host_data-1.c b/gcc/testsuite/c-c++-common/goacc/host_data-1.c
index 0c7a857..658b7a6 100644
--- a/gcc/testsuite/c-c++-common/goacc/host_data-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/host_data-1.c
@@ -7,6 +7,9 @@
 {
 #pragma acc host_data use_device(v1)
   ;
+
+#pragma acc host_data use_device(v1) if_present
+  ;
 }
 
 
@@ -16,9 +19,32 @@
 foo (float *x, float *y)
 {
   int n = 1 << 10;
-#pragma acc data create(x[0:n]) copyout(y[0:n])
+#pragma acc data create(x[0:n])
   {
+    bar (x, y);
+
+    /* This should fail at run time because y is not mapped.  */
 #pragma acc host_data use_device(x,y)
     bar (x, y);
+
+    /* y is still not mapped, but this should not fail at run time but
+       continue execution with y remaining as the host address.  */
+#pragma acc host_data use_device(x,y) if_present
+    bar (x, y);
+
+#pragma acc data copyout(y[0:n])
+    {
+#pragma acc host_data use_device(x,y)
+      bar (x, y);
+
+#pragma acc host_data use_device(x,y) if_present
+      bar (x, y);
+
+#pragma acc host_data use_device(x,y) if(x != y)
+      bar (x, y);
+
+#pragma acc host_data use_device(x,y) if_present if(x != y)
+      bar (x, y);
+    }
   }
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/if-clause-2.c b/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
index 5ab8459..9920b4f 100644
--- a/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/if-clause-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fdump-tree-convert_oacc_kernels" } */
+
 void
 f (short c)
 {
@@ -9,3 +11,7 @@
   ;
 #pragma acc update device(c) if(c)
 }
+
+/* Verify that the 'if' clause gets duplicated.
+   { dg-final { scan-tree-dump-times "#pragma omp target oacc_data_kernels if\\(" 1 "convert_oacc_kernels" } }
+   { dg-final { scan-tree-dump-times "#pragma omp target oacc_parallel_kernels_gang_single .* if\\(" 1 "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-1.c b/gcc/testsuite/c-c++-common/goacc/kernels-1.c
index 016abbd..ba3169a 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-1.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-fopt-info-optimized-omp" } */
 
 int
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
new file mode 100644
index 0000000..19739ad
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
@@ -0,0 +1,25 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n, unsigned int *a)
+{
+  unsigned int sum = 0;
+
+#pragma acc kernels loop gang reduction(+:sum)
+  for (int i = 0; i < n; i++)
+    sum += a[i];
+
+  return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
new file mode 100644
index 0000000..0dd95b3
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
@@ -0,0 +1,25 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n)
+{
+  unsigned int sum = 1;
+
+  #pragma acc kernels loop
+  for (int i = 1; i <= n; i++)
+    sum += i;
+
+  return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-2.c
index 7576a64..57f1e08 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-3.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-3.c
index 2934f12..fa8afe2 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-3.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-3.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-4.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-4.c
index f6ee5b5..e5c264a 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-4.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-4.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-5.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-5.c
index 74425fb..9fb3189 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-5.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-5.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-6.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-6.c
index 908e1ca..015bded 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-6.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-6.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-7.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-7.c
index 923d002..0c828cc 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-7.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-7.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c
index 69200cc..902e71d 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-8.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-2.c
index f16d698..7a57477c 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-3.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-3.c
index 1ea0e73..879141d 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-3.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-3.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-4.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-4.c
index 20b21dc..41f90158 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-4.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-4.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c
index 969b466..e587f96 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias.c
index e8ff018..bbc0508 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-alias.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-ealias-all" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
new file mode 100644
index 0000000..8cb63f0
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -0,0 +1,61 @@
+/* { dg-additional-options "-fdump-tree-convert_oacc_kernels" } */
+
+#define N 1024
+
+unsigned int a[N];
+
+int
+main (void)
+{
+  int i;
+  unsigned int sum = 1;
+
+#pragma acc kernels copyin(a[0:N]) copy(sum)
+  {
+    /* converted to "oacc_kernels" */
+    #pragma acc loop
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+
+    /* converted to "oacc_parallel_kernels_gang_single" */
+    sum++;
+    a[0]++;
+
+    /* converted to "oacc_parallel_kernels_parallelized" */
+    #pragma acc loop independent
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+
+    /* converted to "oacc_kernels" */
+    if (sum > 10)
+      { 
+        #pragma acc loop
+        for (i = 0; i < N; ++i)
+          sum += a[i];
+      }
+
+    /* converted to "oacc_kernels" */
+    #pragma acc loop auto
+    for (i = 0; i < N; ++i)
+      sum += a[i];
+  }
+
+  return 0;
+}
+
+/* Check that the kernels region is split into a data region and enclosed
+   parallel regions.  */ 
+/* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "convert_oacc_kernels" } } */
+
+/* As noted in the comments above, we get one gang-single serial region; one
+   parallelized loop region; and three "old-style" kernel regions. */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels " 3 "convert_oacc_kernels" } } */
+
+/* Each of the parallel regions is async, and there is a final call to
+   __builtin_GOACC_wait.  */
+/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 3 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single async\\(-1\\)" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized async\\(-1\\)" 1 "convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c b/gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
index 0304254..dbb04d3 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-counter-var-redundant-load.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-dom3" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c b/gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c
index c475333..f40de67 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-counter-vars-function-scope.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
new file mode 100644
index 0000000..feee09e
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1.c
@@ -0,0 +1,122 @@
+/* Test OpenACC 'kernels' construct decomposition.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-O2" } for "parloops".  */
+
+/* See also "../../gfortran.dg/goacc/kernels-decompose-1.f95".  */
+
+#pragma acc routine gang
+extern int
+f_g (int);
+
+#pragma acc routine worker
+extern int
+f_w (int);
+
+#pragma acc routine vector
+extern int
+f_v (int);
+
+#pragma acc routine seq
+extern int
+f_s (int);
+
+int
+main ()
+{
+  int x, y, z;
+#define N 10
+  int a[N], b[N], c[N];
+
+#pragma acc kernels
+  {
+    x = 0; /* { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+    y = x < 10;
+    z = x++;
+    ;
+  }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  for (int i = 0; i < N; i++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    a[i] = 0;
+
+#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (int i = 0; i < N; i++)
+    b[i] = a[N - i - 1];
+
+#pragma acc kernels
+  {
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      b[i] = a[N - i - 1];
+
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      c[i] = a[i] * b[i];
+
+    a[z] = 0; /* { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      c[i] += a[i];
+
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+    for (int i = 0 + 1; i < N; i++)
+      c[i] += c[i - 1];
+  }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+  {
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; ++i)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+      for (int j = 0; j < N; ++j)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+	 /* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } */
+	for (int k = 0; k < N; ++k)
+	  a[(i + j + k) % N]
+	    = b[j]
+	    + f_v (c[k]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+
+    //TODO Should the following turn into "gang-single" instead of "parloops"?
+    //TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else goto <D.2548>;", thus "parloops".
+    if (y < 5) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+#pragma acc loop independent /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" } */
+      for (int j = 0; j < N; ++j)
+	b[j] = f_w (c[j]);
+  }
+
+#pragma acc kernels /* { dg-warning "region contains gang partitioned code but is not gang partitioned" } */
+  {
+    /* { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" "" { target *-*-* } .+1 } */
+    y = f_g (a[5]); /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_w (c[j]); /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+  }
+
+#pragma acc kernels
+  {
+    y = 3; /* { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+    /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_v (c[j]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+
+    z = 2; /* { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+  }
+
+#pragma acc kernels /* { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+  ;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction-n.c b/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction-n.c
index 8f7f415..d456925 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction-n.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction-n.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fopt-info-optimized-omp" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c
index c11d36f..caab7c8 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-double-reduction.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fopt-info-optimized-omp" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
new file mode 100644
index 0000000..c6e66fc
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
@@ -0,0 +1,20 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-2.c"
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c
index acef6a1..238956c 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
new file mode 100644
index 0000000..4d04149
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-3.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
index 75e2bb7..2bbb071 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
new file mode 100644
index 0000000..54b5946
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
index 7180021..e7830b6 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
index 0c9f833..b5c2670 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
index 0bd21b6..84f92a9 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-enter-exit.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
index dd5a841..dbdce45 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data-update.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
index a658182..23f4e22 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-data.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c
index 73b469d..2cbd6be 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-g.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-g" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c
index 5592623..28480ae 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-mod-not-zero.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
new file mode 100644
index 0000000..164c394
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
@@ -0,0 +1,17 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-n.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c
index e86be1b..26bc3e0 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
index 2b0e186..b3fdde1 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop.c
index 9619d53..d0423a8 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-loop.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c b/gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c
index 69539b2..15a8d37 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-one-counter-var.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c b/gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
index 81b0fee..457a79a 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-parallel-loop-data-enter-exit.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
index 5921b88..7603988 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-reduction.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-O2" } */
 /* { dg-additional-options "-fdump-tree-parloops1-all" } */
 /* { dg-additional-options "-fdump-tree-optimized" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/large_array.c b/gcc/testsuite/c-c++-common/goacc/large_array.c
new file mode 100644
index 0000000..ce0f4c1
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/large_array.c
@@ -0,0 +1,18 @@
+/* Ensure that alloca'ed arrays can be transferred to the
+   accelerator. */
+
+/* { dg-require-effective-target alloca } */
+
+int
+main ()
+{
+  int n = 100, m = 10, i, j;
+  float a[n][m];
+
+  #pragma acc parallel loop
+  for (i = 0; i < n; i++)
+    for (j = 0; j < m; j++)
+      a[i][j] = 0;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c b/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c
index 0151508..c989222 100644
--- a/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c
+++ b/gcc/testsuite/c-c++-common/goacc/loop-2-kernels.c
@@ -37,7 +37,7 @@
 	for (j = 0; j < 10; j++)
 	  { }
       }
-#pragma acc loop seq gang // { dg-error "'seq' overrides" }
+#pragma acc loop seq gang // { dg-error "'seq' overrides" "TODO" { xfail *-*-* } }
     for (i = 0; i < 10; i++)
       { }
 
@@ -63,7 +63,7 @@
 	for (j = 0; j < 10; j++)
 	  { }
       }
-#pragma acc loop seq worker // { dg-error "'seq' overrides" }
+#pragma acc loop seq worker // { dg-error "'seq' overrides" "TODO" { xfail *-*-* } }
     for (i = 0; i < 10; i++)
       { }
 #pragma acc loop gang worker
@@ -92,7 +92,7 @@
 	for (j = 1; j < 10; j++)
 	  { }
       }
-#pragma acc loop seq vector // { dg-error "'seq' overrides" }
+#pragma acc loop seq vector // { dg-error "'seq' overrides" "TODO" { xfail *-*-* } }
     for (i = 0; i < 10; i++)
       { }
 #pragma acc loop gang vector
@@ -105,7 +105,7 @@
 #pragma acc loop auto
     for (i = 0; i < 10; i++)
       { }
-#pragma acc loop seq auto // { dg-error "'seq' overrides" }
+#pragma acc loop seq auto // { dg-error "'seq' overrides" "TODO" { xfail *-*-* } }
     for (i = 0; i < 10; i++)
       { }
 #pragma acc loop gang auto // { dg-error "'auto' conflicts" }
@@ -147,7 +147,7 @@
 #pragma acc kernels loop worker(num:5)
   for (i = 0; i < 10; i++)
     { }
-#pragma acc kernels loop seq worker // { dg-error "'seq' overrides" }
+#pragma acc kernels loop seq worker // { dg-error "'seq' overrides" "TODO" { xfail *-*-* } }
   for (i = 0; i < 10; i++)
     { }
 #pragma acc kernels loop gang worker
@@ -163,7 +163,7 @@
 #pragma acc kernels loop vector(length:5)
   for (i = 0; i < 10; i++)
     { }
-#pragma acc kernels loop seq vector // { dg-error "'seq' overrides" }
+#pragma acc kernels loop seq vector // { dg-error "'seq' overrides" "TODO" { xfail *-*-* } }
   for (i = 0; i < 10; i++)
     { }
 #pragma acc kernels loop gang vector
@@ -176,7 +176,7 @@
 #pragma acc kernels loop auto
   for (i = 0; i < 10; i++)
     { }
-#pragma acc kernels loop seq auto // { dg-error "'seq' overrides" }
+#pragma acc kernels loop seq auto // { dg-error "'seq' overrides" "TODO" { xfail *-*-* } }
   for (i = 0; i < 10; i++)
     { }
 #pragma acc kernels loop gang auto // { dg-error "'auto' conflicts" }
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
index 124befc..dcad07f 100644
--- a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
@@ -10,7 +10,7 @@
 #pragma acc loop seq
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++) {}
       }
 
@@ -20,7 +20,7 @@
 #pragma acc loop auto
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++)
 	  {
 #pragma acc loop vector
@@ -51,7 +51,7 @@
 #pragma acc loop vector
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	    for (int kx = 0; kx < 10; kx++) {}
 	  }
 
@@ -64,27 +64,27 @@
 
       }
     
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto
+#pragma acc loop auto independent
 	    for (int kx = 0; kx < 10; kx++) {}
 	  }
       }
 
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	    for (int kx = 0; kx < 10; kx++)
 	      {
-#pragma acc loop auto
+#pragma acc loop auto independent
 		for (int lx = 0; lx < 10; lx++) {}
 	      }
 	  }
@@ -101,7 +101,7 @@
 #pragma acc loop seq
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++) {}
       }
 
@@ -111,7 +111,7 @@
 #pragma acc loop auto
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++)
 	  {
 #pragma acc loop vector
@@ -142,7 +142,7 @@
 #pragma acc loop vector
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	    for (int kx = 0; kx < 10; kx++) {}
 	  }
 
@@ -176,7 +176,7 @@
 #pragma acc loop seq
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++) {}
       }
 
@@ -186,7 +186,7 @@
 #pragma acc loop auto
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++)
 	  {
 #pragma acc loop vector
@@ -194,20 +194,20 @@
 	  }
       }
 
-#pragma acc loop auto
+#pragma acc loop
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto
+#pragma acc loop
 	for (int jx = 0; jx < 10; jx++) {}
       }
 
-#pragma acc loop auto
+#pragma acc loop
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++)
 	  {
-#pragma acc loop auto
+#pragma acc loop
 	    for (int kx = 0; kx < 10; kx++) {}
 	  }
       }
@@ -222,17 +222,17 @@
 #pragma acc loop seq
 	for (int jx = 0; jx < 10; jx++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++) {}
       }
 
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int ix = 0; ix < 10; ix++) {}
 
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int jx = 0; jx < 10; jx++) {}
       }
 }
@@ -240,6 +240,6 @@
 #pragma acc routine seq
 void Seq (void)
 {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
     for (int ix = 0; ix < 10; ix++) {}
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c
index af3f0bd..5aa36e9 100644
--- a/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c
@@ -72,12 +72,12 @@
 #pragma acc loop tile(*) gang vector
     for (int ix = 0; ix < 10; ix++)
       {
-	#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	#pragma acc loop auto independent /* { dg-warning "insufficient partitioning" } */
 	for (int jx = 0; jx < 10; jx++)
 	  ;
       }
 
-#pragma acc loop tile(*) auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop tile(*) auto independent /* { dg-warning "insufficient partitioning" } */
     for (int ix = 0; ix < 10; ix++)
       {
 	#pragma acc loop worker
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-3.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-3.c
new file mode 100644
index 0000000..8f79ead
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-3.c
@@ -0,0 +1,78 @@
+/* Ensure that the auto clause falls back to seq parallelism when the
+   OpenACC loop is not explicitly independent.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+void
+test ()
+{
+  int i, j, k, l, n = 100;
+  
+#pragma acc parallel loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (i = 0; i < n; i++)
+#pragma acc loop auto independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    for (j = 0; j < n; j++)
+#pragma acc loop worker vector /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+      for (k = 0; k < n; k++)
+	;
+
+#pragma acc parallel loop auto independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  for (i = 0; i < n; i++)
+#pragma acc loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    for (j = 0; j < n; j++)
+#pragma acc loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (k = 0; k < n; k++)
+#pragma acc loop auto independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+	for (l = 0; l < n; l++)
+	  ;
+
+#pragma acc parallel loop gang /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  for (i = 0; i < n; i++)
+#pragma acc loop worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (j = 0; j < n; j++)
+#pragma acc loop vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (k = 0; k < n; k++)
+	{
+#pragma acc loop auto independent /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+	  /* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } */
+	  for (l = 0; l < n; l++)
+	    ;
+#pragma acc loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+	  for (l = 0; l < n; l++)
+	    ;
+	}
+
+#pragma acc parallel loop /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } */
+  for (i = 0; i < n; i++)
+    {
+#pragma acc loop gang worker /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+      for (j = 0; j < n; j++)
+#pragma acc loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+	for (k = 0; k < n; k++)
+	  {
+#pragma acc loop vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+	    for (l = 0; l < n; l++)
+	      ;
+#pragma acc loop auto independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+	    for (l = 0; l < n; l++)
+	      ;
+	  }
+#pragma acc loop worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+      for (j = 0; j < n; j++)
+#pragma acc loop vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+	for (k = 0; k < n; k++)
+	  ;
+    }
+
+#pragma acc parallel loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  for (i = 0; i < n; i++)
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (j = 0; j < n; j++)
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      /* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } */
+      for (k = 0; k < n; k++)
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+	  for (l = 0; l < n; l++)
+	    ;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-1.c b/gcc/testsuite/c-c++-common/goacc/mdc-1.c
new file mode 100644
index 0000000..6c6a81e
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/mdc-1.c
@@ -0,0 +1,55 @@
+/* Test OpenACC's support for manual deep copy, including the attach
+   and detach clauses.  */
+
+/* { dg-do compile { target int32 } } */
+/* { dg-additional-options "-fdump-tree-omplower" } */
+
+void
+t1 ()
+{
+  struct foo {
+    int *a, *b, c, d, *e;
+  } s;
+
+  int *a, *z;
+
+#pragma acc enter data copyin(s)
+  {
+#pragma acc data copy(s.a[0:10]) copy(z[0:10])
+    {
+      s.e = z;
+#pragma acc parallel loop attach(s.e)
+      for (int i = 0; i < 10; i++)
+        s.a[i] = s.e[i];
+
+
+      a = s.e;
+#pragma acc enter data attach(a)
+#pragma acc exit data detach(a)
+    }
+
+#pragma acc enter data copyin(a)
+#pragma acc acc enter data attach(s.e)
+#pragma acc exit data detach(s.e)
+
+#pragma acc data attach(s.e)
+    {
+    }
+#pragma acc exit data delete(a)
+
+#pragma acc exit data detach(a) finalize
+#pragma acc exit data detach(s.a) finalize
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:s .len: 32.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.tofrom:.z .len: 40.. map.struct:s .len: 1.. map.alloc:s.a .len: 8.. map.tofrom:._1 .len: 40.. map.attach:s.a .bias: 0.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_parallel map.attach:s.e .bias: 8.. map.tofrom:s .len: 32" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.attach:a .bias: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:a .bias: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.to:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.detach:s.e .bias: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_data map.attach:s.e .bias: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data map.release:a .len: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:a .bias: 8.." 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "pragma omp target oacc_enter_exit_data finalize map.force_detach:s.a .bias: 8.." 1 "omplower" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/mdc-2.c b/gcc/testsuite/c-c++-common/goacc/mdc-2.c
new file mode 100644
index 0000000..fae8667
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/mdc-2.c
@@ -0,0 +1,62 @@
+/* Test OpenACC's support for manual deep copy, including the attach
+   and detach clauses.  */
+
+void
+t1 ()
+{
+  struct foo {
+    int *a, *b, c, d, *e;
+  } s;
+
+  int *a, *z, scalar, **y;
+
+#pragma acc enter data copyin(s) detach(z) /* { dg-error ".detach. is not valid for" } */
+  {
+#pragma acc data copy(s.a[0:10]) copy(z[0:10])
+    {
+      s.e = z;
+#pragma acc parallel loop attach(s.e) detach(s.b) /* { dg-error ".detach. is not valid for" } */
+      for (int i = 0; i < 10; i++)
+        s.a[i] = s.e[i];
+
+      a = s.e;
+#pragma acc enter data attach(a) detach(s.c) /* { dg-error ".detach. is not valid for" } */
+#pragma acc exit data detach(a)
+    }
+
+#pragma acc enter data attach(z[:5]) /* { dg-error "expected single pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(z[:5]) /* { dg-error "expected single pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(z[1:]) /* { dg-error "expected single pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(z[1:]) /* { dg-error "expected single pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(z[:]) /* { dg-error "expected single pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(z[:]) /* { dg-error "expected single pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(z[3]) /* { dg-error "expected pointer in .attach. clause" } */
+#pragma acc exit data detach(z[3]) /* { dg-error "expected pointer in .detach. clause" } */
+
+#pragma acc acc enter data attach(s.e)
+#pragma acc exit data detach(s.e) attach(z) /* { dg-error ".attach. is not valid for" } */
+
+#pragma acc data attach(s.e)
+    {
+    }
+#pragma acc exit data delete(a) attach(s.a) /* { dg-error ".attach. is not valid for" } */
+
+#pragma acc enter data attach(scalar) /* { dg-error "expected pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(scalar) /* { dg-error "expected pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(s) /* { dg-error "expected pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(s) /* { dg-error "expected pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+  }
+
+#pragma acc enter data attach(y[10])
+#pragma acc exit data detach(y[10])
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/nested-reductions-fail.c b/gcc/testsuite/c-c++-common/goacc/nested-reductions-fail.c
new file mode 100644
index 0000000..a642dd0
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/nested-reductions-fail.c
@@ -0,0 +1,492 @@
+/* Test erroneous cases of nested reduction loops.  */
+
+void acc_parallel (void)
+{
+  int i, j, k, l, sum, diff;
+
+  #pragma acc parallel
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(-:diff) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(+:sum) // { dg-error "nested loop in reduction needs reduction clause for .diff." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but using a combined parallel loop construct.  */
+
+void acc_parallel_loop (void)
+{
+  int i, j, k, l, sum, diff;
+
+  #pragma acc parallel loop
+  for (int h = 0; h < 10; ++h)
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(-:diff) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(+:sum) // { dg-error "nested loop in reduction needs reduction clause for .diff." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but now the outermost reduction clause is on
+   the parallel region, not the outermost loop.  */
+void acc_parallel_reduction (void)
+{
+  int i, j, k, l, sum, diff;
+
+  #pragma acc parallel reduction(+:sum)
+  {
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(max:sum) // { dg-error "conflicting reduction operations for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(max:sum) // { dg-error "conflicting reduction operations for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(-:diff) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(-:diff) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(+:sum) // { dg-error "nested loop in reduction needs reduction clause for .diff." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but using a combined parallel loop construct, and
+   the outermost reduction clause is on that one, not the outermost loop.  */
+void acc_parallel_loop_reduction (void)
+{
+  int i, j, k, l, sum, diff;
+
+  #pragma acc parallel loop reduction(+:sum)
+  for (int h = 0; h < 10; ++h)
+  {
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(max:sum) // { dg-error "conflicting reduction operations for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(max:sum) // { dg-error "conflicting reduction operations for .sum." }
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(-:diff) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(-:diff) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(+:sum) // { dg-error "nested loop in reduction needs reduction clause for .diff." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but inside a routine construct.  */
+#pragma acc routine gang
+void acc_routine (void)
+{
+  int i, j, k, l, sum, diff;
+
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+          #pragma acc loop reduction(+:sum)
+          for (l = 0; l < 10; l++)
+            sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum) // { dg-error "conflicting reduction operations for .sum." }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+	  #pragma acc loop reduction(*:sum) // { dg-error "conflicting reduction operations for .sum." }
+	  for (l = 0; l < 10; l++)
+	    sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(-:diff) // { dg-error "nested loop in reduction needs reduction clause for .sum." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(+:sum) // { dg-error "nested loop in reduction needs reduction clause for .diff." }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+void acc_kernels (void)
+{
+  int i, j, k, sum, diff;
+
+  /* FIXME:  No diagnostics are produced for these loops because reductions
+     in kernels regions are not supported yet.  */
+  #pragma acc kernels
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:diff)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(-:sum)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+  }
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/nested-reductions.c b/gcc/testsuite/c-c++-common/goacc/nested-reductions.c
new file mode 100644
index 0000000..bff6652
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/nested-reductions.c
@@ -0,0 +1,420 @@
+/* Test cases of nested reduction loops that should compile cleanly.  */
+
+void acc_parallel (void)
+{
+  int i, j, k, sum, diff;
+
+  #pragma acc parallel
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop collapse(2) reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum)
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(-:diff)
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but using a combined parallel loop construct.  */
+
+void acc_parallel_loop (void)
+{
+  int i, j, k, l, sum, diff;
+
+  #pragma acc parallel loop
+  for (int h = 0; h < 10; ++h)
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop collapse(2) reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum) // { dg-warning "insufficient partitioning available to parallelize loop" }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum) // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(-:diff) // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but now the outermost reduction clause is on
+   the parallel region, not the outermost loop.  */
+
+void acc_parallel_reduction (void)
+{
+  int i, j, k, sum, diff;
+
+  #pragma acc parallel reduction(+:sum)
+  {
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    for (i = 0; i < 10; i++)
+      #pragma acc loop
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum)
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(-:diff)
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum)
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(-:diff)
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum)
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but using a combined parallel loop construct, and
+   the outermost reduction clause is on that one, not the outermost loop.  */
+void acc_parallel_loop_reduction (void)
+{
+  int i, j, k, sum, diff;
+
+  #pragma acc parallel loop reduction(+:sum)
+  for (int h = 0; h < 10; ++h)
+  {
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    for (i = 0; i < 10; i++)
+      #pragma acc loop
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum) // { dg-warning "insufficient partitioning available to parallelize loop" }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum) // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(-:diff) // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum) // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(-:diff) // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum) // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop // { dg-warning "insufficient partitioning available to parallelize loop" }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+/* The same tests as above, but inside a routine construct.  */
+#pragma acc routine gang
+void acc_routine (void) // { dg-bogus "region is gang partitioned but does not contain gang partitioned code" "TODO" { xfail *-*-* } }
+{
+  int i, j, k, sum, diff;
+
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop collapse(2) reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop collapse(2) reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum) // { dg-bogus "insufficient partitioning available to parallelize loop" "TODO" { xfail *-*-* } }
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum) reduction(-:diff)
+    for (i = 0; i < 10; i++)
+      {
+        #pragma acc loop reduction(+:sum) // { dg-bogus "insufficient partitioning available to parallelize loop" "TODO" { xfail *-*-* } }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(+:sum)
+          for (k = 0; k < 10; k++)
+            sum = 1;
+
+        #pragma acc loop reduction(-:diff) // { dg-bogus "insufficient partitioning available to parallelize loop" "TODO" { xfail *-*-* } }
+        for (j = 0; j < 10; j++)
+          #pragma acc loop reduction(-:diff)
+          for (k = 0; k < 10; k++)
+            diff = 1;
+      }
+  }
+}
+
+void acc_kernels (void)
+{
+  int i, j, k, sum, diff;
+
+  /* FIXME:  These tests are not meaningful yet because reductions in
+     kernels regions are not supported yet.  */
+  #pragma acc kernels
+  {
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+
+    #pragma acc loop reduction(+:sum)
+    for (i = 0; i < 10; i++)
+      #pragma acc loop reduction(+:sum)
+      for (j = 0; j < 10; j++)
+        #pragma acc loop reduction(+:sum)
+        for (k = 0; k < 10; k++)
+          sum = 1;
+  }
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-conditional-loop-independent_seq.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-conditional-loop-independent_seq.c
new file mode 100644
index 0000000..c21273a
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-conditional-loop-independent_seq.c
@@ -0,0 +1,129 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing conditionally executed 'loop' constructs with
+   'independent' or 'seq' clauses.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+extern int c;
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+  if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop seq
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent worker
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang worker
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent worker vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang worker vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent worker
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent vector
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop seq
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop seq
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop seq
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto.c
new file mode 100644
index 0000000..5268fd1
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto.c
@@ -0,0 +1,126 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing 'loop' constructs with explicit or implicit 'auto'
+   clause.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels
+ /* Strangely indented to keep this similar to other test cases.  */
+ {
+#pragma acc loop
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-independent_seq.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-independent_seq.c
new file mode 100644
index 0000000..dad1bdb
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-independent_seq.c
@@ -0,0 +1,126 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing 'loop' constructs with 'independent' or 'seq'
+   clauses.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels
+ /* Strangely indented to keep this similar to other test cases.  */
+ {
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang vector /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang worker /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent worker vector /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang worker vector /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent gang /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loops.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loops.c
new file mode 100644
index 0000000..336be88
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loops.c
@@ -0,0 +1,47 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing loops.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ {
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    ;
+
+  for (x = 0; x < 10; x++)
+    ;
+
+  for (x = 0; x < 10; x++)
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+	;
+
+  for (x = 0; x < 10; x++)
+    ;
+
+  for (x = 0; x < 10; x++)
+    for (y = 0; y < 10; y++)
+      ;
+
+  for (x = 0; x < 10; x++)
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+	;
+
+  for (x = 0; x < 10; x++)
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-straight-line.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-straight-line.c
new file mode 100644
index 0000000..09b58e2
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-straight-line.c
@@ -0,0 +1,82 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing straight-line code.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+#pragma acc routine gang
+extern int
+f_g (int);
+
+#pragma acc routine worker
+extern int
+f_w (int);
+
+#pragma acc routine vector
+extern int
+f_v (int);
+
+#pragma acc routine seq
+extern int
+f_s (int);
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels /* { dg-warning "region contains gang partitioned code but is not gang partitioned" } */
+  {
+    x = 0; /* { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+    y = x < 10;
+    z = x++;
+    ;
+
+    y = 0;
+    z = y < 10;
+    x -= f_g (y++); /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+    ;
+
+    x = f_w (0); /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+    z = f_v (x < 10); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    y -= f_s (x++); /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    ;
+
+    x = 0;
+    y = x < 10;
+    z = (x++);
+    y = 0;
+    x = y < 10;
+    z += (y++);
+    ;
+
+    x = 0;
+    y += f_s (x < 10); /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    x++;
+    y = 0;
+    y += f_v (y < 10); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    y++;
+    z = 0;
+    y += f_w (z < 10); /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+    z++;
+    ;
+
+    x = 0;
+    y *= f_g ( /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+	      f_w (x < 10) /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+	      + f_g (x < 10) /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+	      );
+    x++;
+    y = 0;
+    y *= y < 10;
+    y++;
+    z = 0;
+    y *= z < 10;
+    z++;
+    ;
+  }
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c
new file mode 100644
index 0000000..806ccc7
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c
@@ -0,0 +1,121 @@
+/* Test the output of "-fopt-info-optimized-omp" for combined OpenACC 'kernels
+   loop' constructs with explicit or implicit 'auto' clause.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto gang vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto gang worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto gang worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+	;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-independent_seq.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-independent_seq.c
new file mode 100644
index 0000000..b743636
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-independent_seq.c
@@ -0,0 +1,121 @@
+/* Test the output of "-fopt-info-optimized-omp" for combined OpenACC 'kernels
+   loop' constructs with 'independent' or 'seq' clauses.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent gang /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent gang vector /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent gang worker /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent worker vector /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent gang worker vector /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent gang /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop independent /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc kernels loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-conditional-loop-independent_seq.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-conditional-loop-independent_seq.c
new file mode 100644
index 0000000..21d31c4
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-conditional-loop-independent_seq.c
@@ -0,0 +1,204 @@
+/* Test the output of "-fopt-info-optimized-omp" for OpenACC 'kernels'
+   constructs containing conditionally executed 'loop' constructs with
+   'independent' or 'seq' clauses.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+extern int c;
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop seq
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent gang
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent worker
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent gang vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent gang worker
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent worker vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent gang worker vector
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent gang
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent worker
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent vector
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+      ;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop seq
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop seq
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop independent
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+ /* Strangely indented to keep this similar to other test cases.  */
+ if (c) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+ {
+#pragma acc loop seq
+  /* { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq
+      for (z = 0; z < 10; z++)
+	;
+ }
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loop-auto.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loop-auto.c
new file mode 100644
index 0000000..1fe1f29
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loop-auto.c
@@ -0,0 +1,138 @@
+/* Test the output of "-fopt-info-optimized-omp" for OpenACC 'kernels'
+   constructs containing 'loop' constructs with explicit or implicit 'auto'
+   clause.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels
+#pragma acc loop
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto gang vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto gang worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto gang worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc kernels
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop auto
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop
+  /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+	;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loop-independent_seq.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loop-independent_seq.c
new file mode 100644
index 0000000..6824d70
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loop-independent_seq.c
@@ -0,0 +1,138 @@
+/* Test the output of "-fopt-info-optimized-omp" for OpenACC 'kernels'
+   constructs containing 'loop' constructs with 'independent' or 'seq'
+   clauses.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent gang /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent gang vector /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent gang worker /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent worker vector /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent gang worker vector /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent gang /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent worker /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent vector /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc kernels
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc kernels
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  /* { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++)
+	;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops.c
new file mode 100644
index 0000000..365464b
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops.c
@@ -0,0 +1,50 @@
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing loops.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+	;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" } */
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+	;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c
index 735df7d..2b49a8b 100644
--- a/gcc/testsuite/c-c++-common/goacc/note-parallelism.c
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism.c
@@ -1,4 +1,5 @@
-/* Test the output of "-fopt-info-optimized-omp".  */
+/* Test the output of "-fopt-info-optimized-omp" for OpenACC 'parallel'
+   constructs.  */
 
 /* { dg-additional-options "-fopt-info-optimized-omp" } */
 
diff --git a/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c b/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c
new file mode 100644
index 0000000..b0bd4a7
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/orphan-reductions-1.c
@@ -0,0 +1,56 @@
+/* Test orphan reductions.  */
+
+#include <assert.h>
+
+#pragma acc routine seq
+int
+seq_reduction (int n)
+{
+  int i, sum = 0;
+#pragma acc loop seq reduction(+:sum)
+  for (i = 0; i < n; i++)
+    sum = sum + 1;
+
+  return sum;
+}
+
+#pragma acc routine gang
+int
+gang_reduction (int n)
+{
+  int i, s1 = 0, s2 = 0;
+#pragma acc loop gang reduction(+:s1) /* { dg-error "gang reduction on an orphan loop" } */
+  for (i = 0; i < n; i++)
+    s1 = s1 + 2;
+
+#pragma acc loop gang reduction(+:s2) /* { dg-error "gang reduction on an orphan loop" } */
+  for (i = 0; i < n; i++)
+    s2 = s2 + 2;
+
+
+  return s1 + s2;
+}
+
+#pragma acc routine worker
+int
+worker_reduction (int n)
+{
+  int i, sum = 0;
+#pragma acc loop worker reduction(+:sum)
+  for (i = 0; i < n; i++)
+    sum = sum + 3;
+
+  return sum;
+}
+
+#pragma acc routine vector
+int
+vector_reduction (int n)
+{
+  int i, sum = 0;
+#pragma acc loop vector reduction(+:sum)
+  for (i = 0; i < n; i++)
+    sum = sum + 4;
+
+  return sum;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c b/gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c
index 57f682f..6cdbebe 100644
--- a/gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c
@@ -3,9 +3,11 @@
 
 void f(int i)
 {
-#pragma acc kernels num_gangs(i) num_workers(i) vector_length(i)
+#pragma acc kernels \
+  num_gangs(i) num_workers(i) vector_length(i)
   ;
 
-#pragma acc parallel num_gangs(i) num_workers(i) vector_length(i)
+#pragma acc parallel /* { dg-bogus "region is (gang|worker|vector) partitioned" "" { xfail *-*-* } } */ \
+  num_gangs(i) num_workers(i) vector_length(i)
   ;
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/parallel-reduction.c b/gcc/testsuite/c-c++-common/goacc/parallel-reduction.c
index d7cc947..9a142c4 100644
--- a/gcc/testsuite/c-c++-common/goacc/parallel-reduction.c
+++ b/gcc/testsuite/c-c++-common/goacc/parallel-reduction.c
@@ -6,7 +6,7 @@
 
 #pragma acc data copy (dummy)
   {
-#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum)
+#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum) /* { dg-warning "gang partitioned" } */
     {
       int v = 5;
       sum += 10 + v;
diff --git a/gcc/testsuite/c-c++-common/goacc/pr70688.c b/gcc/testsuite/c-c++-common/goacc/pr70688.c
index 5a23665..3f5584a 100644
--- a/gcc/testsuite/c-c++-common/goacc/pr70688.c
+++ b/gcc/testsuite/c-c++-common/goacc/pr70688.c
@@ -21,7 +21,7 @@
 
 #pragma acc data copy (dummy)
   {
-#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum)
+#pragma acc parallel num_gangs (10) copy (sum) reduction (+:sum) /* { dg-warning "region is gang partitioned" } */
     {
       int v = 5;
       sum += 10 + v;
@@ -36,11 +36,11 @@
 {
   int i, s = 0;
 
-#pragma acc parallel num_gangs (10) copy (s) reduction (+:s)
+#pragma acc parallel num_gangs (10) copy (s) reduction (+:s) /* { dg-warning "region is gang partitioned" } */
   for (i = 0; i < n; i++)
     s += i+1;
 
-#pragma acc parallel num_gangs (10) reduction (+:s) copy (s)
+#pragma acc parallel num_gangs (10) reduction (+:s) copy (s) /* { dg-warning "region is gang partitioned" } */
   for (i = 0; i < n; i++)
     s += i+1;
 
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-6.c b/gcc/testsuite/c-c++-common/goacc/reduction-6.c
index 619f82b..3c10b4d 100644
--- a/gcc/testsuite/c-c++-common/goacc/reduction-6.c
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-6.c
@@ -18,17 +18,6 @@
 
   #pragma acc parallel
   {
-    #pragma acc loop reduction(+:b)
-    for (int i = 0; i < N; i++)
-      {
-        #pragma acc loop
-	for (int j = 0; j < N; j++)
-	  b += 1;
-      }
-  }
-
-  #pragma acc parallel
-  {
     #pragma acc loop reduction(+:c)
     for (int i = 0; i < N; i++)
       {
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-7.c b/gcc/testsuite/c-c++-common/goacc/reduction-7.c
new file mode 100644
index 0000000..eba1d02
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-7.c
@@ -0,0 +1,111 @@
+/* Exercise invalid reductions on array and struct members.  */
+
+void
+test_parallel ()
+{
+  struct {
+    int a;
+    float b[5];
+  } s1, s2[10];
+
+  int i;
+  double z[100];
+
+#pragma acc parallel reduction(+:s1.a) /* { dg-error "expected '\\\)' before '\\\.' token" } */
+  for (i = 0; i < 10; i++)
+    s1.a += 1;
+
+#pragma acc parallel reduction(+:s1.b[3]) /* { dg-error "expected '\\\)' before '\\\.' token" } */
+  for (i = 0; i < 10; i++)
+    s1.b[3] += 1;
+
+#pragma acc parallel reduction(+:s2[2].a) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    s2[2].a += 1;
+
+#pragma acc parallel reduction(+:s2[3].b[4]) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    s2[3].b[4] += 1;
+
+#pragma acc parallel reduction(+:z[5]) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    z[5] += 1;
+}
+
+void
+test_combined ()
+{
+  struct {
+    int a;
+    float b[5];
+  } s1, s2[10];
+
+  int i;
+  double z[100];
+
+#pragma acc parallel loop reduction(+:s1.a) /* { dg-error "expected '\\\)' before '\\\.' token" } */
+  for (i = 0; i < 10; i++)
+    s1.a += 1;
+
+#pragma acc parallel loop reduction(+:s1.b[3]) /* { dg-error "expected '\\\)' before '\\\.' token" } */
+  for (i = 0; i < 10; i++)
+    s1.b[3] += 1;
+
+#pragma acc parallel loop reduction(+:s2[2].a) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    s2[2].a += 1;
+
+#pragma acc parallel loop reduction(+:s2[3].b[4]) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    s2[3].b[4] += 1;
+
+#pragma acc parallel loop reduction(+:z[5]) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    z[5] += 1;
+
+}
+
+void
+test_loops ()
+{
+  struct {
+    int a;
+    float b[5];
+  } s1, s2[10];
+
+  int i;
+  double z[100];
+
+#pragma acc parallel
+  {
+#pragma acc loop reduction(+:s1.a) /* { dg-error "expected '\\\)' before '\\\.' token" } */
+  for (i = 0; i < 10; i++)
+    s1.a += 1;
+
+#pragma acc loop reduction(+:s1.b[3]) /* { dg-error "expected '\\\)' before '\\\.' token" } */
+  for (i = 0; i < 10; i++)
+    s1.b[3] += 1;
+
+#pragma acc loop reduction(+:s2[2].a) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    s2[2].a += 1;
+
+#pragma acc loop reduction(+:s2[3].b[4]) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    s2[3].b[4] += 1;
+
+#pragma acc loop reduction(+:z[5]) /* { dg-error "expected '\\\)' before '\\\[' token" } */
+  for (i = 0; i < 10; i++)
+    z[5] += 1;
+  }
+}
+
+int
+main ()
+{
+  test_parallel ();
+  test_combined ();
+  test_loops ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-8.c b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
new file mode 100644
index 0000000..8a0283f
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
@@ -0,0 +1,94 @@
+/* { dg-additional-options "-fdump-tree-gimple" } */
+
+#define n 1000
+
+int
+main(void)
+{
+  int i, j;
+  int result, array[n];
+
+#pragma acc parallel loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result ++;
+
+#pragma acc parallel
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result ++;
+
+#pragma acc parallel
+#pragma acc loop
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+#pragma acc parallel
+#pragma acc loop
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop worker vector reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+#pragma acc parallel
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop gang reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+#pragma acc parallel copy(result)
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop gang reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+  
+#pragma acc kernels
+#pragma acc loop
+  for (i = 0; i < n; i++)
+    {
+      result = i;
+
+#pragma acc loop reduction(+:result)
+      for (j = 0; j < n; j++)
+	result ++;
+
+      array[i] = result;
+    }
+
+  return 0;
+}
+
+/* Check that default copy maps are generated for loop reductions.  */
+/* { dg-final { scan-tree-dump-times "reduction..:result. map.tofrom:result .len: 4.." 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel map.tofrom:result .len: 4.." 2 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:array .len: 4000.. firstprivate.result." 3 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:result .len: 4.. map.tofrom:array .len: 4000.." 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:array .len: 4000.. map.force_tofrom:result .len: 4.." 1 "gimple" } } */
+
diff --git a/gcc/testsuite/c-c++-common/goacc/routine-1.c b/gcc/testsuite/c-c++-common/goacc/routine-1.c
index a756922..e32398d 100644
--- a/gcc/testsuite/c-c++-common/goacc/routine-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/routine-1.c
@@ -1,16 +1,17 @@
+/* Test valid use of clauses with routine.  */
 
 #pragma acc routine gang
-void gang (void)
+void gang (void) /* { dg-warning "partitioned" 3 } */
 {
 }
 
 #pragma acc routine worker
-void worker (void)
+void worker (void) /* { dg-warning "partitioned" 2 } */
 {
 }
 
 #pragma acc routine vector
-void vector (void)
+void vector (void) /* { dg-warning "partitioned" 1 } */
 {
 }
 
@@ -19,14 +20,20 @@
 {
 }
 
+#pragma acc routine nohost
+void nohost (void)
+{
+}
+
 int main ()
 {
-#pragma acc kernels num_gangs (32) num_workers (32) vector_length (32)
+#pragma acc kernels num_gangs (32) num_workers (32) vector_length (32) /* { dg-warning "region contains gang partitioned code but is not gang partitioned" } */
   {
     gang ();
     worker ();
     vector ();
     seq ();
+    nohost ();
   }
 
 #pragma acc parallel num_gangs (32) num_workers (32) vector_length (32)
@@ -35,6 +42,7 @@
     worker ();
     vector ();
     seq ();
+    nohost ();
   }
 
   return 0;
diff --git a/gcc/testsuite/c-c++-common/goacc/routine-2.c b/gcc/testsuite/c-c++-common/goacc/routine-2.c
index fc5eb11..d1ea61e 100644
--- a/gcc/testsuite/c-c++-common/goacc/routine-2.c
+++ b/gcc/testsuite/c-c++-common/goacc/routine-2.c
@@ -1,19 +1,19 @@
-#pragma acc routine gang worker /* { dg-error "multiple loop axes" } */
+#pragma acc routine gang worker /* { dg-error "conflicting level" } */
 void gang (void)
 {
 }
 
-#pragma acc routine worker vector /* { dg-error "multiple loop axes" } */
+#pragma acc routine worker vector /* { dg-error "conflicting level" } */
 void worker (void)
 {
 }
 
-#pragma acc routine vector seq /* { dg-error "multiple loop axes" } */
+#pragma acc routine vector seq /* { dg-error "conflicting level" } */
 void vector (void)
 {
 }
 
-#pragma acc routine seq gang /* { dg-error "multiple loop axes" } */
+#pragma acc routine seq gang /* { dg-error "conflicting level" } */
 void seq (void)
 {
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c b/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c
index ec21db1..c23ddcf 100644
--- a/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c
+++ b/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c
@@ -32,7 +32,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red) // { dg-error "disallowed by containing routine" }
+#pragma acc loop gang reduction (+:red) // { dg-error "gang reduction on an orphan loop" }
   for (int i = 0; i < 10; i++)
     red ++;
 
@@ -58,7 +58,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red) // { dg-error "disallowed by containing routine" }
+#pragma acc loop gang reduction (+:red) // { dg-error "gang reduction on an orphan loop" }
   for (int i = 0; i < 10; i++)
     red ++;
 
@@ -84,7 +84,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red) // { dg-error "disallowed by containing routine" }
+#pragma acc loop gang reduction (+:red) // { dg-error "gang reduction on an orphan loop" }
   for (int i = 0; i < 10; i++)
     red ++;
 
@@ -110,7 +110,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red)
+#pragma acc loop gang reduction (+:red) /* { dg-error "gang reduction on an orphan loop" } */
   for (int i = 0; i < 10; i++)
     red ++;
 
diff --git a/gcc/testsuite/c-c++-common/goacc/routine-4.c b/gcc/testsuite/c-c++-common/goacc/routine-4.c
index 5f2194c..ad17371 100644
--- a/gcc/testsuite/c-c++-common/goacc/routine-4.c
+++ b/gcc/testsuite/c-c++-common/goacc/routine-4.c
@@ -23,7 +23,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red) // { dg-error "disallowed by containing routine" }
+#pragma acc loop seq reduction (+:red)
   for (int i = 0; i < 10; i++)
     red ++;
 
@@ -49,7 +49,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red) // { dg-error "disallowed by containing routine" }
+#pragma acc loop seq reduction (+:red)
   for (int i = 0; i < 10; i++)
     red ++;
 
@@ -75,7 +75,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red) // { dg-error "disallowed by containing routine" }
+#pragma acc loop seq reduction (+:red)
   for (int i = 0; i < 10; i++)
     red ++;
 
@@ -101,7 +101,7 @@
   for (int i = 0; i < 10; i++)
     red ++;
 
-#pragma acc loop gang reduction (+:red)
+#pragma acc loop seq reduction (+:red)
   for (int i = 0; i < 10; i++)
     red ++;
 
diff --git a/gcc/testsuite/c-c++-common/goacc/routine-level-of-parallelism-1.c b/gcc/testsuite/c-c++-common/goacc/routine-level-of-parallelism-1.c
new file mode 100644
index 0000000..d71d6e0
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/routine-level-of-parallelism-1.c
@@ -0,0 +1,265 @@
+/* Test various aspects of clauses specifying incompatible levels of
+   parallelism with the OpenACC routine directive.  The Fortran counterpart is
+   ../../gfortran.dg/goacc/routine-level-of-parallelism-1.f.  */
+
+extern void g_1 (void);
+#pragma acc routine (g_1) gang gang /* { dg-error "too many 'gang' clauses" } */
+
+#pragma acc routine worker worker /* { dg-error "too many 'worker' clauses" } */
+void w_1 (void)
+{
+}
+
+#pragma acc routine vector vector /* { dg-error "too many 'vector' clauses" } */
+void v_1 (void)
+{
+}
+
+#pragma acc routine seq seq /* { dg-error "too many 'seq' clauses" } */
+extern void s_1 (void);
+
+
+#pragma acc routine gang gang gang /* { dg-error "too many 'gang' clauses" } */
+void g_2 (void)
+{
+}
+
+#pragma acc routine worker worker worker /* { dg-error "too many 'worker' clauses" } */
+extern void w_2 (void);
+
+extern void v_2 (void);
+#pragma acc routine (v_2) vector vector vector /* { dg-error "too many 'vector' clauses" } */
+
+#pragma acc routine seq seq seq /* { dg-error "too many 'seq' clauses" } */
+void s_2 (void)
+{
+}
+
+
+#pragma acc routine \
+  gang \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */
+void g_3 (void)
+{
+}
+#pragma acc routine (g_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*g_3." } */ \
+  gang \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */
+#pragma acc routine (g_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*g_3." } */ \
+  gang \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */
+
+extern void w_3 (void);
+#pragma acc routine (w_3) \
+  worker \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */
+#pragma acc routine (w_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*w_3." } */ \
+  worker \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+#pragma acc routine (w_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*w_3." } */ \
+  worker \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */
+
+#pragma acc routine \
+  vector \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */
+void v_3 (void)
+{
+}
+#pragma acc routine (v_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*v_3." } */ \
+  vector \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */
+#pragma acc routine (v_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*v_3." } */ \
+  vector \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+
+extern void s_3 (void);
+#pragma acc routine (s_3) \
+  seq \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+#pragma acc routine (s_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*s_3." } */ \
+  seq \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */
+#pragma acc routine (s_3) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*s_3." } */ \
+  seq \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */
+
+
+#pragma acc routine \
+  gang gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */
+extern void g_4 (void);
+#pragma acc routine (g_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*g_4." } */ \
+  gang gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */
+#pragma acc routine (g_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*g_4." } */ \
+  gang gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */
+
+extern void w_4 (void);
+#pragma acc routine (w_4) \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+#pragma acc routine (w_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*w_4." } */ \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+#pragma acc routine (w_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*w_4." } */ \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */
+
+#pragma acc routine \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+void v_4 (void)
+{
+}
+#pragma acc routine (v_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*v_4." } */ \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+#pragma acc routine (v_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*v_4." } */ \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */ \
+  seq /* { dg-error ".seq. specifies a conflicting level of parallelism" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */
+
+#pragma acc routine \
+  seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+void s_4 (void)
+{
+}
+#pragma acc routine (s_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*s_4." } */ \
+  seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */
+#pragma acc routine (s_4) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*s_4." } */ \
+  seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  worker /* { dg-error ".worker. specifies a conflicting level of parallelism" } */ \
+  vector /* { dg-error ".vector. specifies a conflicting level of parallelism" } */ \
+  gang /* { dg-error ".gang. specifies a conflicting level of parallelism" } */
+
+
+#pragma acc routine \
+  gang gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  seq seq seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+void g_5 (void)
+{
+}
+#pragma acc routine (g_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*g_5." } */ \
+  gang gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  vector vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  seq seq seq seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+#pragma acc routine (g_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*g_5." } */ \
+  gang gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  seq seq seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  vector vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  worker worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+
+#pragma acc routine \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  seq seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  vector vector vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+extern void w_5 (void);
+#pragma acc routine (w_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*w_5." } */ \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+#pragma acc routine (w_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*w_5." } */ \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  vector vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+
+#pragma acc routine \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  seq seq seq seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+extern void v_5 (void);
+#pragma acc routine (v_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*v_5." } */ \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  seq seq seq seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+#pragma acc routine (v_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*v_5." } */ \
+  vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  seq seq seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  /* { dg-error ".seq. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+
+extern void s_5 (void);
+#pragma acc routine (s_5) \
+  seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  vector vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  worker worker worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+#pragma acc routine (s_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*s_5." } */ \
+  seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  vector vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  worker worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
+#pragma acc routine (s_5) /* { dg-error ".#pragma acc routine. already applied to .\[void \]*s_5." } */ \
+  seq seq seq /* { dg-error "too many 'seq' clauses" } */ \
+  worker worker /* { dg-error "too many 'worker' clauses" } */ \
+  /* { dg-error ".worker. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  vector vector vector vector /* { dg-error "too many 'vector' clauses" } */ \
+  /* { dg-error ".vector. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */ \
+  gang gang /* { dg-error "too many 'gang' clauses" } */ \
+  /* { dg-error ".gang. specifies a conflicting level of parallelism" "" { target *-*-* } .-1 } */
diff --git a/gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c b/gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c
new file mode 100644
index 0000000..9baa56c
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c
@@ -0,0 +1,28 @@
+/* Test the nohost clause for OpenACC routine directive.  Exercising different
+   variants for declaring routines.  */
+
+/* { dg-additional-options "-fdump-tree-oaccdevlow" } */
+
+#pragma acc routine nohost
+int THREE(void)
+{
+  return 3;
+}
+
+#pragma acc routine nohost
+extern void NOTHING(void);
+
+void NOTHING(void)
+{
+}
+
+extern float ADD(float, float);
+
+#pragma acc routine (ADD) nohost
+
+float ADD(float x, float y)
+{
+  return x + y;
+}
+
+/* { dg-final { scan-tree-dump-times "Discarding function" 3 "oaccdevlow" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/serial-dims.c b/gcc/testsuite/c-c++-common/goacc/serial-dims.c
new file mode 100644
index 0000000..41698d2
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/serial-dims.c
@@ -0,0 +1,12 @@
+/* Invalid use of OpenACC parallelism dimensions clauses: num_gangs,
+   num_workers, vector_length with the serial construct.  */
+
+void f(void)
+{
+#pragma acc serial num_gangs (1) /* { dg-error "'num_gangs' is not valid for '#pragma acc serial'" } */
+  ;
+#pragma acc serial num_workers (1) /* { dg-error "'num_workers' is not valid for '#pragma acc serial'" } */
+  ;
+#pragma acc serial vector_length (1) /* { dg-error "'vector_length' is not valid for '#pragma acc serial'" } */
+  ;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c b/gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c
index 9f11196..6bc35d7 100644
--- a/gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c
+++ b/gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c
@@ -4,26 +4,29 @@
 {
   int i, j, k;
 
-  #pragma acc parallel num_gangs(i) /* { dg-warning "is used uninitialized in this function" } */
-  ;
+  #pragma acc parallel loop gang num_gangs(i) /* { dg-warning "is used uninitialized in this function" } */
+  for (i = 0; i < 1; i++)
+    ;
 
-  #pragma acc parallel num_workers(j) /* { dg-warning "is used uninitialized in this function" } */
-  ;
+  #pragma acc parallel loop worker num_workers(j) /* { dg-warning "is used uninitialized in this function" } */
+  for (j = 0; j < 1; j++)
+    ;
 
-  #pragma acc parallel vector_length(k) /* { dg-warning "is used uninitialized in this function" } */
-  ;
+  #pragma acc parallel loop vector vector_length(k) /* { dg-warning "is used uninitialized in this function" } */
+  for (k = 0; k < 1; k++)
+    ;
 }
 
 void acc_kernels()
 {
   int i, j, k;
 
-  #pragma acc kernels num_gangs(i) /* { dg-warning "is used uninitialized in this function" } */
+  #pragma acc kernels num_gangs(i) /* { dg-warning "is used uninitialized in this function" "TODO" { xfail *-*-* } } */
   ;
 
-  #pragma acc kernels num_workers(j) /* { dg-warning "is used uninitialized in this function" } */
+  #pragma acc kernels num_workers(j) /* { dg-warning "is used uninitialized in this function" "TODO" { xfail *-*-* } } */
   ;
 
-  #pragma acc kernels vector_length(k) /* { dg-warning "is used uninitialized in this function" } */
+  #pragma acc kernels vector_length(k) /* { dg-warning "is used uninitialized in this function" "TODO" { xfail *-*-* } } */
   ;
 }
diff --git a/gcc/testsuite/g++.dg/goacc/loop-1.c b/gcc/testsuite/g++.dg/goacc/loop-1.c
new file mode 100644
index 0000000..51b20b0
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/loop-1.c
@@ -0,0 +1,23 @@
+void
+f (int i, float j, int k)
+{
+#pragma acc parallel num_gangs (i) num_workers (i) vector_length (i)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (j) /* { dg-error "'num_gangs' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_workers (j) /* { dg-error "'num_workers' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel vector_length (j) /* { dg-error "'vector_length' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+}
diff --git a/gcc/testsuite/g++.dg/goacc/loop-2.c b/gcc/testsuite/g++.dg/goacc/loop-2.c
new file mode 100644
index 0000000..ddfb480
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/loop-2.c
@@ -0,0 +1,70 @@
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (num: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, *) /* { dg-error "duplicate operand to clause" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, static: *) /* { dg-error "duplicate 'num' argument" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (static: 234) /* { dg-error "expected 'num' before" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234, num: 12) /* { dg-error "duplicate operand to clause" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop vector /* { dg-error "gang, worker and vector must occur in this order in a loop nest" } */
+  for (i = 0; i < 20; ++i)
+#pragma acc loop worker
+    for (j = 0; j < 25; ++j)
+      ;
+
+#pragma acc kernels
+#pragma acc loop worker (length: 20) /* { dg-error "expected 'num' before 'length'" } */
+  for (i = 0; i < 20; ++i)
+#pragma acc loop vector (length: 10)
+    for (j = 0; j < 25; ++j)
+      ;
+
+#pragma acc kernels
+#pragma acc loop worker
+  for (i = 0; i < 20; ++i)
+#pragma acc loop vector
+    for (j = 0; j < 25; ++j)
+      ;
+}
diff --git a/gcc/testsuite/g++.dg/goacc/loop-3.c b/gcc/testsuite/g++.dg/goacc/loop-3.c
new file mode 100644
index 0000000..c43b4f3
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/loop-3.c
@@ -0,0 +1,43 @@
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels num_gangs (10) /* { dg-error "'num_gangs' is not valid" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels num_workers (10) /* { dg-error "'num_workers' is not valid" } */
+#pragma acc loop worker
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels vector_length (10) /* { dg-error "'vector_length' is not valid" } */
+#pragma acc loop vector
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (10) num_workers (20) vector_length (32)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (i) num_workers (j) vector_length (k)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_workers (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel vector_length (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+}
diff --git a/gcc/testsuite/g++.dg/goacc/mdc.C b/gcc/testsuite/g++.dg/goacc/mdc.C
new file mode 100644
index 0000000..b3abab3
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/mdc.C
@@ -0,0 +1,68 @@
+/* Test OpenACC's support for manual deep copy, including the attach
+   and detach clauses.  */
+
+void
+t1 ()
+{
+  struct foo {
+    int *a, *b, c, d, *e;
+  } s;
+
+  struct foo& rs = s;
+  
+  int *a, *z, scalar, **y;
+  int* const &ra = a;
+  int* const &rz = z;
+  int& rscalar = scalar;
+  int** const &ry = y;
+
+#pragma acc enter data copyin(rs) detach(rz) /* { dg-error ".detach. is not valid for" } */
+  {
+#pragma acc data copy(rs.a[0:10]) copy(rz[0:10])
+    {
+      s.e = z;
+#pragma acc parallel loop attach(rs.e) detach(rs.b) /* { dg-error ".detach. is not valid for" } */
+      for (int i = 0; i < 10; i++)
+        s.a[i] = s.e[i];
+
+      a = s.e;
+#pragma acc enter data attach(ra) detach(rs.c) /* { dg-error ".detach. is not valid for" } */
+#pragma acc exit data detach(ra)
+    }
+
+#pragma acc enter data attach(rz[:5]) /* { dg-error "expected single pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(rz[:5]) /* { dg-error "expected single pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(rz[1:]) /* { dg-error "expected single pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(rz[1:]) /* { dg-error "expected single pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(rz[:]) /* { dg-error "expected single pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(rz[:]) /* { dg-error "expected single pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(rz[3]) /* { dg-error "expected pointer in .attach. clause" } */
+#pragma acc exit data detach(rz[3]) /* { dg-error "expected pointer in .detach. clause" } */
+
+#pragma acc acc enter data attach(rs.e)
+#pragma acc exit data detach(rs.e) attach(rz) /* { dg-error ".attach. is not valid for" } */
+
+#pragma acc data attach(rs.e)
+    {
+    }
+#pragma acc exit data delete(ra) attach(rs.a) /* { dg-error ".attach. is not valid for" } */
+
+#pragma acc enter data attach(rscalar) /* { dg-error "expected pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(rscalar) /* { dg-error "expected pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc enter data attach(rs) /* { dg-error "expected pointer in .attach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+#pragma acc exit data detach(rs) /* { dg-error "expected pointer in .detach. clause" } */
+/* { dg-error "has no data movement clause" "" { target *-*-* } .-1 } */
+  }
+
+#pragma acc enter data attach(ry[10])
+#pragma acc exit data detach(ry[10])
+}
diff --git a/gcc/testsuite/g++.dg/goacc/reductions-1.C b/gcc/testsuite/g++.dg/goacc/reductions-1.C
new file mode 100644
index 0000000..18f43f4
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/reductions-1.C
@@ -0,0 +1,548 @@
+// Test for invalid reduction variables.
+
+class C1
+{
+  int b, d[10];
+
+public:
+  int a, c[10];
+
+  C1 () { a = 0; b = 0; }
+  int& get_b () { return b; }
+  int* get_d () { return d; }
+};
+
+template <typename T>
+class C2
+{
+  T b, d[10];
+
+public:
+  T a, c[10];
+
+  C2 () { a = 0; b = 0; }
+  T& get_b () { return b; }
+  T* get_d () { return d; }
+};
+
+struct S1
+{
+  int a, b, c[10], d[10];
+
+  S1 () { a = 0; b = 0; }
+  int& get_b () { return b; }
+  int* get_d () { return d; }
+};
+
+template <typename T>
+struct S2
+{
+  T a, b, c[10], d[10];
+
+  S2 () { a = 0; b = 0; }
+  T& get_b () { return b; }
+  T* get_d () { return d; }
+};
+
+template <typename T>
+void
+test_parallel ()
+{
+  int i, a[10];
+  T b[10];
+  C1 c1, c1a[10];
+  C2<T> c2, c2a[10];
+  S1 s1, s1a[10];
+  S2<float> s2, s2a[10];
+
+  // Reductions on class members.
+
+#pragma acc parallel reduction(+:c1.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.a += 1;
+
+#pragma acc parallel reduction(+:c1.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.get_b () += 1;
+
+#pragma acc parallel reduction(+:c1.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.c[1] += 1;
+
+#pragma acc parallel reduction(+:c1.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.get_d ()[1] += 1;
+
+#pragma acc parallel reduction(+:c1a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].a += 1;
+
+#pragma acc parallel reduction(+:c1a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].get_b () += 1;
+
+#pragma acc parallel reduction(+:c1a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].c[1] += 1;
+
+#pragma acc parallel reduction(+:c1a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].get_d ()[1] += 1;
+
+
+  // Reductions on a template class member.
+
+#pragma acc parallel reduction(+:c2.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.a += 1;
+
+#pragma acc parallel reduction(+:c2.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.get_b () += 1;
+
+#pragma acc parallel reduction(+:c2.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.c[1] += 1;
+
+#pragma acc parallel reduction(+:c2.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.get_d ()[1] += 1;
+
+
+#pragma acc parallel reduction(+:c2a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].a += 1;
+
+#pragma acc parallel reduction(+:c2a[1].get_b ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].get_b () += 1;
+
+#pragma acc parallel reduction(+:c2a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].c[1] += 1;
+
+#pragma acc parallel reduction(+:c2a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].get_d ()[1] += 1;
+
+
+  // Reductions on struct element.
+
+#pragma acc parallel reduction(+:s1.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.a += 1;
+
+#pragma acc parallel reduction(+:s1.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.get_b () += 1;
+
+#pragma acc parallel reduction(+:s1.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.c[1] += 1;
+
+#pragma acc parallel reduction(+:s1.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.get_d ()[1] += 1;
+
+#pragma acc parallel reduction(+:s1a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].a += 1;
+
+#pragma acc parallel reduction(+:s1a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].get_b () += 1;
+
+#pragma acc parallel reduction(+:s1a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].c[1] += 1;
+
+#pragma acc parallel reduction(+:s1a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].get_d ()[1] += 1;
+
+
+  // Reductions on a template struct element.
+
+#pragma acc parallel reduction(+:s2.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.a += 1;
+
+#pragma acc parallel reduction(+:s2.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.get_b () += 1;
+
+#pragma acc parallel reduction(+:s2.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.c[1] += 1;
+
+#pragma acc parallel reduction(+:s2.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.get_d ()[1] += 1;
+
+#pragma acc parallel reduction(+:s2a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].a += 1;
+
+#pragma acc parallel reduction(+:s2a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].get_b () += 1;
+
+#pragma acc parallel reduction(+:s2a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].c[1] += 1;
+
+#pragma acc parallel reduction(+:s2a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].get_d ()[1] += 1;
+
+
+  // Reductions on arrays.
+
+#pragma acc parallel reduction(+:a[10]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    a[10] += 1;
+
+#pragma acc parallel reduction(+:b[10]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    b[10] += 1;
+}
+
+template <typename T>
+void
+test_combined ()
+{
+  int i, a[10];
+  T b[10];
+  C1 c1, c1a[10];
+  C2<T> c2, c2a[10];
+  S1 s1, s1a[10];
+  S2<float> s2, s2a[10];
+
+  // Reductions on class members.
+
+#pragma acc parallel loop reduction(+:c1.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.a += 1;
+
+#pragma acc parallel loop reduction(+:c1.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.get_b () += 1;
+
+#pragma acc parallel loop reduction(+:c1.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.c[1] += 1;
+
+#pragma acc parallel loop reduction(+:c1.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c1.get_d ()[1] += 1;
+
+#pragma acc parallel loop reduction(+:c1a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].a += 1;
+
+#pragma acc parallel loop reduction(+:c1a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].get_b () += 1;
+
+#pragma acc parallel loop reduction(+:c1a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].c[1] += 1;
+
+#pragma acc parallel loop reduction(+:c1a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c1a[1].get_d ()[1] += 1;
+
+
+  // Reductions on a template class member.
+
+#pragma acc parallel loop reduction(+:c2.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.a += 1;
+
+#pragma acc parallel loop reduction(+:c2.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.get_b () += 1;
+
+#pragma acc parallel loop reduction(+:c2.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.c[1] += 1;
+
+#pragma acc parallel loop reduction(+:c2.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    c2.get_d ()[1] += 1;
+
+
+#pragma acc parallel loop reduction(+:c2a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].a += 1;
+
+#pragma acc parallel loop reduction(+:c2a[1].get_b ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].get_b () += 1;
+
+#pragma acc parallel loop reduction(+:c2a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].c[1] += 1;
+
+#pragma acc parallel loop reduction(+:c2a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    c2a[1].get_d ()[1] += 1;
+
+
+  // Reductions on struct element.
+
+#pragma acc parallel loop reduction(+:s1.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.a += 1;
+
+#pragma acc parallel loop reduction(+:s1.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.get_b () += 1;
+
+#pragma acc parallel loop reduction(+:s1.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.c[1] += 1;
+
+#pragma acc parallel loop reduction(+:s1.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s1.get_d ()[1] += 1;
+
+#pragma acc parallel loop reduction(+:s1a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].a += 1;
+
+#pragma acc parallel loop reduction(+:s1a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].get_b () += 1;
+
+#pragma acc parallel loop reduction(+:s1a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].c[1] += 1;
+
+#pragma acc parallel loop reduction(+:s1a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s1a[1].get_d ()[1] += 1;
+
+
+  // Reductions on a template struct element.
+
+#pragma acc parallel loop reduction(+:s2.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.a += 1;
+
+#pragma acc parallel loop reduction(+:s2.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.get_b () += 1;
+
+#pragma acc parallel loop reduction(+:s2.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.c[1] += 1;
+
+#pragma acc parallel loop reduction(+:s2.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+  for (i = 0; i < 100; i++)
+    s2.get_d ()[1] += 1;
+
+#pragma acc parallel loop reduction(+:s2a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].a += 1;
+
+#pragma acc parallel loop reduction(+:s2a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].get_b () += 1;
+
+#pragma acc parallel loop reduction(+:s2a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].c[1] += 1;
+
+#pragma acc parallel loop reduction(+:s2a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    s2a[1].get_d ()[1] += 1;
+
+
+  // Reductions on arrays.
+
+#pragma acc parallel loop reduction(+:a[10]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    a[10] += 1;
+
+#pragma acc parallel loop reduction(+:b[10]) // { dg-error "expected '\\\)' before '\\\[' token" }
+  for (i = 0; i < 100; i++)
+    b[10] += 1;
+}
+
+template <typename T>
+void
+test_loop ()
+{
+  int i, a[10];
+  T b[10];
+  C1 c1, c1a[10];
+  C2<T> c2, c2a[10];
+  S1 s1, s1a[10];
+  S2<float> s2, s2a[10];
+
+  // Reductions on class members.
+
+  #pragma acc parallel
+  {
+
+#pragma acc loop reduction(+:c1.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c1.a += 1;
+
+#pragma acc loop reduction(+:c1.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c1.get_b () += 1;
+
+#pragma acc loop reduction(+:c1.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c1.c[1] += 1;
+
+#pragma acc loop reduction(+:c1.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c1.get_d ()[1] += 1;
+
+#pragma acc loop reduction(+:c1a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c1a[1].a += 1;
+
+#pragma acc loop reduction(+:c1a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c1a[1].get_b () += 1;
+
+#pragma acc loop reduction(+:c1a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c1a[1].c[1] += 1;
+
+#pragma acc loop reduction(+:c1a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c1a[1].get_d ()[1] += 1;
+
+
+    // Reductions on a template class member.
+
+#pragma acc loop reduction(+:c2.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c2.a += 1;
+
+#pragma acc loop reduction(+:c2.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c2.get_b () += 1;
+
+#pragma acc loop reduction(+:c2.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c2.c[1] += 1;
+
+#pragma acc loop reduction(+:c2.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      c2.get_d ()[1] += 1;
+
+
+#pragma acc loop reduction(+:c2a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c2a[1].a += 1;
+
+#pragma acc loop reduction(+:c2a[1].get_b ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c2a[1].get_b () += 1;
+
+#pragma acc loop reduction(+:c2a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c2a[1].c[1] += 1;
+
+#pragma acc loop reduction(+:c2a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      c2a[1].get_d ()[1] += 1;
+
+
+    // Reductions on struct element.
+
+#pragma acc loop reduction(+:s1.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s1.a += 1;
+
+#pragma acc loop reduction(+:s1.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s1.get_b () += 1;
+
+#pragma acc loop reduction(+:s1.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s1.c[1] += 1;
+
+#pragma acc loop reduction(+:s1.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s1.get_d ()[1] += 1;
+
+#pragma acc loop reduction(+:s1a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s1a[1].a += 1;
+
+#pragma acc loop reduction(+:s1a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s1a[1].get_b () += 1;
+
+#pragma acc loop reduction(+:s1a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s1a[1].c[1] += 1;
+
+#pragma acc loop reduction(+:s1a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s1a[1].get_d ()[1] += 1;
+
+
+    // Reductions on a template struct element.
+
+#pragma acc loop reduction(+:s2.a) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s2.a += 1;
+
+#pragma acc loop reduction(+:s2.get_b ()) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s2.get_b () += 1;
+
+#pragma acc loop reduction(+:s2.c[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s2.c[1] += 1;
+
+#pragma acc loop reduction(+:s2.get_d ()[1]) // { dg-error "expected '\\\)' before '\\\.' token" }
+    for (i = 0; i < 100; i++)
+      s2.get_d ()[1] += 1;
+
+#pragma acc loop reduction(+:s2a[1].a) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s2a[1].a += 1;
+
+#pragma acc loop reduction(+:s2a[1].get_b ()) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s2a[1].get_b () += 1;
+
+#pragma acc loop reduction(+:s2a[1].c[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s2a[1].c[1] += 1;
+
+#pragma acc loop reduction(+:s2a[1].get_d ()[1]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      s2a[1].get_d ()[1] += 1;
+
+
+    // Reductions on arrays.
+
+#pragma acc loop reduction(+:a[10]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      a[10] += 1;
+
+#pragma acc loop reduction(+:b[10]) // { dg-error "expected '\\\)' before '\\\[' token" }
+    for (i = 0; i < 100; i++)
+      b[10] += 1;
+  }
+}
+
+int
+main ()
+{
+  test_parallel<double> ();
+  test_combined<long> ();
+  test_loop<short> ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/g++.dg/goacc/routine-2.C b/gcc/testsuite/g++.dg/goacc/routine-2.C
index ea7c9bf..c824933 100644
--- a/gcc/testsuite/g++.dg/goacc/routine-2.C
+++ b/gcc/testsuite/g++.dg/goacc/routine-2.C
@@ -2,15 +2,8 @@
 
 template <typename T>
 extern T one_d();
-#pragma acc routine (one_d) /* { dg-error "names a set of overloads" } */
+#pragma acc routine (one_d) nohost /* { dg-error "names a set of overloads" } */
 
-template <typename T>
-T
-one()
-{
-  return 1;
-}
-#pragma acc routine (one) /* { dg-error "names a set of overloads" } */
 
 int incr (int);
 float incr (float);
diff --git a/gcc/testsuite/g++.dg/gomp/pr91110.C b/gcc/testsuite/g++.dg/gomp/pr91110.C
new file mode 100644
index 0000000..332c99a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/gomp/pr91110.C
@@ -0,0 +1,11 @@
+// PR c++/91110
+// { dg-do compile }
+
+void
+foo ()
+{
+  X b[2];	// { dg-error "'X' was not declared in this scope" }
+  b[0] = 1;	// { dg-error "'b' was not declared in this scope" }
+  #pragma omp target map(to: b)	// { dg-error "'b' does not have a mappable type in 'map' clause" }
+  ;
+}
diff --git a/gcc/testsuite/g++.dg/gomp/unmappable-1.C b/gcc/testsuite/g++.dg/gomp/unmappable-1.C
new file mode 100644
index 0000000..d00ccb5
--- /dev/null
+++ b/gcc/testsuite/g++.dg/gomp/unmappable-1.C
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-fopenmp" } */
+
+class C /* { dg-message "type .C. with virtual members is not mappable" } */
+{
+public:
+  static int static_member; /* { dg-message "static field .C::static_member. is not mappable" } */
+  virtual void f() {}
+};
+
+extern C v[];
+
+int
+main ()
+{
+#pragma omp target map(v) /* { dg-error ".v. does not have a mappable type in .map. clause" } */
+  /* { dg-message "incomplete type .C \\\[\\\]. is not mappable" "" { target *-*-* } .-1 } */
+  {
+  }
+}
diff --git a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
index bd4c07e..296b61d 100644
--- a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
+++ b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
@@ -9,10 +9,11 @@
   {
 #pragma acc loop gang
     for (int jx = 0; jx < 1; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int ix = 0; ix < size; ix++)
 	ary[ix] = place ();
   }
 }
 
-/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 24\(1\).*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(6\).*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 1\);.*Head-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 2\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 1\);} "oaccdevlow" } } */
+/* { dg-final { scan-tree-dump {
+OpenACC loops.*Loop 0\(0\).*Loop [0-9]{2}\(1\).*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(6\).*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 1\);.*Head-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 2\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 1\);} "oaccdevlow" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/asyncwait-1.f95 b/gcc/testsuite/gfortran.dg/goacc/asyncwait-1.f95
index d630d38..c8a72fa 100644
--- a/gcc/testsuite/gfortran.dg/goacc/asyncwait-1.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/asyncwait-1.f95
@@ -11,13 +11,13 @@
   a(:) = 3.0
   b(:) = 0.0
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1 2) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1 2) ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
   !$acc end parallel ! { dg-error "Unexpected \\\!\\\$ACC END PARALLEL" }
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,) ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
@@ -29,25 +29,25 @@
   end do
   !$acc end parallel ! { dg-error "Unexpected \\\!\\\$ACC END PARALLEL" }
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,2,) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,2,) ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
   !$acc end parallel ! { dg-error "Unexpected \\\!\\\$ACC END PARALLEL" }
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,2 3) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,2 3) ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
   !$acc end parallel ! { dg-error "Unexpected \\\!\\\$ACC END PARALLEL" }
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,2,,) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1,2,,) ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
   !$acc end parallel ! { dg-error "Unexpected \\\!\\\$ACC END PARALLEL" }
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1  ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) async (1  ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
diff --git a/gcc/testsuite/gfortran.dg/goacc/asyncwait-2.f95 b/gcc/testsuite/gfortran.dg/goacc/asyncwait-2.f95
index fe4e4ee..3663c9b 100644
--- a/gcc/testsuite/gfortran.dg/goacc/asyncwait-2.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/asyncwait-2.f95
@@ -83,13 +83,13 @@
   end do
   !$acc end parallel ! { dg-error "Unexpected \\\!\\\$ACC END PARALLEL" }
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) waitasync ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) waitasync ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
   !$acc end parallel ! { dg-error "Unexpected \\\!\\\$ACC END PARALLEL" }
 
-  !$acc parallel copyin (a(1:N)) copy (b(1:N)) asyncwait ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel copyin (a(1:N)) copy (b(1:N)) asyncwait ! { dg-error "Failed to match clause" }
   do i = 1, N
      b(i) = a(i)
   end do
diff --git a/gcc/testsuite/gfortran.dg/goacc/asyncwait-3.f95 b/gcc/testsuite/gfortran.dg/goacc/asyncwait-3.f95
index 5c55c36..815928a 100644
--- a/gcc/testsuite/gfortran.dg/goacc/asyncwait-3.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/asyncwait-3.f95
@@ -33,9 +33,9 @@
 
   !$acc wait (1.0) ! { dg-error "WAIT clause at \\\(1\\\) requires a scalar INTEGER expression" }
 
-  !$acc wait 1 ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait 1 ! { dg-error "Failed to match clause" }
 
-  !$acc wait N ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait N ! { dg-error "Failed to match clause" }
 
   !$acc wait (1)
 end program asyncwait
diff --git a/gcc/testsuite/gfortran.dg/goacc/asyncwait-4.f95 b/gcc/testsuite/gfortran.dg/goacc/asyncwait-4.f95
index df31154..057d06b 100644
--- a/gcc/testsuite/gfortran.dg/goacc/asyncwait-4.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/asyncwait-4.f95
@@ -11,21 +11,21 @@
   a(:) = 3.0
   b(:) = 0.0
 
-  !$acc wait async (1 2) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async (1 2) ! { dg-error "Failed to match clause" }
 
-  !$acc wait async (1,) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async (1,) ! { dg-error "Failed to match clause" }
 
   !$acc wait async (,1) ! { dg-error "Invalid character in name" }
 
-  !$acc wait async (1, 2, ) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async (1, 2, ) ! { dg-error "Failed to match clause" }
 
-  !$acc wait async (1, 2, ,) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async (1, 2, ,) ! { dg-error "Failed to match clause" }
 
-  !$acc wait async (1 ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async (1 ! { dg-error "Failed to match clause" }
 
-  !$acc wait async (1, *) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async (1, *) ! { dg-error "Failed to match clause" }
 
-  !$acc wait async (1, a) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async (1, a) ! { dg-error "Failed to match clause" }
 
   !$acc wait async (a) ! { dg-error "ASYNC clause at \\\(1\\\) requires a scalar INTEGER expression" }
 
@@ -33,9 +33,9 @@
 
   !$acc wait async (1.0) ! { dg-error "ASYNC clause at \\\(1\\\) requires a scalar INTEGER expression" }
 
-  !$acc wait async 1 ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait async 1 ! { dg-error "Failed to match clause" }
 
-  !$acc waitasync ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc waitasync ! { dg-error "Failed to match clause" }
 
-  !$acc wait,async ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc wait,async ! { dg-error "Failed to match clause" }
 end program asyncwait
diff --git a/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized.f95 b/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized.f95
index 0877242..6411f48 100644
--- a/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized.f95
@@ -5,7 +5,7 @@
 ! { dg-additional-options "-fopt-info-optimized-omp" }
 ! { dg-additional-options "-fdump-tree-ompexp" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
-! { dg-additional-options "-fdump-tree-oaccdevlow" }
+! { dg-additional-options "-fdump-tree-oaccloops" }
 
 program main
   implicit none
@@ -21,6 +21,7 @@
 
   !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
   do i = 0, n - 1 ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+                  ! { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" "" { target *-*-* } 23 }
      c(i) = a(f (i)) + b(f (i))
   end do
   !$acc end kernels
@@ -37,6 +38,6 @@
 
 ! Check the offloaded function's classification and compute dimensions (will
 ! always be 1 x 1 x 1 for non-offloading compilation).
-! { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccdevlow" } }
+! { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/classify-kernels.f95 b/gcc/testsuite/gfortran.dg/goacc/classify-kernels.f95
index f2c4736..8ee3e3d 100644
--- a/gcc/testsuite/gfortran.dg/goacc/classify-kernels.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/classify-kernels.f95
@@ -5,7 +5,7 @@
 ! { dg-additional-options "-fopt-info-optimized-omp" }
 ! { dg-additional-options "-fdump-tree-ompexp" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
-! { dg-additional-options "-fdump-tree-oaccdevlow" }
+! { dg-additional-options "-fdump-tree-oaccloops" }
 
 program main
   implicit none
@@ -17,6 +17,7 @@
 
   !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
   do i = 0, n - 1 ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+                  ! { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" "" { target *-*-* } 19 }
      c(i) = a(i) + b(i)
   end do
   !$acc end kernels
@@ -33,6 +34,6 @@
 
 ! Check the offloaded function's classification and compute dimensions (will
 ! always be 1 x 1 x 1 for non-offloading compilation).
-! { dg-final { scan-tree-dump-times "(?n)Function is parallelized OpenACC kernels offload" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccdevlow" } }
+! { dg-final { scan-tree-dump-times "(?n)Function is parallelized OpenACC kernels offload" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/classify-parallel.f95 b/gcc/testsuite/gfortran.dg/goacc/classify-parallel.f95
index a23ea81..ae3f322 100644
--- a/gcc/testsuite/gfortran.dg/goacc/classify-parallel.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/classify-parallel.f95
@@ -4,7 +4,7 @@
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fopt-info-optimized-omp" }
 ! { dg-additional-options "-fdump-tree-ompexp" }
-! { dg-additional-options "-fdump-tree-oaccdevlow" }
+! { dg-additional-options "-fdump-tree-oaccloops" }
 
 program main
   implicit none
@@ -26,6 +26,6 @@
 
 ! Check the offloaded function's classification and compute dimensions (will
 ! always be 1 x 1 x 1 for non-offloading compilation).
-! { dg-final { scan-tree-dump-times "(?n)Function is OpenACC parallel offload" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), omp target entrypoint\\)\\)" 1 "oaccdevlow" } }
+! { dg-final { scan-tree-dump-times "(?n)Function is OpenACC parallel offload" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), omp target entrypoint\\)\\)" 1 "oaccloops" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/classify-routine.f95 b/gcc/testsuite/gfortran.dg/goacc/classify-routine.f95
index e435f5d..272fb71 100644
--- a/gcc/testsuite/gfortran.dg/goacc/classify-routine.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/classify-routine.f95
@@ -4,7 +4,7 @@
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fopt-info-optimized-omp" }
 ! { dg-additional-options "-fdump-tree-ompexp" }
-! { dg-additional-options "-fdump-tree-oaccdevlow" }
+! { dg-additional-options "-fdump-tree-oaccloops" }
 
 subroutine ROUTINE
   !$acc routine worker
@@ -25,6 +25,6 @@
 
 ! Check the offloaded function's classification and compute dimensions (will
 ! always be 1 x 1 x 1 for non-offloading compilation).
-! { dg-final { scan-tree-dump-times "(?n)Function is OpenACC routine level 1" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccdevlow" } }
-! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(0 1, 1 1, 1 1\\), omp declare target, oacc function \\(0 1, 1 0, 1 0\\)\\)\\)" 1 "oaccdevlow" } }
+! { dg-final { scan-tree-dump-times "(?n)Function is OpenACC routine level 1" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(0 1, 1 1, 1 1\\), omp declare target, oacc function \\(0 1, 1 0, 1 0\\)\\)\\)" 1 "oaccloops" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/common-block-1.f90 b/gcc/testsuite/gfortran.dg/goacc/common-block-1.f90
new file mode 100644
index 0000000..c9de125
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/common-block-1.f90
@@ -0,0 +1,69 @@
+! Test data clauses involving common blocks and common block data.
+! Specifically, validate early matching errors.
+
+subroutine subtest
+  implicit none
+  integer, parameter :: n = 10
+  integer a(n), b(n), c, d(n), e
+  real*4 x(n), y(n), z, w(n), v
+  common /blockA/ a, c, x
+  common /blockB/ b, y, z
+  !$acc declare link(/blockA/, /blockB/, e, v)
+end subroutine subtest
+
+program test
+  implicit none
+  integer, parameter :: n = 10
+  integer a(n), b(n), c, d(n), e
+  real*4 x(n), y(n), z, w(n), v
+  common /blockA/ a, c, x
+  common /blockB/ b, y, z
+  !$acc declare link(/blockA/, /blockB/, e, v)
+
+  !$acc data copy(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data copyin(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data copyout(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data create(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data copyout(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data pcopy(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data pcopyin(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data pcopyout(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data pcreate(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data pcopyout(/blockA/, /blockB/, e, v)
+  !$acc end data
+
+  !$acc data present(/blockA/, /blockB/, e, v) ! { dg-error "" }
+  !$acc end data ! { dg-error "Unexpected ..ACC END DATA statement" }
+
+  !$acc parallel private(/blockA/, /blockB/, e, v)
+  !$acc end parallel
+
+  !$acc parallel firstprivate(/blockA/, /blockB/, e, v)
+  !$acc end parallel
+
+  !$acc exit data delete(/blockA/, /blockB/, e, v)
+
+  !$acc data deviceptr(/blockA/, /blockB/, e, v) ! { dg-error "Syntax error" }
+  !$acc end data ! { dg-error "Unexpected ..ACC END DATA statement" }
+
+  !$acc data deviceptr(/blockA/, /blockB/, e, v) ! { dg-error "Syntax error" }
+  !$acc end data ! { dg-error "Unexpected ..ACC END DATA statement" }
+end program test
diff --git a/gcc/testsuite/gfortran.dg/goacc/common-block-2.f90 b/gcc/testsuite/gfortran.dg/goacc/common-block-2.f90
new file mode 100644
index 0000000..b836389
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/common-block-2.f90
@@ -0,0 +1,49 @@
+! Test data clauses involving common blocks and common block data.
+! Specifically, resolver errors such as duplicate data clauses.
+
+program test
+  implicit none
+  integer, parameter :: n = 10
+  integer a(n), b(n), c, d(n), e
+  real*4 x(n), y(n), z, w(n), v
+  common /blockA/ a, c, x
+  common /blockB/ b, y, z
+
+  !$acc data copy(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data copyin(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data copyout(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data create(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data copyout(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data pcopy(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data pcopyin(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data pcopyout(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data pcreate(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc data pcopyout(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end data
+
+  !$acc parallel private(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end parallel
+
+  !$acc parallel firstprivate(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+  !$acc end parallel
+
+  !$acc exit data delete(/blockA/, /blockB/, e, v, a) ! { dg-error "Symbol .a. present on multiple clauses" }
+end program test
diff --git a/gcc/testsuite/gfortran.dg/goacc/continuation-free-form.f95 b/gcc/testsuite/gfortran.dg/goacc/continuation-free-form.f95
index 1c9a3f3..b904f7b 100644
--- a/gcc/testsuite/gfortran.dg/goacc/continuation-free-form.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/continuation-free-form.f95
@@ -16,8 +16,8 @@
     x = x + 0.3
   enddo
   ! continuation must begin with sentinel
-  !$acc end parallel & ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc end parallel & ! { dg-error "Unexpected junk" }
   ! loop
 
   print *, x
-end
\ No newline at end of file
+end
diff --git a/gcc/testsuite/gfortran.dg/goacc/data-clauses.f95 b/gcc/testsuite/gfortran.dg/goacc/data-clauses.f95
index b94214e..1a4a671 100644
--- a/gcc/testsuite/gfortran.dg/goacc/data-clauses.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/data-clauses.f95
@@ -39,9 +39,9 @@
   !$acc end data
 
 
-  !$acc parallel copy (tip) ! { dg-error "POINTER" }
+  !$acc parallel copy (tip)
   !$acc end parallel
-  !$acc parallel copy (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel copy (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) copy (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -54,9 +54,9 @@
   !$acc end data
 
 
-  !$acc parallel copyin (tip) ! { dg-error "POINTER" }
+  !$acc parallel copyin (tip)
   !$acc end parallel
-  !$acc parallel copyin (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel copyin (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) copyin (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -71,9 +71,9 @@
   !$acc end data
 
 
-  !$acc parallel copyout (tip) ! { dg-error "POINTER" }
+  !$acc parallel copyout (tip)
   !$acc end parallel
-  !$acc parallel copyout (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel copyout (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) copyout (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -90,9 +90,9 @@
   !$acc end data
 
 
-  !$acc parallel create (tip) ! { dg-error "POINTER" }
+  !$acc parallel create (tip)
   !$acc end parallel
-  !$acc parallel create (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel create (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) create (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -111,9 +111,9 @@
   !$acc end data
 
 
-  !$acc parallel present (tip) ! { dg-error "POINTER" }
+  !$acc parallel present (tip)
   !$acc end parallel
-  !$acc parallel present (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel present (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) present (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -144,9 +144,9 @@
   !$acc end parallel
 
 
-  !$acc parallel present_or_copy (tip) ! { dg-error "POINTER" }
+  !$acc parallel present_or_copy (tip)
   !$acc end parallel
-  !$acc parallel present_or_copy (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel present_or_copy (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) present_or_copy (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -169,9 +169,9 @@
   !$acc end data
 
 
-  !$acc parallel present_or_copyin (tip) ! { dg-error "POINTER" }
+  !$acc parallel present_or_copyin (tip)
   !$acc end parallel
-  !$acc parallel present_or_copyin (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel present_or_copyin (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) present_or_copyin (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -196,9 +196,9 @@
   !$acc end data
 
 
-  !$acc parallel present_or_copyout (tip) ! { dg-error "POINTER" }
+  !$acc parallel present_or_copyout (tip)
   !$acc end parallel
-  !$acc parallel present_or_copyout (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel present_or_copyout (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) present_or_copyout (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -225,9 +225,9 @@
   !$acc end data
 
 
-  !$acc parallel present_or_create (tip) ! { dg-error "POINTER" }
+  !$acc parallel present_or_create (tip)
   !$acc end parallel
-  !$acc parallel present_or_create (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc parallel present_or_create (tia)
   !$acc end parallel
   !$acc parallel deviceptr (i) present_or_create (i) ! { dg-error "multiple clauses" }
   !$acc end parallel
@@ -256,4 +256,4 @@
   !$acc end data
 
   end subroutine foo
-end module test
\ No newline at end of file
+end module test
diff --git a/gcc/testsuite/gfortran.dg/goacc/declare-allocatable-1.f90 b/gcc/testsuite/gfortran.dg/goacc/declare-allocatable-1.f90
new file mode 100644
index 0000000..5349e0d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/declare-allocatable-1.f90
@@ -0,0 +1,25 @@
+! Verify that OpenACC declared allocatable arrays have implicit
+! OpenACC enter and exit pragmas at the time of allocation and
+! deallocation.
+
+! { dg-additional-options "-fdump-tree-original" }
+
+program allocate
+  implicit none
+  integer, allocatable :: a(:), b
+  integer, parameter :: n = 100
+  integer i
+  !$acc declare create(a,b)
+
+  allocate (a(n), b)
+
+  !$acc parallel loop copyout(a, b)
+  do i = 1, n
+     a(i) = b
+  end do
+
+  deallocate (a, b)
+end program allocate
+
+! { dg-final { scan-tree-dump-times "pragma acc enter data map.declare_allocate" 2 "original" } }
+! { dg-final { scan-tree-dump-times "pragma acc exit data map.declare_deallocate" 2 "original" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/default-2.f b/gcc/testsuite/gfortran.dg/goacc/default-2.f
index 8f88688..ea82388 100644
--- a/gcc/testsuite/gfortran.dg/goacc/default-2.f
+++ b/gcc/testsuite/gfortran.dg/goacc/default-2.f
@@ -3,58 +3,58 @@
       SUBROUTINE F1
       IMPLICIT NONE
 
-!$ACC KERNELS DEFAULT ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT ( ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT ( ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT ( ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT ( ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (, ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (, ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (, ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (, ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT () ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT () ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT () ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT () ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (,) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (,) ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (,) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (,) ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (FIRSTPRIVATE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (FIRSTPRIVATE) ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (FIRSTPRIVATE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (FIRSTPRIVATE) ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (PRIVATE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (PRIVATE) ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (PRIVATE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (PRIVATE) ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (SHARED) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (SHARED) ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (SHARED) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (SHARED) ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (NONE ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (NONE ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (NONE ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (NONE ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (NONE NONE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (NONE NONE) ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (NONE NONE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (NONE NONE) ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
 
-!$ACC KERNELS DEFAULT (NONE, NONE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC KERNELS DEFAULT (NONE, NONE) ! { dg-error "Failed to match clause" }
 !$ACC END KERNELS ! { dg-error "Unexpected" }
-!$ACC PARALLEL DEFAULT (NONE, NONE) ! { dg-error "Unclassifiable OpenACC directive" }
+!$ACC PARALLEL DEFAULT (NONE, NONE) ! { dg-error "Failed to match clause" }
 !$ACC END PARALLEL ! { dg-error "Unexpected" }
       END SUBROUTINE F1
diff --git a/gcc/testsuite/gfortran.dg/goacc/derived-types-2.f90 b/gcc/testsuite/gfortran.dg/goacc/derived-types-2.f90
new file mode 100644
index 0000000..d01583f
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/derived-types-2.f90
@@ -0,0 +1,14 @@
+module bar
+  type :: type1
+     real(8), pointer, public :: p(:) => null()
+  end type
+  type :: type2
+     class(type1), pointer :: p => null()
+  end type
+end module
+
+subroutine foo (var)
+   use bar
+   type(type2), intent(inout) :: var
+   !$acc enter data create(var%p%p)
+end subroutine
diff --git a/gcc/testsuite/gfortran.dg/goacc/derived-types.f90 b/gcc/testsuite/gfortran.dg/goacc/derived-types.f90
new file mode 100644
index 0000000..5fb2981
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/derived-types.f90
@@ -0,0 +1,77 @@
+! Test ACC UPDATE with derived types.
+
+module dt
+  integer, parameter :: n = 10
+  type inner
+     integer :: d(n)
+  end type inner
+  type dtype
+     integer(8) :: a, b, c(n)
+     type(inner) :: in
+  end type dtype
+end module dt
+
+program derived_acc
+  use dt
+  
+  implicit none
+  type(dtype):: var
+  integer i
+  !$acc declare create(var)
+  !$acc declare pcopy(var%a) ! { dg-error "Syntax error in OpenMP" }
+
+  !$acc update host(var)
+  !$acc update host(var%a)
+  !$acc update device(var)
+  !$acc update device(var%a)
+  !$acc update self(var)
+  !$acc update self(var%a)
+  
+  !$acc enter data copyin(var)
+  !$acc enter data copyin(var%a)
+
+  !$acc exit data copyout(var)
+  !$acc exit data copyout(var%a)
+
+  !$acc data copy(var)
+  !$acc end data
+
+  !$acc data copyout(var%a)
+  !$acc end data
+
+  !$acc parallel loop pcopyout(var)
+  do i = 1, 10
+  end do  
+  !$acc end parallel loop
+
+  !$acc parallel loop copyout(var%a)
+  do i = 1, 10
+  end do
+  !$acc end parallel loop
+
+  !$acc parallel pcopy(var)
+  !$acc end parallel
+
+  !$acc parallel pcopy(var%a)
+  do i = 1, 10
+  end do
+  !$acc end parallel
+  
+  !$acc kernels pcopyin(var)
+  !$acc end kernels
+
+  !$acc kernels pcopy(var%a)
+  do i = 1, 10
+  end do
+  !$acc end kernels
+
+  !$acc kernels loop pcopyin(var)
+  do i = 1, 10
+  end do
+  !$acc end kernels loop
+
+  !$acc kernels loop pcopy(var%a)
+  do i = 1, 10
+  end do
+  !$acc end kernels loop
+end program derived_acc
diff --git a/gcc/testsuite/gfortran.dg/goacc/enter-exit-data.f95 b/gcc/testsuite/gfortran.dg/goacc/enter-exit-data.f95
index 805459c..c2a4979 100644
--- a/gcc/testsuite/gfortran.dg/goacc/enter-exit-data.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/enter-exit-data.f95
@@ -28,7 +28,7 @@
   !$acc enter data
   !$acc enter data if (.false.)
   !$acc enter data if (l)
-  !$acc enter data if (.false.) if (l) ! { dg-error "Unclassifiable" }
+  !$acc enter data if (.false.) if (l) ! { dg-error "Failed to match clause" }
   !$acc enter data if (i) ! { dg-error "LOGICAL" }
   !$acc enter data if (1) ! { dg-error "LOGICAL" }
   !$acc enter data if (a) ! { dg-error "LOGICAL" }
@@ -44,14 +44,14 @@
   !$acc enter data wait (i, 1) 
   !$acc enter data wait (a) ! { dg-error "INTEGER" }
   !$acc enter data wait (b(5:6)) ! { dg-error "INTEGER" }
-  !$acc enter data copyin (tip) ! { dg-error "POINTER" }
-  !$acc enter data copyin (tia) ! { dg-error "ALLOCATABLE" }
-  !$acc enter data create (tip) ! { dg-error "POINTER" }
-  !$acc enter data create (tia) ! { dg-error "ALLOCATABLE" }
-  !$acc enter data present_or_copyin (tip) ! { dg-error "POINTER" }
-  !$acc enter data present_or_copyin (tia) ! { dg-error "ALLOCATABLE" }
-  !$acc enter data present_or_create (tip) ! { dg-error "POINTER" }
-  !$acc enter data present_or_create (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc enter data copyin (tip)
+  !$acc enter data copyin (tia)
+  !$acc enter data create (tip)
+  !$acc enter data create (tia)
+  !$acc enter data present_or_copyin (tip)
+  !$acc enter data present_or_copyin (tia)
+  !$acc enter data present_or_create (tip)
+  !$acc enter data present_or_create (tia)
   !$acc enter data copyin (i) create (i) ! { dg-error "multiple clauses" }
   !$acc enter data copyin (i) present_or_copyin (i) ! { dg-error "multiple clauses" }
   !$acc enter data create (i) present_or_copyin (i) ! { dg-error "multiple clauses" }
@@ -63,7 +63,7 @@
   !$acc exit data
   !$acc exit data if (.false.)
   !$acc exit data if (l)
-  !$acc exit data if (.false.) if (l) ! { dg-error "Unclassifiable" }
+  !$acc exit data if (.false.) if (l) ! { dg-error "Failed to match clause" }
   !$acc exit data if (i) ! { dg-error "LOGICAL" }
   !$acc exit data if (1) ! { dg-error "LOGICAL" }
   !$acc exit data if (a) ! { dg-error "LOGICAL" }
@@ -79,10 +79,10 @@
   !$acc exit data wait (i, 1) 
   !$acc exit data wait (a) ! { dg-error "INTEGER" }
   !$acc exit data wait (b(5:6)) ! { dg-error "INTEGER" }
-  !$acc exit data copyout (tip) ! { dg-error "POINTER" }
-  !$acc exit data copyout (tia) ! { dg-error "ALLOCATABLE" }
-  !$acc exit data delete (tip) ! { dg-error "POINTER" }
-  !$acc exit data delete (tia) ! { dg-error "ALLOCATABLE" }
+  !$acc exit data copyout (tip)
+  !$acc exit data copyout (tia)
+  !$acc exit data delete (tip)
+  !$acc exit data delete (tia)
   !$acc exit data copyout (i) delete (i) ! { dg-error "multiple clauses" }
   !$acc exit data finalize
   !$acc exit data finalize copyout (i)
diff --git a/gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95 b/gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95
index d44ca58..2ac1c0d 100644
--- a/gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95
@@ -7,5 +7,15 @@
 
   !$acc host_data use_device(p)
   !$acc end host_data
+
+  !$acc host_data use_device(p) if (p == 42)
+  !$acc end host_data
+
+  !$acc host_data use_device(p) if_present if (p == 43)
+  !$acc end host_data
 end program test
-! { dg-final { scan-tree-dump-times "pragma acc host_data use_device_ptr\\(p\\)" 1 "original" } }
+! { dg-final { scan-tree-dump-times "pragma acc host_data use_device_ptr\\(p\\)" 3 "original" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*p == 42;" 1 "original" } }
+! { dg-final { scan-tree-dump-times "pragma acc host_data use_device_ptr\\(p\\) if\\(D.\[0-9\]+\\)" 2 "original" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*p == 43;" 1 "original" } }
+! { dg-final { scan-tree-dump-times "pragma acc host_data use_device_ptr\\(p\\) if\\(D.\[0-9\]+\\) if_present" 1 "original" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/if.f95 b/gcc/testsuite/gfortran.dg/goacc/if.f95
index a45035d..35e9cfe 100644
--- a/gcc/testsuite/gfortran.dg/goacc/if.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/if.f95
@@ -6,7 +6,7 @@
   logical :: x
   integer :: i
 
-  !$acc parallel if ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel if ! { dg-error "Failed to match clause" }
   !$acc parallel if () ! { dg-error "Invalid character" }
   !$acc parallel if (i) ! { dg-error "scalar LOGICAL expression" }
   !$acc end parallel 
@@ -14,11 +14,11 @@
   !$acc end parallel 
   !$acc kernels if (i) ! { dg-error "scalar LOGICAL expression" }
   !$acc end kernels 
-  !$acc kernels if ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc kernels if ! { dg-error "Failed to match clause" }
   !$acc kernels if () ! { dg-error "Invalid character" }
   !$acc kernels if (1) ! { dg-error "scalar LOGICAL expression" }
   !$acc end kernels
-  !$acc data if ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc data if ! { dg-error "Failed to match clause" }
   !$acc data if () ! { dg-error "Invalid character" }
   !$acc data if (i) ! { dg-error "scalar LOGICAL expression" }
   !$acc end data 
@@ -26,9 +26,9 @@
   !$acc end data 
 
   ! at most one if clause may appear
-  !$acc parallel if (.false.) if (.false.) { dg-error "Unclassifiable OpenACC directive" }
-  !$acc kernels if (.false.) if (.false.) { dg-error "Unclassifiable OpenACC directive" }
-  !$acc data if (.false.) if (.false.) { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel if (.false.) if (.false.) { dg-error "Failed to match clause" }
+  !$acc kernels if (.false.) if (.false.) { dg-error "Failed to match clause" }
+  !$acc data if (.false.) if (.false.) { dg-error "Failed to match clause" }
 
   !$acc parallel if (x)
   !$acc end parallel
@@ -49,4 +49,4 @@
   !$acc data if (i.gt.1)
   !$acc end data
 
-end program test
\ No newline at end of file
+end program test
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-2.f95
index 6a9f241..f365896 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-2.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-2.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-ealias-all" }
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95
index 07dc8d6..cd5e539 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-ealias-all" }
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-4.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-4.f95
index 415eb96..0cda06e 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-4.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-4.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-ealias-all" }
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-alias.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-alias.f95
index 62f9a71..d8dcd37 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-alias.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-alias.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-ealias-all" }
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
new file mode 100644
index 0000000..4672d15
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -0,0 +1,59 @@
+! { dg-additional-options "-fdump-tree-convert_oacc_kernels" }
+
+program main
+  implicit none
+  integer, parameter         :: N = 1024
+  integer, dimension (1:N)   :: a
+  integer                    :: i, sum
+
+  !$acc kernels copyin(a(1:N)) copy(sum)
+
+  ! converted to "oacc_kernels"
+  !$acc loop
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  ! converted to "oacc_parallel_kernels_gang_single"
+  sum = sum + 1
+  a(1) = a(1) + 1
+
+  ! converted to "oacc_parallel_kernels_parallelized"
+  !$acc loop independent
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  ! converted to "oacc_kernels"
+  if (sum .gt. 10) then
+    !$acc loop
+    do i = 1, N
+      sum = sum + a(i)
+    end do
+  end if
+
+  ! converted to "oacc_kernels"
+  !$acc loop auto
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  !$acc end kernels
+end program main
+
+! Check that the kernels region is split into a data region and enclosed
+! parallel regions.
+! { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "convert_oacc_kernels" } }
+
+! As noted in the comments above, we get one gang-single serial region; one
+! parallelized loop region; and three "old-style" kernel regions.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 "convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 1 "convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_kernels " 3 "convert_oacc_kernels" } }
+
+! Each of the parallel regions is async, and there is a final call to
+! __builtin_GOACC_wait.
+! { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 3 "convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single async\\(-1\\)" 1 "convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized async\\(-1\\)" 1 "convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "convert_oacc_kernels" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
new file mode 100644
index 0000000..073b963
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1.f95
@@ -0,0 +1,131 @@
+! Test OpenACC 'kernels' construct decomposition.
+
+! { dg-additional-options "-fopt-info-optimized-omp" }
+! { dg-additional-options "-O2" } for "parloops".
+
+! See also "../../c-c++-common/goacc/kernels-decompose-1.c".
+
+program main
+  implicit none
+
+  integer, external :: f_g
+  !$acc routine (f_g) gang
+  integer, external :: f_w
+  !$acc routine (f_w) worker
+  integer, external :: f_v
+  !$acc routine (f_v) vector
+  integer, external :: f_s
+  !$acc routine (f_s) seq
+
+  integer :: i, j, k
+  integer :: x, y, z
+  logical :: y_l
+  integer, parameter :: N = 10
+  integer :: a(N), b(N), c(N)
+
+  !$acc kernels
+  x = 0 ! { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" }
+  y = 0
+  y_l = x < 10
+  z = x
+  x = x + 1
+  ;
+  !$acc end kernels
+
+  !$acc kernels ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  do i = 1, N ! { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" }
+     a(i) = 0
+  end do
+  !$acc end kernels
+
+  !$acc kernels loop ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc kernels
+  !$acc loop ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc loop ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     c(i) = a(i) * b(i)
+  end do
+
+  a(z) = 0 ! { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" }
+
+  !$acc loop ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "optimized: forwarded loop nest in OpenACC .kernels. construct to .parloops. for analysis" "" { target *-*-* } .-1 }
+  do i = 1, N
+     c(i) = c(i) + a(i)
+  end do
+
+  !$acc loop seq ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+  ! { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 }
+  do i = 1 + 1, N
+     c(i) = c(i) + c(i - 1)
+  end do
+  !$acc end kernels
+
+  !$acc kernels ! { dg-bogus "optimized: assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } }
+  !$acc loop independent ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  ! { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 }
+  do i = 1, N
+     !$acc loop independent ! { dg-message "optimized: assigned OpenACC worker loop parallelism" }
+     do j = 1, N
+        !$acc loop independent ! { dg-message "optimized: assigned OpenACC seq loop parallelism" "TODO" { xfail *-*-* } }
+        ! { dg-warning "insufficient partitioning available to parallelize loop" "TODO" { xfail *-*-* } .-1 }
+        ! { dg-bogus "optimized: assigned OpenACC vector loop parallelism" "TODO" { xfail *-*-* } .-2 }
+        do k = 1, N
+           a(1 + mod(i + j + k, N)) &
+                = b(j) &
+                + f_v (c(k)) ! { dg-message "optimized: assigned OpenACC vector loop parallelism" "TODO" { xfail *-*-* } .-1 }
+        end do
+     end do
+  end do
+
+  !TODO Should the following turn into "gang-single" instead of "parloops"?
+  !TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else goto <D.2548>;", thus "parloops".
+  if (y < 5) then ! { dg-message "optimized: beginning .parloops. region in OpenACC .kernels. construct" }
+     !$acc loop independent ! { dg-message "optimized: unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" }
+     do j = 1, N
+        b(j) = f_w (c(j))
+     end do
+  end if
+  !$acc end kernels
+
+  !$acc kernels
+  !TODO This refers to the "gang-single" "f_g" call.
+  ! { dg-warning "region contains gang partitoned code but is not gang partitioned" "TODO" { xfail *-*-* } .-2 }
+  ! { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" "" { target *-*-* } .+1 }
+  y = f_g (a(5)) ! { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" "TODO" { xfail *-*-* } }
+
+  !$acc loop independent ! { dg-message "optimized: assigned OpenACC gang loop parallelism" "TODO" { xfail *-*-* } }
+  ! { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 }
+  ! { dg-bogus "optimized: assigned OpenACC gang vector loop parallelism" "TODO" { xfail *-*-* } .-2 }
+  do j = 1, N
+     b(j) = y + f_w (c(j)) ! { dg-message "optimized: assigned OpenACC worker vector loop parallelism" "TODO" { xfail *-*-* } }
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  y = 3 ! { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" }
+
+  !$acc loop independent ! { dg-message "optimized: assigned OpenACC gang worker loop parallelism" "TODO" { xfail *-*-* } }
+  ! { dg-message "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } .-1 }
+  ! { dg-bogus "optimized: assigned OpenACC gang vector loop parallelism" "TODO" { xfail *-*-* } .-2 }
+  do j = 1, N
+     b(j) = y + f_v (c(j)) ! { dg-message "optimized: assigned OpenACC vector loop parallelism" "TODO" { xfail *-*-* } }
+  end do
+
+  z = 2 ! { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" }
+  !$acc end kernels
+
+  !$acc kernels ! { dg-message "optimized: beginning .gang-single. region in OpenACC .kernels. construct" }
+  !$acc end kernels  
+end program main
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-2.f95
index ef53324..59001e4 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-2.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-2.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-2.f95
index 2f1dcd6..b6f50cb 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-2.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-2.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95
index 447e85d6..779073a 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit-2.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit.f95
index 4edb288..30ae2cb 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-enter-exit.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-update.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-update.f95
index fc113e1..b68945a 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-update.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-update.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data.f95
index 94522f5..f5c6688 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-inner.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-inner.f95
index a3ad591..18509b2 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-inner.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-inner.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fopt-info-optimized-omp" }
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95
index b9c4aea..4c43b11 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop.f95
index 6dc7b2e..4da7040 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loops-adjacent.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loops-adjacent.f95
index fb92da8..a83ff95 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-loops-adjacent.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loops-adjacent.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 
 program main
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95
index 48c20b9..1586e64 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-parallel-loop-data-enter-exit.f95
@@ -1,3 +1,5 @@
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 ! { dg-additional-options "-O2" }
 ! { dg-additional-options "-fdump-tree-parloops1-all" }
 ! { dg-additional-options "-fdump-tree-optimized" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
index a70f1e7..bc9beba 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
@@ -1,5 +1,6 @@
 ! { dg-do compile } 
 ! { dg-additional-options "-fdump-tree-original" } 
+! { dg-additional-options "-fdump-tree-convert_oacc_kernels" }
 
 program test
   implicit none
@@ -33,3 +34,7 @@
 ! { dg-final { scan-tree-dump-times "map\\(alloc:t\\)" 1 "original" } } 
 
 ! { dg-final { scan-tree-dump-times "map\\(force_deviceptr:u\\)" 1 "original" } } 
+
+! Verify that the 'if' clause gets duplicated.
+! { dg-final { scan-tree-dump-times "#pragma omp target oacc_data_kernels if\\(" 1 "convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target oacc_parallel_kernels_gang_single .* if\\(" 1 "convert_oacc_kernels" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/list.f95 b/gcc/testsuite/gfortran.dg/goacc/list.f95
index 87c8408..d2f4c5e 100644
--- a/gcc/testsuite/gfortran.dg/goacc/list.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/list.f95
@@ -24,7 +24,7 @@
   !$acc parallel private (i) private (j)
   !$acc end parallel
 
-  !$acc parallel private ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel private ! { dg-error "Failed to match clause" }
 
   !$acc parallel private() ! { dg-error "Syntax error" }
 
@@ -56,7 +56,7 @@
   !$acc parallel firstprivate (i) firstprivate (j)
   !$acc end parallel
 
-  !$acc parallel firstprivate ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel firstprivate ! { dg-error "Failed to match clause" }
 
   !$acc parallel firstprivate() ! { dg-error "Syntax error" }
 
@@ -91,7 +91,7 @@
   !$acc host_data use_device (i) use_device (j) ! { dg-error "neither a POINTER nor an array" }
   !$acc end host_data
 
-  !$acc host_data use_device ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc host_data use_device ! { dg-error "Failed to match clause" }
 
   !$acc host_data use_device() ! { dg-error "Syntax error" }
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/literal.f95 b/gcc/testsuite/gfortran.dg/goacc/literal.f95
index e6760d0..896749a 100644
--- a/gcc/testsuite/gfortran.dg/goacc/literal.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/literal.f95
@@ -10,7 +10,7 @@
     !$acc end data ! { dg-error "Unexpected" }
     !$acc data deviceptr (10) ! { dg-error "Syntax error" }
     !$acc end data ! { dg-error "Unexpected" }
-    !$acc data private (10) ! { dg-error "Unclassifiable" }
+    !$acc data private (10) ! { dg-error "Failed to match clause" }
     !$acc end data ! { dg-error "Unexpected" }
     !$acc host_data use_device (10) ! { dg-error "Syntax error" }
     !$acc end host_data ! { dg-error "Unexpected" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels-tile.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels-tile.f95
index 96e0ccb..6542515 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels-tile.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels-tile.f95
@@ -3,7 +3,7 @@
   integer :: i, j
 
   !$acc kernels
-    !$acc loop tile ! { dg-error "Unclassifiable" }
+    !$acc loop tile ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop tile() ! { dg-error "Syntax error" }
@@ -29,7 +29,7 @@
       DO j = 1,10
       ENDDO
     ENDDO
-    !$acc loop tile(-1) ! { dg-warning "must be positive" }
+    !$acc loop tile(-1) ! { dg-error "must be positive" }
     do i = 1,10
     enddo
     !$acc loop tile(i) ! { dg-error "constant expression" }
@@ -65,7 +65,7 @@
     ENDDO
   !$acc end kernels
 
-  !$acc kernels loop tile ! { dg-error "Unclassifiable" }
+  !$acc kernels loop tile ! { dg-error "Failed to match clause" }
   DO i = 1,10
   ENDDO
   !$acc kernels loop tile() ! { dg-error "Syntax error" }
@@ -82,7 +82,7 @@
     DO j = 1,10
     ENDDO
   ENDDO
-  !$acc kernels loop tile(-1) ! { dg-warning "must be positive" }
+  !$acc kernels loop tile(-1) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
   !$acc kernels loop tile(i) ! { dg-error "constant expression" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-2-parallel-tile.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-2-parallel-tile.f95
index 3a4db5d..dae8f66 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-2-parallel-tile.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-2-parallel-tile.f95
@@ -3,7 +3,7 @@
   integer :: i, j
 
   !$acc parallel
-    !$acc loop tile ! { dg-error "Unclassifiable" }
+    !$acc loop tile ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop tile() ! { dg-error "Syntax error" }
@@ -20,7 +20,7 @@
       DO j = 1,10
       ENDDO
     ENDDO
-    !$acc loop tile(-1) ! { dg-warning "must be positive" }
+    !$acc loop tile(-1) ! { dg-error "must be positive" }
     do i = 1,10
     enddo
     !$acc loop tile(i) ! { dg-error "constant expression" }
@@ -56,7 +56,7 @@
     ENDDO
   !$acc end parallel
 
-  !$acc parallel loop tile ! { dg-error "Unclassifiable" }
+  !$acc parallel loop tile ! { dg-error "Failed to match clause" }
   DO i = 1,10
   ENDDO
   !$acc parallel loop tile() ! { dg-error "Syntax error" }
@@ -73,7 +73,7 @@
     DO j = 1,10
     ENDDO
   ENDDO
-  !$acc parallel loop tile(-1) ! { dg-warning "must be positive" }
+  !$acc parallel loop tile(-1) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
   !$acc parallel loop tile(i) ! { dg-error "constant expression" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-5.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-5.f95
index d059cf7..fe137d5 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-5.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-5.f95
@@ -93,9 +93,6 @@
       DO j = 1,10
       ENDDO
     ENDDO
-    !$acc loop tile(-1) ! { dg-warning "must be positive" }
-    do i = 1,10
-    enddo
     !$acc loop vector tile(*)
     DO i = 1,10
     ENDDO
@@ -129,9 +126,6 @@
       DO j = 1,10
       ENDDO
     ENDDO
-    !$acc loop tile(-1) ! { dg-warning "must be positive" }
-    do i = 1,10
-    enddo
     !$acc loop vector tile(*)
     DO i = 1,10
     ENDDO
@@ -242,9 +236,6 @@
     DO j = 1,10
     ENDDO
   ENDDO
-  !$acc kernels loop tile(-1) ! { dg-warning "must be positive" }
-  do i = 1,10
-  enddo
   !$acc kernels loop vector tile(*)
   DO i = 1,10
   ENDDO
@@ -333,9 +324,6 @@
     DO j = 1,10
     ENDDO
   ENDDO
-  !$acc parallel loop tile(-1) ! { dg-warning "must be positive" }
-  do i = 1,10
-  enddo
   !$acc parallel loop vector tile(*)
   DO i = 1,10
   ENDDO
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-7.f95 b/gcc/testsuite/gfortran.dg/goacc/loop-7.f95
index 9ca8297..37d50d3 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-7.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-7.f95
@@ -30,13 +30,13 @@
     !$acc loop gang(num:num, static:1)
     DO i = 1,10
     ENDDO
-    !$acc loop gang(static:*, num:5, static:5) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop gang(static:*, num:5, static:5) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
-    !$acc loop gang(1, num:2, static:3) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop gang(1, num:2, static:3) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
-    !$acc loop gang(num:num static:1) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop gang(num:num static:1) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop gang(num)
@@ -45,7 +45,7 @@
     !$acc loop gang(num:num+1, static:1+num)
     DO i = 1,10
     ENDDO
-    !$acc loop gang(length:num) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop gang(length:num) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
 
@@ -58,19 +58,19 @@
     !$acc loop worker (num)
     DO i = 1,10
     ENDDO
-    !$acc loop worker (static:num) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop worker (static:num) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop worker (num:,) ! { dg-error "Invalid character" }
     DO i = 1,10
     ENDDO
-    !$acc loop worker (num:num:num) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop worker (num:num:num) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop worker (num:num*num)
     DO i = 1,10
     ENDDO
-    !$acc loop worker (length:num*num) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop worker (length:num*num) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop worker (num:*) ! { dg-error "Invalid character" }
@@ -89,13 +89,13 @@
     !$acc loop vector (length)
     DO i = 1,10
     ENDDO
-    !$acc loop vrctor (static:num) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop vrctor (static:num) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop vector (length:,) ! { dg-error "Invalid character" }
     DO i = 1,10
     ENDDO
-    !$acc loop vector (length:num:num) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop vector (length:num:num) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop vector (length:static*num)
@@ -107,7 +107,7 @@
     !$acc loop vector (length:32)
     DO i = 1,10
     ENDDO
-    !$acc loop vector (num:num*num) ! { dg-error "Unclassifiable OpenACC directive" }
+    !$acc loop vector (num:num*num) ! { dg-error "Failed to match clause" }
     DO i = 1,10
     ENDDO
     !$acc loop vector (length:*) ! { dg-error "Invalid character" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-auto-1.f90 b/gcc/testsuite/gfortran.dg/goacc/loop-auto-1.f90
new file mode 100644
index 0000000..8d600f4
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-auto-1.f90
@@ -0,0 +1,88 @@
+! Ensure that the auto clause falls back to seq parallelism when the
+! OpenACC loop is not explicitly independent.
+
+! { dg-additional-options "-fopt-info-optimized-omp" }
+
+program test
+  implicit none
+  integer, parameter :: n = 100
+  integer i, j, k, l
+  
+  !$acc parallel loop auto ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+  do i = 1, n
+     !$acc loop auto independent ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+     do j = 1, n
+        !$acc loop worker vector ! { dg-message "optimized: assigned OpenACC worker vector loop parallelism" }
+        do k = 1, n
+        end do
+     end do
+  end do
+ 
+  !$acc parallel loop auto independent ! { dg-message "optimized: assigned OpenACC gang worker loop parallelism" }
+  do i = 1, n
+     !$acc loop auto ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+     do j = 1, n
+        !$acc loop auto ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+        do k = 1, n
+           !$acc loop auto independent ! { dg-message "optimized: assigned OpenACC vector loop parallelism" }
+           do l = 1, n
+           end do
+        end do
+     end do
+  end do
+
+  !$acc parallel loop gang ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  do i = 1, n
+     !$acc loop worker ! { dg-message "optimized: assigned OpenACC worker loop parallelism" }
+     do j = 1, n
+        !$acc loop vector ! { dg-message "optimized: assigned OpenACC vector loop parallelism" }
+        do k = 1, n
+           !$acc loop auto independent ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+           ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 }
+           do l = 1, n
+           end do
+           !$acc loop auto ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+           do l = 1, n
+           end do
+        end do
+     end do
+  end do
+  
+
+  !$acc parallel loop ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+  ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 }
+  do i = 1, n
+     !$acc loop gang worker ! { dg-message "optimized: assigned OpenACC gang worker loop parallelism" }
+     do j = 1, n
+        !$acc loop auto ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+	do k = 1, n
+          !$acc loop vector ! { dg-message "optimized: assigned OpenACC vector loop parallelism" }
+          do l = 1, n
+          end do
+       end do
+       !$acc loop auto independent ! { dg-message "optimized: assigned OpenACC vector loop parallelism" }
+       do l = 1, n
+       end do
+    end do
+    !$acc loop worker ! { dg-message "optimized: assigned OpenACC worker loop parallelism" }
+    do j = 1, n
+       !$acc loop vector ! { dg-message "optimized: assigned OpenACC vector loop parallelism" }
+       do k = 1, n
+       end do
+    end do
+  end do
+
+  !$acc parallel loop ! { dg-message "optimized: assigned OpenACC gang loop parallelism" }
+  do i = 1, n
+     !$acc loop ! { dg-message "optimized: assigned OpenACC worker loop parallelism" }
+     do j = 1, n
+        !$acc loop ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+        ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 }
+        do k = 1, n
+           !$acc loop ! { dg-message "optimized: assigned OpenACC vector loop parallelism" }
+           do l = 1, n
+           end do
+        end do
+     end do
+  end do
+end program test
diff --git a/gcc/testsuite/gfortran.dg/goacc/orphan-reductions-1.f90 b/gcc/testsuite/gfortran.dg/goacc/orphan-reductions-1.f90
new file mode 100644
index 0000000..7f363d5
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/orphan-reductions-1.f90
@@ -0,0 +1,204 @@
+! Verify that gang reduction on orphan OpenACC loops reported as errors.
+
+subroutine s1
+  implicit none
+
+  integer, parameter :: n = 100
+  integer :: i, sum
+  sum = 0
+
+  !$acc parallel reduction(+:sum)
+  do i = 1, n
+     sum = sum + 1
+  end do
+  !$acc end parallel
+
+  !$acc parallel loop gang reduction(+:sum)
+  do i = 1, n
+     sum = sum + 1
+  end do
+
+  !$acc parallel
+  !$acc loop gang reduction(+:sum)
+  do i = 1, n
+     sum = sum + 1
+  end do
+  !$acc end parallel
+end subroutine s1
+
+subroutine s2
+  implicit none
+  !$acc routine worker
+
+  integer, parameter :: n = 100
+  integer :: i, j, sum
+  sum = 0
+
+  !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+  do i = 1, n
+     sum = sum + 1
+  end do
+
+  !$acc loop reduction(+:sum)
+  do i = 1, n
+     !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+     do j = 1, n
+        sum = sum + 1
+     end do
+  end do
+end subroutine s2
+
+integer function f1 ()
+  implicit none
+
+  integer, parameter :: n = 100
+  integer :: i, sum
+  sum = 0
+
+  !$acc parallel reduction(+:sum)
+  do i = 1, n
+     sum = sum + 1
+  end do
+  !$acc end parallel
+
+  !$acc parallel loop gang reduction(+:sum)
+  do i = 1, n
+     sum = sum + 1
+  end do
+
+  !$acc parallel
+  !$acc loop gang reduction(+:sum)
+  do i = 1, n
+     sum = sum + 1
+  end do
+  !$acc end parallel
+
+  f1 = sum
+end function f1
+
+integer function f2 ()
+  implicit none
+  !$acc routine worker
+
+  integer, parameter :: n = 100
+  integer :: i, j, sum
+  sum = 0
+
+  !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+  do i = 1, n
+     sum = sum + 1
+  end do
+
+  !$acc loop reduction(+:sum)
+  do i = 1, n
+     !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+     do j = 1, n
+        sum = sum + 1
+     end do
+  end do
+
+  f2 = sum
+end function f2
+
+module m
+contains
+  subroutine s3
+    implicit none
+
+    integer, parameter :: n = 100
+    integer :: i, sum
+    sum = 0
+
+    !$acc parallel reduction(+:sum)
+    do i = 1, n
+       sum = sum + 1
+    end do
+    !$acc end parallel
+
+    !$acc parallel loop gang reduction(+:sum)
+    do i = 1, n
+       sum = sum + 1
+    end do
+
+    !$acc parallel
+    !$acc loop gang reduction(+:sum)
+    do i = 1, n
+       sum = sum + 1
+    end do
+    !$acc end parallel
+  end subroutine s3
+
+  subroutine s4
+    implicit none
+    !$acc routine worker
+
+    integer, parameter :: n = 100
+    integer :: i, j, sum
+    sum = 0
+
+    !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+    do i = 1, n
+       sum = sum + 1
+    end do
+
+    !$acc loop reduction(+:sum)
+    do i = 1, n
+       !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+       do j = 1, n
+          sum = sum + 1
+       end do
+    end do
+  end subroutine s4
+
+  integer function f3 ()
+    implicit none
+
+    integer, parameter :: n = 100
+    integer :: i, sum
+    sum = 0
+
+    !$acc parallel reduction(+:sum)
+    do i = 1, n
+       sum = sum + 1
+    end do
+    !$acc end parallel
+
+    !$acc parallel loop gang reduction(+:sum)
+    do i = 1, n
+       sum = sum + 1
+    end do
+
+    !$acc parallel
+    !$acc loop gang reduction(+:sum)
+    do i = 1, n
+       sum = sum + 1
+    end do
+    !$acc end parallel
+
+    f3 = sum
+  end function f3
+
+  integer function f4 ()
+    implicit none
+    !$acc routine worker
+
+    integer, parameter :: n = 100
+    integer :: i, j, sum
+    sum = 0
+
+    !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+    do i = 1, n
+       sum = sum + 1
+    end do
+
+    !$acc loop reduction(+:sum)
+    do i = 1, n
+       !$acc loop gang reduction(+:sum) ! { dg-error "gang reduction on an orphan loop" }
+       do j = 1, n
+          sum = sum + 1
+       end do
+    end do
+
+    f4 = sum
+  end function f4
+end module m
diff --git a/gcc/testsuite/gfortran.dg/goacc/parallel-kernels-clauses.f95 b/gcc/testsuite/gfortran.dg/goacc/parallel-kernels-clauses.f95
index c37208c..72ba147 100644
--- a/gcc/testsuite/gfortran.dg/goacc/parallel-kernels-clauses.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/parallel-kernels-clauses.f95
@@ -16,8 +16,8 @@
   !$acc parallel async(i)
   !$acc end parallel
 
-  !$acc kernels async(0, 1) { dg-error "Unclassifiable" }
-  !$acc parallel async(0, 1) { dg-error "Unclassifiable" }
+  !$acc kernels async(0, 1) { dg-error "Failed to match clause" }
+  !$acc parallel async(0, 1) { dg-error "Failed to match clause" }
 
   !$acc kernels async
   !$acc end kernels
@@ -37,11 +37,11 @@
   !$acc kernels async() { dg-error "Invalid character" }
   !$acc parallel async() { dg-error "Invalid character" }
 
-  !$acc kernels async("a") { dg-error "Unclassifiable" }
-  !$acc parallel async("a") { dg-error "Unclassifiable" }
+  !$acc kernels async("a") { dg-error "Failed to match clause" }
+  !$acc parallel async("a") { dg-error "Failed to match clause" }
 
-  !$acc kernels async(.true.) { dg-error "Unclassifiable" }
-  !$acc parallel async(.true.) { dg-error "Unclassifiable" }
+  !$acc kernels async(.true.) { dg-error "Failed to match clause" }
+  !$acc parallel async(.true.) { dg-error "Failed to match clause" }
 
   ! default(none)
   !$acc kernels default(none)
@@ -59,17 +59,17 @@
   !$acc parallel default ( none )
   !$acc end parallel
 
-  !$acc kernels default { dg-error "Unclassifiable" }
-  !$acc parallel default { dg-error "Unclassifiable" }
+  !$acc kernels default { dg-error "Failed to match clause" }
+  !$acc parallel default { dg-error "Failed to match clause" }
 
-  !$acc kernels default() { dg-error "Unclassifiable" }
-  !$acc parallel default() { dg-error "Unclassifiable" }
+  !$acc kernels default() { dg-error "Failed to match clause" }
+  !$acc parallel default() { dg-error "Failed to match clause" }
 
-  !$acc kernels default(i) { dg-error "Unclassifiable" }
-  !$acc parallel default(i) { dg-error "Unclassifiable" }
+  !$acc kernels default(i) { dg-error "Failed to match clause" }
+  !$acc parallel default(i) { dg-error "Failed to match clause" }
 
-  !$acc kernels default(1) { dg-error "Unclassifiable" }
-  !$acc parallel default(1) { dg-error "Unclassifiable" }
+  !$acc kernels default(1) { dg-error "Failed to match clause" }
+  !$acc parallel default(1) { dg-error "Failed to match clause" }
 
   ! Wait
   !$acc kernels wait (l) ! { dg-error "INTEGER" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/parallel-tree.f95 b/gcc/testsuite/gfortran.dg/goacc/parallel-tree.f95
index 2697bb7..aaa1bfd 100644
--- a/gcc/testsuite/gfortran.dg/goacc/parallel-tree.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/parallel-tree.f95
@@ -11,6 +11,9 @@
   !$acc reduction(max:q), copy(i), copyin(j), copyout(k), create(m) &
   !$acc present(o), pcopy(p), pcopyin(r), pcopyout(s), pcreate(t) &
   !$acc deviceptr(u), private(v), firstprivate(w)
+  ! { dg-warning "region is gang partitioned but does not contain gang partitioned code" "" { target *-*-* } .-1 }
+  ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 }
+  ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-3 }
   !$acc end parallel
 
 end program test
diff --git a/gcc/testsuite/gfortran.dg/goacc/parameter.f95 b/gcc/testsuite/gfortran.dg/goacc/parameter.f95
index 8427461..cbe67db 100644
--- a/gcc/testsuite/gfortran.dg/goacc/parameter.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/parameter.f95
@@ -6,7 +6,7 @@
     implicit none
     integer :: i
     integer, parameter :: a = 1
-    !$acc declare device_resident (a) ! { dg-error "PARAMETER" }
+    !$acc declare device_resident (a) ! { dg-error "is not a variable" }
     !$acc data copy (a) ! { dg-error "not a variable" }
     !$acc end data
     !$acc data deviceptr (a) ! { dg-error "not a variable" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr70828.f90 b/gcc/testsuite/gfortran.dg/goacc/pr70828.f90
new file mode 100644
index 0000000..2e58120
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/pr70828.f90
@@ -0,0 +1,22 @@
+! Ensure that pointer mappings are preserved in nested parallel
+! constructs.
+
+! { dg-additional-options "-fdump-tree-gimple" }
+
+program test
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  !$acc data copy(data(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     data(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+end program test
+
+! { dg-final { scan-tree-dump-times "omp target oacc_data map\\(tofrom:MEM\\\[\\(c_char \\*\\)\_\[0-9\]+\\\] \\\[len: _\[0-9\]+\\\]\\) map\\(alloc:data \\\[pointer assign, bias: _\[0-9\]+\\\]\\)" 1 "gimple" } }
+! { dg-final { scan-tree-dump-times "omp target oacc_parallel map\\(force_present:MEM\\\[\\(c_char \\*\\)D\\.\[0-9\]+\\\] \\\[len: D\\.\[0-9\]+\\\]\\) map\\(alloc:data \\\[pointer assign, bias: D\\.\[0-9\]+\\\]\\)" 1 "gimple" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr72741-2.f b/gcc/testsuite/gfortran.dg/goacc/pr72741-2.f
new file mode 100644
index 0000000..ef7f8d3
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/pr72741-2.f
@@ -0,0 +1,39 @@
+      SUBROUTINE v_1
+!$ACC ROUTINE
+!$ACC ROUTINE
+!$ACC ROUTINE GANG ! { dg-error "ACC ROUTINE already applied" }
+!$ACC ROUTINE SEQ
+!$ACC ROUTINE
+!$ACC ROUTINE WORKER ! { dg-error "ACC ROUTINE already applied" }
+      END SUBROUTINE v_1
+
+      SUBROUTINE sub_1
+      IMPLICIT NONE
+      EXTERNAL :: g_1
+!$ACC ROUTINE (g_1) GANG
+!$ACC ROUTINE (g_1) VECTOR ! { dg-error "ACC ROUTINE already applied" }
+!$ACC ROUTINE (g_1) SEQ ! { dg-error "ACC ROUTINE already applied" }
+!$ACC ROUTINE (g_1) ! { dg-error "ACC ROUTINE already applied" }
+!$ACC ROUTINE (g_1) ! { dg-error "ACC ROUTINE already applied" }
+
+      CALL v_1
+      CALL g_1
+      CALL ABORT
+      END SUBROUTINE sub_1
+
+      MODULE m_w_1
+      IMPLICIT NONE
+      EXTERNAL :: w_1
+!$ACC ROUTINE (w_1) WORKER
+!$ACC ROUTINE (w_1) ! { dg-error "ACC ROUTINE already applied" }
+!$ACC ROUTINE (w_1) SEQ ! { dg-error "ACC ROUTINE already applied" }
+!$ACC ROUTINE (w_1) ! { dg-error "ACC ROUTINE already applied" }
+!$ACC ROUTINE (w_1) VECTOR ! { dg-error "ACC ROUTINE already applied" }
+
+      CONTAINS
+      SUBROUTINE sub_2
+      CALL v_1
+      CALL w_1
+      CALL ABORT
+      END SUBROUTINE sub_2
+      END MODULE m_w_1
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-1.f b/gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-1.f
new file mode 100644
index 0000000..f6fe044
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-1.f
@@ -0,0 +1,16 @@
+      SUBROUTINE sub_1
+      IMPLICIT NONE
+!$ACC ROUTINE (ABORT) SEQ VECTOR ! { dg-error "Multiple loop axes specified for routine" }
+
+      CALL ABORT
+      END SUBROUTINE sub_1
+
+      MODULE m_w_1
+      IMPLICIT NONE
+!$ACC ROUTINE (ABORT) VECTOR GANG ! { dg-error "Multiple loop axes specified for routine" }
+
+      CONTAINS
+      SUBROUTINE sub_2
+      CALL ABORT
+      END SUBROUTINE sub_2
+      END MODULE m_w_1
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-2.f b/gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-2.f
new file mode 100644
index 0000000..cd7e511
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/pr72741-intrinsic-2.f
@@ -0,0 +1,22 @@
+! Check for invalid clauses with intrinsic function specified in !$ACC ROUTINE ( NAME ).
+
+      SUBROUTINE sub_1
+      IMPLICIT NONE
+!$ACC ROUTINE (ABORT) WORKER ! { dg-error "Intrinsic symbol specified in \\!\\\$ACC ROUTINE \\( NAME \\) at \\(1\\) marked with incompatible GANG, WORKER, or VECTOR clause" }
+!$ACC ROUTINE (ABORT) GANG ! { dg-error "Intrinsic symbol specified in \\!\\\$ACC ROUTINE \\( NAME \\) at \\(1\\) marked with incompatible GANG, WORKER, or VECTOR clause" }
+!$ACC ROUTINE (ABORT) VECTOR ! { dg-error "Intrinsic symbol specified in \\!\\\$ACC ROUTINE \\( NAME \\) at \\(1\\) marked with incompatible GANG, WORKER, or VECTOR clause" }
+
+      CALL ABORT
+      END SUBROUTINE sub_1
+
+      MODULE m_w_1
+      IMPLICIT NONE
+!$ACC ROUTINE (ABORT) VECTOR ! { dg-error "Intrinsic symbol specified in \\!\\\$ACC ROUTINE \\( NAME \\) at \\(1\\) marked with incompatible GANG, WORKER, or VECTOR clause" }
+!$ACC ROUTINE (ABORT) WORKER ! { dg-error "Intrinsic symbol specified in \\!\\\$ACC ROUTINE \\( NAME \\) at \\(1\\) marked with incompatible GANG, WORKER, or VECTOR clause" }
+!$ACC ROUTINE (ABORT) GANG ! { dg-error "Intrinsic symbol specified in \\!\\\$ACC ROUTINE \\( NAME \\) at \\(1\\) marked with incompatible GANG, WORKER, or VECTOR clause" }
+
+      CONTAINS
+      SUBROUTINE sub_2
+      CALL ABORT
+      END SUBROUTINE sub_2
+      END MODULE m_w_1
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr72741.f90 b/gcc/testsuite/gfortran.dg/goacc/pr72741.f90
new file mode 100644
index 0000000..bbbfb6b
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/pr72741.f90
@@ -0,0 +1,42 @@
+SUBROUTINE v_1
+  !$ACC ROUTINE VECTOR WORKER ! { dg-error "Multiple loop axes" }
+  !$ACC ROUTINE VECTOR
+  !$ACC ROUTINE ! { dg-error "ACC ROUTINE already applied" }
+  !$ACC ROUTINE GANG VECTOR ! { dg-error "Multiple loop axes" }
+END SUBROUTINE v_1
+
+SUBROUTINE v_2
+  !$ACC ROUTINE(v_2) VECTOR WORKER ! { dg-error "Multiple loop axes" }
+  !$ACC ROUTINE(v_2) VECTOR
+  !$ACC ROUTINE(v_2) ! { dg-error "ACC ROUTINE already applied" }
+  !$ACC ROUTINE(v_2) GANG VECTOR ! { dg-error "Multiple loop axes" }
+END SUBROUTINE v_2
+
+SUBROUTINE sub_1
+  IMPLICIT NONE
+  EXTERNAL :: g_1
+  !$ACC ROUTINE (g_1) GANG WORKER ! { dg-error "Multiple loop axes" }
+  !$ACC ROUTINE (g_1) GANG
+  !$ACC ROUTINE (g_1) ! { dg-error "ACC ROUTINE already applied" }
+  !$ACC ROUTINE (g_1) VECTOR GANG ! { dg-error "Multiple loop axes" }
+
+  CALL v_1
+  CALL g_1
+  CALL ABORT
+END SUBROUTINE sub_1
+
+MODULE m_w_1
+  IMPLICIT NONE
+  EXTERNAL :: w_1
+  !$ACC ROUTINE (w_1) WORKER SEQ ! { dg-error "Multiple loop axes" }
+  !$ACC ROUTINE (w_1) WORKER
+  !$ACC ROUTINE (w_1) ! { dg-error "ACC ROUTINE already applied" }
+  !$ACC ROUTINE (w_1) VECTOR WORKER ! { dg-error "Multiple loop axes" }
+
+CONTAINS
+  SUBROUTINE sub_2
+    CALL v_1
+    CALL w_1
+    CALL ABORT
+  END SUBROUTINE sub_2
+END MODULE m_w_1
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr78260-2.f90 b/gcc/testsuite/gfortran.dg/goacc/pr78260-2.f90
new file mode 100644
index 0000000..e28564d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/pr78260-2.f90
@@ -0,0 +1,20 @@
+! { dg-do compile }
+! { dg-options "-fopenacc -fdump-tree-original" }
+! { dg-require-effective-target fopenacc }
+
+! PR fortran/78260
+
+module m
+  implicit none
+  integer :: n = 0
+contains
+  integer function f1()
+    !$acc declare present(f1)
+    !$acc kernels copyin(f1)
+    f1 = 5 
+    !$acc end kernels
+  end function f1
+end module m
+! { dg-final { scan-tree-dump-times "#pragma acc data map\\(force_present:__result_f1\\)" 1 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma acc data map\\(force_present:__result_f1\\)" 1 "original" } }
+
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr78260.f90 b/gcc/testsuite/gfortran.dg/goacc/pr78260.f90
new file mode 100644
index 0000000..21bde85
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/pr78260.f90
@@ -0,0 +1,36 @@
+! { dg-do compile }
+! { dg-options "-fopenacc" }
+! { dg-require-effective-target fopenacc }
+
+! PR fortran/78260
+! Contributed by Gerhard Steinmetz
+
+module m
+  implicit none
+  integer :: n = 0
+contains
+  subroutine s
+    !$acc declare present(m)  ! { dg-error "Object .m. is not a variable" }
+    !$acc kernels copyin(m)   ! { dg-error "Object .m. is not a variable" }
+    n = n + 1
+    !$acc end kernels
+  end subroutine s
+  subroutine s2
+    !$acc declare present(s2)  ! { dg-error "Object .s2. is not a variable" }
+    !$acc kernels copyin(s2)   ! { dg-error "Object .s2. is not a variable" }
+    n = n + 1
+    !$acc end kernels
+  end subroutine s2
+  integer function f1()
+    !$acc declare present(f1)  ! OK, f1 is also the result variable
+    !$acc kernels copyin(f1)   ! OK, f1 is also the result variable
+    f1 = 5 
+    !$acc end kernels
+  end function f1
+  integer function f2() result(res)
+    !$acc declare present(f2)  ! { dg-error "Object .f2. is not a variable" }
+    !$acc kernels copyin(f2)   ! { dg-error "Object .f2. is not a variable" }
+    res = 5 
+    !$acc end kernels
+  end function f2
+end module m
diff --git a/gcc/testsuite/gfortran.dg/goacc/pr85701.f90 b/gcc/testsuite/gfortran.dg/goacc/pr85701.f90
index 9c201b8..bae09de 100644
--- a/gcc/testsuite/gfortran.dg/goacc/pr85701.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/pr85701.f90
@@ -9,11 +9,11 @@
    !$acc declare present(s2) ! { dg-error "is not a variable" }
 end
 
-function f1 ()
+function f1 () result(res)
    !$acc declare copy(f1) ! { dg-error "is not a variable" }
 end
 
-function f2 ()
+function f2 () result(res)
    !$acc declare present(f2) ! { dg-error "is not a variable" }
 end
 
diff --git a/gcc/testsuite/gfortran.dg/goacc/routine-4.f90 b/gcc/testsuite/gfortran.dg/goacc/routine-4.f90
index 6714c7b..3fb60e71 100644
--- a/gcc/testsuite/gfortran.dg/goacc/routine-4.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/routine-4.f90
@@ -123,6 +123,7 @@
     integer, intent (inout) :: a(N)
     integer :: i
 
+    !$acc loop gang worker vector
     do i = 1, N
        a(i) = a(i) - a(i)
     end do
@@ -133,6 +134,7 @@
     integer, intent (inout) :: a(N)
     integer :: i
 
+    !$acc loop worker vector
     do i = 1, N
        a(i) = a(i) - a(i)
     end do
@@ -143,6 +145,7 @@
     integer, intent (inout) :: a(N)
     integer :: i
 
+    !$acc loop vector
     do i = 1, N
        a(i) = a(i) - a(i)
     end do
@@ -153,6 +156,7 @@
     integer, intent (inout) :: a(N)
     integer :: i
 
+    !$acc loop seq
     do i = 1, N
        a(i) = a(i) - a(i)
     end do
diff --git a/gcc/testsuite/gfortran.dg/goacc/routine-6.f90 b/gcc/testsuite/gfortran.dg/goacc/routine-6.f90
index cdf643f..f1e2aa3 100644
--- a/gcc/testsuite/gfortran.dg/goacc/routine-6.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/routine-6.f90
@@ -108,7 +108,7 @@
 end subroutine subr4
 
 subroutine subr10 (x)
-  !$acc routine (subr10) device ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc routine (subr10) device ! { dg-error "Failed to match clause" }
   integer, intent(inout) :: x
   if (x < 1) then
      x = 1
diff --git a/gcc/testsuite/gfortran.dg/goacc/routine-level-of-parallelism-1.f90 b/gcc/testsuite/gfortran.dg/goacc/routine-level-of-parallelism-1.f90
index 75dd1b0..1b41a68 100644
--- a/gcc/testsuite/gfortran.dg/goacc/routine-level-of-parallelism-1.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/routine-level-of-parallelism-1.f90
@@ -2,8 +2,10 @@
 ! parallelism with the OpenACC routine directive.  The Fortran counterpart is
 ! c-c++-common/goacc/routine-level-of-parallelism-2.c
 
-subroutine g_1
+subroutine g_1 ! { dg-warning "region is gang partitioned but does not contain gang partitioned code" }
   !$acc routine gang
+! { dg-bogus "region is worker partitioned but does not contain worker partitioned code" "worker partitioned" { xfail *-*-* } .-2 }
+! { dg-bogus "region is vector partitioned but does not contain vector partitioned code" "worker partitioned" { xfail *-*-* } .-3 }
 end subroutine g_1
 
 subroutine s_1_2a
diff --git a/gcc/testsuite/gfortran.dg/goacc/routine-multiple-directives-1.f90 b/gcc/testsuite/gfortran.dg/goacc/routine-multiple-directives-1.f90
index 6e12ee9..e39f6b5 100644
--- a/gcc/testsuite/gfortran.dg/goacc/routine-multiple-directives-1.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/routine-multiple-directives-1.f90
@@ -12,14 +12,14 @@
 !$ACC ROUTINE(s_2)
       END SUBROUTINE s_2
 
-      SUBROUTINE v_1
+      SUBROUTINE v_1 ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" }
 !$ACC ROUTINE VECTOR
 !$ACC ROUTINE VECTOR
 !$ACC ROUTINE(v_1) VECTOR
 !$ACC ROUTINE VECTOR
       END SUBROUTINE v_1
 
-      SUBROUTINE v_2
+      SUBROUTINE v_2 ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" }
 !$ACC ROUTINE(v_2) VECTOR
 !$ACC ROUTINE VECTOR
 !$ACC ROUTINE(v_2) VECTOR
diff --git a/gcc/testsuite/gfortran.dg/goacc/several-directives.f95 b/gcc/testsuite/gfortran.dg/goacc/several-directives.f95
index 8fb97b5..e7610be 100644
--- a/gcc/testsuite/gfortran.dg/goacc/several-directives.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/several-directives.f95
@@ -2,5 +2,5 @@
 
 program test
   ! only one directive-name may appear in directive
-  !$acc parallel kernels ! { dg-error "Unclassifiable OpenACC directive" }
-end
\ No newline at end of file
+  !$acc parallel kernels ! { dg-error "Failed to match clause" }
+end
diff --git a/gcc/testsuite/gfortran.dg/goacc/sie.f95 b/gcc/testsuite/gfortran.dg/goacc/sie.f95
index abfe28b..bb942bd 100644
--- a/gcc/testsuite/gfortran.dg/goacc/sie.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/sie.f95
@@ -67,7 +67,7 @@
   !$acc end kernels
 
 
-  !$acc parallel num_gangs ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel num_gangs ! { dg-error "Failed to match clause" }
 
   !$acc parallel num_gangs(3)
   !$acc end parallel
@@ -78,10 +78,10 @@
   !$acc parallel num_gangs(i+1)
   !$acc end parallel
 
-  !$acc parallel num_gangs(-1) ! { dg-warning "must be positive" }
+  !$acc parallel num_gangs(-1) ! { dg-error "must be positive" }
   !$acc end parallel
 
-  !$acc parallel num_gangs(0) ! { dg-warning "must be positive" }
+  !$acc parallel num_gangs(0) ! { dg-error "must be positive" }
   !$acc end parallel
 
   !$acc parallel num_gangs() ! { dg-error "Invalid character in name" }
@@ -95,7 +95,7 @@
   !$acc parallel num_gangs("1") ! { dg-error "scalar INTEGER expression" }
   !$acc end parallel
 
-  !$acc kernels num_gangs ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc kernels num_gangs ! { dg-error "Failed to match clause" }
 
   !$acc kernels num_gangs(3)
   !$acc end kernels
@@ -106,10 +106,10 @@
   !$acc kernels num_gangs(i+1)
   !$acc end kernels
 
-  !$acc kernels num_gangs(-1) ! { dg-warning "must be positive" }
+  !$acc kernels num_gangs(-1) ! { dg-error "must be positive" }
   !$acc end kernels
 
-  !$acc kernels num_gangs(0) ! { dg-warning "must be positive" }
+  !$acc kernels num_gangs(0) ! { dg-error "must be positive" }
   !$acc end kernels
 
   !$acc kernels num_gangs() ! { dg-error "Invalid character in name" }
@@ -124,7 +124,7 @@
   !$acc end kernels
 
 
-  !$acc parallel num_workers ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel num_workers ! { dg-error "Failed to match clause" }
 
   !$acc parallel num_workers(3)
   !$acc end parallel
@@ -135,10 +135,10 @@
   !$acc parallel num_workers(i+1)
   !$acc end parallel
 
-  !$acc parallel num_workers(-1) ! { dg-warning "must be positive" }
+  !$acc parallel num_workers(-1) ! { dg-error "must be positive" }
   !$acc end parallel
 
-  !$acc parallel num_workers(0) ! { dg-warning "must be positive" }
+  !$acc parallel num_workers(0) ! { dg-error "must be positive" }
   !$acc end parallel
 
   !$acc parallel num_workers() ! { dg-error "Invalid character in name" }
@@ -152,7 +152,7 @@
   !$acc parallel num_workers("1") ! { dg-error "scalar INTEGER expression" }
   !$acc end parallel
 
-  !$acc kernels num_workers ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc kernels num_workers ! { dg-error "Failed to match clause" }
 
   !$acc kernels num_workers(3)
   !$acc end kernels
@@ -163,10 +163,10 @@
   !$acc kernels num_workers(i+1)
   !$acc end kernels
 
-  !$acc kernels num_workers(-1) ! { dg-warning "must be positive" }
+  !$acc kernels num_workers(-1) ! { dg-error "must be positive" }
   !$acc end kernels
 
-  !$acc kernels num_workers(0) ! { dg-warning "must be positive" }
+  !$acc kernels num_workers(0) ! { dg-error "must be positive" }
   !$acc end kernels
 
   !$acc kernels num_workers() ! { dg-error "Invalid character in name" }
@@ -181,7 +181,7 @@
   !$acc end kernels
 
 
-  !$acc parallel vector_length ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel vector_length ! { dg-error "Failed to match clause" }
 
   !$acc parallel vector_length(3)
   !$acc end parallel
@@ -192,10 +192,10 @@
   !$acc parallel vector_length(i+1)
   !$acc end parallel
 
-  !$acc parallel vector_length(-1) ! { dg-warning "must be positive" }
+  !$acc parallel vector_length(-1) ! { dg-error "must be positive" }
   !$acc end parallel
 
-  !$acc parallel vector_length(0) ! { dg-warning "must be positive" }
+  !$acc parallel vector_length(0) ! { dg-error "must be positive" }
   !$acc end parallel
 
   !$acc parallel vector_length() ! { dg-error "Invalid character in name" }
@@ -209,7 +209,7 @@
   !$acc parallel vector_length("1") ! { dg-error "scalar INTEGER expression" }
   !$acc end parallel
 
-  !$acc kernels vector_length ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc kernels vector_length ! { dg-error "Failed to match clause" }
 
   !$acc kernels vector_length(3)
   !$acc end kernels
@@ -220,10 +220,10 @@
   !$acc kernels vector_length(i+1)
   !$acc end kernels
 
-  !$acc kernels vector_length(-1) ! { dg-warning "must be positive" }
+  !$acc kernels vector_length(-1) ! { dg-error "must be positive" }
   !$acc end kernels
 
-  !$acc kernels vector_length(0) ! { dg-warning "must be positive" }
+  !$acc kernels vector_length(0) ! { dg-error "must be positive" }
   !$acc end kernels
 
   !$acc kernels vector_length() ! { dg-error "Invalid character in name" }
@@ -250,10 +250,10 @@
   !$acc loop gang(i+1)
   do i = 1,10
   enddo
-  !$acc loop gang(-1) ! { dg-warning "must be positive" }
+  !$acc loop gang(-1) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
-  !$acc loop gang(0) ! { dg-warning "must be positive" }
+  !$acc loop gang(0) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
   !$acc loop gang() ! { dg-error "Invalid character in name" }
@@ -282,10 +282,10 @@
   !$acc loop worker(i+1)
   do i = 1,10
   enddo
-  !$acc loop worker(-1) ! { dg-warning "must be positive" }
+  !$acc loop worker(-1) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
-  !$acc loop worker(0) ! { dg-warning "must be positive" }
+  !$acc loop worker(0) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
   !$acc loop worker() ! { dg-error "Invalid character in name" }
@@ -314,10 +314,10 @@
   !$acc loop vector(i+1)
   do i = 1,10
   enddo
-  !$acc loop vector(-1) ! { dg-warning "must be positive" }
+  !$acc loop vector(-1) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
-  !$acc loop vector(0) ! { dg-warning "must be positive" }
+  !$acc loop vector(0) ! { dg-error "must be positive" }
   do i = 1,10
   enddo
   !$acc loop vector() ! { dg-error "Invalid character in name" }
diff --git a/gcc/testsuite/gfortran.dg/goacc/tile-1.f90 b/gcc/testsuite/gfortran.dg/goacc/tile-1.f90
index 3dbabda..9ef7521 100644
--- a/gcc/testsuite/gfortran.dg/goacc/tile-1.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/tile-1.f90
@@ -24,7 +24,7 @@
      end do
   end do 
 
-  !$acc parallel loop tile ! { dg-error "Unclassifiable" }
+  !$acc parallel loop tile ! { dg-error "Failed to match clause" }
   do i = 1, n
   end do
 
@@ -44,17 +44,17 @@
   do i = 1, n
   end do
 
-  !$acc parallel loop tile(-3) ! { dg-warning "must be positive" }
+  !$acc parallel loop tile(-3) ! { dg-error "must be positive" }
   do i = 1, n
   end do
 
-  !$acc parallel loop tile(10, -3) ! { dg-warning "must be positive" }
+  !$acc parallel loop tile(10, -3) ! { dg-error "must be positive" }
   do i = 1, n
      do j = 1, n
      end do
   end do
 
-  !$acc parallel loop tile(-100, 10, 5) ! { dg-warning "must be positive" }
+  !$acc parallel loop tile(-100, 10, 5) ! { dg-error "must be positive" }
   do i = 1, n
      do j = 1, n
         do k = 1, n
@@ -92,7 +92,7 @@
   integer i, j, k
 
   !$acc parallel
-  !$acc loop tile ! { dg-error "Unclassifiable" }
+  !$acc loop tile ! { dg-error "Failed to match clause" }
   do i = 1, n
   end do
 
@@ -114,7 +114,7 @@
      end do
   end do
 
-  !$acc loop tile(-2) ! { dg-warning "must be positive" }
+  !$acc loop tile(-2) ! { dg-error "must be positive" }
   do i = 1, n
   end do
 
@@ -173,7 +173,7 @@
   integer i, j, k
 
   !$acc kernels
-  !$acc loop tile  ! { dg-error "Unclassifiable" }
+  !$acc loop tile  ! { dg-error "Failed to match clause" }
   do i = 1, n
   end do
 
@@ -195,7 +195,7 @@
      end do
   end do
 
-  !$acc loop tile(-2) ! { dg-warning "must be positive" }
+  !$acc loop tile(-2) ! { dg-error "must be positive" }
   do i = 1, n
   end do
 
@@ -275,7 +275,7 @@
      end do
   end do 
 
-  !$acc kernels loop tile ! { dg-error "Unclassifiable" }
+  !$acc kernels loop tile ! { dg-error "Failed to match clause" }
   do i = 1, n
   end do
 
@@ -295,17 +295,17 @@
   do i = 1, n
   end do
 
-  !$acc kernels loop tile(-3) ! { dg-warning "must be positive" }
+  !$acc kernels loop tile(-3) ! { dg-error "must be positive" }
   do i = 1, n
   end do
 
-  !$acc kernels loop tile(10, -3) ! { dg-warning "must be positive" }
+  !$acc kernels loop tile(10, -3) ! { dg-error "must be positive" }
   do i = 1, n
      do j = 1, n
      end do
   end do
 
-  !$acc kernels loop tile(-100, 10, 5) ! { dg-warning "must be positive" }
+  !$acc kernels loop tile(-100, 10, 5) ! { dg-error "must be positive" }
   do i = 1, n
      do j = 1, n
         do k = 1, n
diff --git a/gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95 b/gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95
index 5dea42b..8551140 100644
--- a/gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95
@@ -4,14 +4,20 @@
   implicit none
   integer :: i, j, k
 
-  !$acc parallel num_gangs(i) ! { dg-warning "is used uninitialized in this function" }
-  !$acc end parallel
+  !$acc parallel loop gang num_gangs(i) ! { dg-warning "is used uninitialized in this function" }
+  do i = 0, 1
+  end do
+  !$acc end parallel loop
 
-  !$acc parallel num_workers(j) ! { dg-warning "is used uninitialized in this function" }
-  !$acc end parallel
+  !$acc parallel loop worker num_workers(j) ! { dg-warning "is used uninitialized in this function" }
+  do j = 0, 1
+  end do
+  !$acc end parallel loop
 
-  !$acc parallel vector_length(k) ! { dg-warning "is used uninitialized in this function" }
-  !$acc end parallel
+  !$acc parallel loop vector vector_length(k) ! { dg-warning "is used uninitialized in this function" }
+  do k = 0, 1
+  end do
+  !$acc end parallel loop
 end subroutine acc_parallel
 
 subroutine acc_kernels
diff --git a/gcc/testsuite/gfortran.dg/goacc/update-if_present-2.f90 b/gcc/testsuite/gfortran.dg/goacc/update-if_present-2.f90
index e73c2dc..bf8b319 100644
--- a/gcc/testsuite/gfortran.dg/goacc/update-if_present-2.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/update-if_present-2.f90
@@ -2,7 +2,7 @@
 
 subroutine t1
   implicit none
-  !$acc routine gang if_present ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc routine gang if_present ! { dg-error "Failed to match clause" }
   integer a, b, c(10)
   real, allocatable :: x, y, z(:)
 
@@ -12,10 +12,10 @@
 
   allocate (x, y, z(100))
 
-  !$acc enter data copyin(a) if_present ! { dg-error "Unclassifiable OpenACC directive" }
-  !$acc exit data copyout(a) if_present ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc enter data copyin(a) if_present ! { dg-error "Failed to match clause" }
+  !$acc exit data copyout(a) if_present ! { dg-error "Failed to match clause" }
 
-  !$acc data copy(a) if_present ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc data copy(a) if_present ! { dg-error "Failed to match clause" }
   !$acc end data ! { dg-error "Unexpected ..ACC END DATA statement" }
 
   !$acc declare link(a) if_present ! { dg-error "Unexpected junk after" }
@@ -23,7 +23,7 @@
   !$acc init if_present ! { dg-error "Unclassifiable OpenACC directive" }
   !$acc shutdown if_present ! { dg-error "Unclassifiable OpenACC directive" }
   
-  !$acc update self(a) device_type(nvidia) device(b) if_present ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc update self(a) device_type(nvidia) device(b) if_present ! { dg-error "Failed to match clause" }
 end subroutine t1
 
 subroutine t2
@@ -35,17 +35,17 @@
   c(:) = -1
 
   !$acc parallel
-  !$acc loop if_present ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc loop if_present ! { dg-error "Failed to match clause" }
   do b = 1, 10
   end do
   !$acc end parallel
 
-  !$acc kernels loop if_present ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc kernels loop if_present ! { dg-error "Failed to match clause" }
   do b = 1, 10
   end do
   !$acc end kernels loop ! { dg-error "Unexpected ..ACC END KERNELS LOOP statement" }
 
-  !$acc parallel loop if_present ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc parallel loop if_present ! { dg-error "Failed to match clause" }
   do b = 1, 10
   end do
   !$acc end parallel loop   ! { dg-error "Unexpected ..ACC END PARALLEL LOOP statement" }
diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-simd-1.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-simd-1.f90
index d6ae7c9..40169d3 100644
--- a/gcc/testsuite/gfortran.dg/gomp/declare-simd-1.f90
+++ b/gcc/testsuite/gfortran.dg/gomp/declare-simd-1.f90
@@ -2,7 +2,7 @@
 
 subroutine fn1 (x)
   integer :: x
-!$omp declare simd (fn1) inbranch notinbranch uniform (x) ! { dg-error "Unclassifiable OpenMP directive" }
+!$omp declare simd (fn1) inbranch notinbranch uniform (x) ! { dg-error "Failed to match clause" }
 end subroutine fn1
 subroutine fn2 (x)
 !$omp declare simd (fn100)	! { dg-error "should refer to containing procedure" }
diff --git a/gcc/testsuite/gfortran.dg/gomp/pr29759.f90 b/gcc/testsuite/gfortran.dg/gomp/pr29759.f90
index 39c59c5..1b1f379 100644
--- a/gcc/testsuite/gfortran.dg/gomp/pr29759.f90
+++ b/gcc/testsuite/gfortran.dg/gomp/pr29759.f90
@@ -21,20 +21,20 @@
 !$OMP END PARALLEL
 
 
-!$OMP PARALLEL &		! { dg-error "Unclassifiable OpenMP" }
+!$OMP PARALLEL &		! { dg-error "Failed to match clause" }
 !$    NUM_THREADS(2)
 !$OMP END PARALLEL		! { dg-error "Unexpected" }
 
-!$OMP PARALLEL &		! { dg-error "Unclassifiable OpenMP" }
+!$OMP PARALLEL &		! { dg-error "Failed to match clause" }
 !$    & NUM_THREADS(2)		! { dg-error "Invalid character" }
 !$OMP END PARALLEL		! { dg-error "Unexpected" }
 
-!$OMP PARALLEL &		! { dg-error "Unclassifiable OpenMP" }
+!$OMP PARALLEL &		! { dg-error "Failed to match clause" }
 !
 !$    NUM_THREADS(2)
 !$OMP END PARALLEL		! { dg-error "Unexpected" }
 
-!$OMP PARALLEL &		! { dg-error "Unclassifiable OpenMP" }
+!$OMP PARALLEL &		! { dg-error "Failed to match clause" }
 !
 !$    & NUM_THREADS(2)		! { dg-error "Invalid character" }
 !$OMP END PARALLEL		! { dg-error "Unexpected" }
diff --git a/gcc/testsuite/gfortran.dg/gomp/pr77516.f90 b/gcc/testsuite/gfortran.dg/gomp/pr77516.f90
index 9c0a95b..3ac3f55 100644
--- a/gcc/testsuite/gfortran.dg/gomp/pr77516.f90
+++ b/gcc/testsuite/gfortran.dg/gomp/pr77516.f90
@@ -4,7 +4,7 @@
 program pr77516
    integer :: i, x
    x = 0
-!$omp simd safelen(0) reduction(+:x)	! { dg-warning "must be positive" }
+!$omp simd safelen(0) reduction(+:x)	! { dg-error "must be positive" }
    do i = 1, 8
       x = x + 1
    end do
diff --git a/gcc/testsuite/gfortran.dg/gomp/pr78260-2.f90 b/gcc/testsuite/gfortran.dg/gomp/pr78260-2.f90
new file mode 100644
index 0000000..c58ad93
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/pr78260-2.f90
@@ -0,0 +1,59 @@
+! { dg-do compile }
+! { dg-options "-fopenmp -fdump-tree-original" }
+
+! PR fortran/78260
+
+module m
+  implicit none
+  integer :: n = 0
+contains
+  integer function f1()
+    !$omp target data map(f1)
+    !$omp target update to(f1)
+    f1 = 5 
+    !$omp end target data
+  end function f1
+
+  integer function f2()
+    dimension :: f2(1)
+    !$omp target data map(f2)
+    !$omp target update to(f2)
+    f2(1) = 5 
+    !$omp end target data
+  end function f2
+
+  integer function f3() result(res)
+    dimension :: res(1)
+    !$omp target data map(res)
+    !$omp target update to(res)
+    res(1) = 5 
+    !$omp end target data
+  end function f3
+
+  integer function f4() result(res)
+    allocatable :: res
+    dimension :: res(:)
+    !$omp target data map(res)
+    !$omp target update to(res)
+    res = [5]
+    !$omp end target data
+  end function f4
+
+  subroutine sub()
+    integer, allocatable :: arr(:)
+    !$omp target data map(arr)
+    !$omp target update to(arr)
+    arr = [5]
+    !$omp end target data
+  end subroutine sub
+end module m
+
+! { dg-final { scan-tree-dump-times "#pragma omp target data map\\(tofrom:\\*\\(c_char \\*\\) arr.data \\\[len: D.\[0-9\]+ \\* 4\\\]\\) map\\(to:arr \\\[pointer set, len: ..\\\]\\) map\\(alloc:\\(integer\\(kind=4\\)\\\[0:\\\] \\* restrict\\) arr.data \\\[pointer assign, bias: 0\\\]\\)" 1 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target update to\\(\\*\\(c_char \\*\\) arr.data \\\[len: D.\[0-9\]+ \\* 4\\\]\\)" 1 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target data map\\(tofrom:\\*\\(c_char \\*\\) __result->data \\\[len: D.\[0-9\]+ \\* 4\\\]\\) map\\(to:\\*__result \\\[pointer set, len: ..\\\]\\) map\\(alloc:\\(integer\\(kind=4\\)\\\[0:\\\] \\* restrict\\) __result->data \\\[pointer assign, bias: 0\\\]\\) map\\(alloc:__result \\\[pointer assign, bias: 0\\\]\\)" 1 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target update to\\(\\*\\(c_char \\*\\) __result->data \\\[len: D.\[0-9\]+ \\* 4\\\]\\)" 1 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target data map\\(tofrom:\\*__result.0\\) map\\(alloc:__result.0 \\\[pointer assign, bias: 0\\\]\\)" 2 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target update to\\(\\*__result.0\\)" 2 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target data map\\(tofrom:__result_f1\\)" 1 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp target update to\\(__result_f1\\)" 1 "original" } }
+
diff --git a/gcc/testsuite/gfortran.dg/gomp/pr78260-3.f90 b/gcc/testsuite/gfortran.dg/gomp/pr78260-3.f90
new file mode 100644
index 0000000..4ca3e36
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/pr78260-3.f90
@@ -0,0 +1,74 @@
+! { dg-do compile }
+! { dg-options "-fopenmp -fdump-tree-original" }
+
+! PR fortran/78260
+
+integer function f1()
+  implicit none
+
+  f1 = 0
+
+  !$omp task depend(inout:f1)
+  !$omp end task
+
+  !$omp task depend(inout:f1)
+  !$omp end task
+end function f1
+
+integer function f2()
+  implicit none
+  dimension :: f2(1)
+
+  f2(1) = 0
+
+  !$omp task depend(inout:f2)
+  !$omp end task
+
+  !$omp task depend(inout:f2)
+  !$omp end task
+end function f2
+
+integer function f3() result(res)
+  implicit none
+  dimension :: res(1)
+
+  res(1) = 0
+
+  !$omp task depend(inout:res)
+  !$omp end task
+
+  !$omp task depend(inout:res)
+  !$omp end task
+end function f3
+
+integer function f4() result(res)
+  implicit none
+  allocatable :: res
+  dimension :: res(:)
+
+  res = [0]
+
+  !$omp task depend(inout:res)
+  !$omp end task
+
+  !$omp task depend(inout:res)
+  !$omp end task
+end function f4
+
+subroutine sub()
+  implicit none
+  integer, allocatable :: arr(:)
+
+  arr = [3]
+
+  !$omp task depend(inout:arr)
+  !$omp end task
+
+  !$omp task depend(inout:arr)
+  !$omp end task
+end subroutine sub
+
+! { dg-final { scan-tree-dump-times "#pragma omp task depend\\(inout:__result_f1\\)" 2 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp task depend\\(inout:\\*__result.0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp task depend\\(inout:\\*\\(c_char \\*\\) __result->data\\)" 2 "original" } }
+! { dg-final { scan-tree-dump-times "#pragma omp task depend\\(inout:\\*\\(c_char \\*\\) arr.data\\)" 2 "original" } }
diff --git a/gcc/testsuite/gfortran.dg/gomp/pr78260.f90 b/gcc/testsuite/gfortran.dg/gomp/pr78260.f90
new file mode 100644
index 0000000..23acd4c
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/pr78260.f90
@@ -0,0 +1,33 @@
+! { dg-do compile }
+
+! PR fortran/78260
+
+module m
+  implicit none
+  integer :: n = 0
+contains
+  subroutine s
+    !$omp target data map(m)   ! { dg-error "Object .m. is not a variable" }
+    !$omp target update to(m)  ! { dg-error "Object .m. is not a variable" }
+    n = n + 1
+    !$omp end target data
+  end subroutine s
+  subroutine s2
+    !$omp target data map(s2)   ! { dg-error "Object .s2. is not a variable" }
+    !$omp target update to(s2)  ! { dg-error "Object .s2. is not a variable" }
+    n = n + 1
+    !$omp end target data
+  end subroutine s2
+  integer function f1()
+    !$omp target data map(f1)   ! OK, f1 is also the result variable
+    !$omp target update to(f1)  ! OK, f1 is also the result variable
+    f1 = 5 
+    !$omp end target data
+  end function f1
+  integer function f2() result(res)
+    !$omp target data map(f2)   ! { dg-error "Object .f2. is not a variable" }
+    !$omp target update to(f2)  ! { dg-error "Object .f2. is not a variable" }
+    res = 5 
+    !$omp end target data
+  end function f2
+end module m
diff --git a/gcc/testsuite/gfortran.dg/gomp/target-simd.f90 b/gcc/testsuite/gfortran.dg/gomp/target-simd.f90
new file mode 100644
index 0000000..733420f
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/target-simd.f90
@@ -0,0 +1,26 @@
+! { dg-do compile }
+
+program test
+  implicit none
+  real, allocatable :: a(:), b(:)
+  integer :: i
+
+  a = [(i, i = 1, 100)]
+  allocate(b, mold=a)
+  b = 0
+
+  !$omp target simd map(to:a) map(from:b)
+  do i = 0, size(a)
+    b(i) = 5.0 * a(i)
+  end do
+
+  if (any (b - 5.0 *a > 10.0*epsilon(a))) call abort()
+
+  !$omp target simd map(to:a) map(from:b)
+  do i = 0, size(a)
+    b(i) = 2.0 * a(i)
+  end do
+  !$omp end target simd
+
+  if (any (b - 2.0 *a > 10.0*epsilon(a))) call abort()
+end program test
diff --git a/gcc/testsuite/gfortran.dg/gomp/use_device_ptr1.f90 b/gcc/testsuite/gfortran.dg/gomp/use_device_ptr1.f90
new file mode 100644
index 0000000..b8c56b5
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/use_device_ptr1.f90
@@ -0,0 +1,102 @@
+! { dg-do compile }
+! { dg-options "-fopenmp -fdump-tree-optimized -O0" }
+
+! Check use_device_ptr with local variables
+
+module offloading
+  use iso_c_binding
+  implicit none
+  interface
+    subroutine copy3_array_data(from, to, N) bind(C)
+      import :: c_ptr
+      type(c_ptr), value :: from, to
+      integer, value :: N
+    end subroutine copy3_array_data
+  end interface
+end module offloading
+
+subroutine omp_device_ptr()
+  use iso_c_binding
+  use offloading
+  implicit none
+
+  integer, parameter :: N = 1000
+  real(c_double), pointer :: AA(:), BBB(:)
+  real(c_double), allocatable, target :: CC(:), DD(:)
+  real(c_double), target :: EE(N), FF(N), dummy(1)
+
+  ! allocate(AA(N), BBB(N), CC(N), DD(N))  ! make dump more readable
+
+  ! AA = 11.0_c_double
+  ! BBB = 22.0_c_double
+  ! CC = 33.0_c_double
+  ! DD = 44.0_c_double
+  ! EE = 55.0_c_double
+  ! FF = 66.0_c_double
+
+  ! NOTE: OpenMP 5's use_device_addr is (at time of writing) not yet supported
+
+  ! pointer-type array to use_device_ptr
+  ! !$omp target data map(to:AA) map(from:BBB) use_device_ptr(AA,BBB)
+  !$omp target data map(alloc:dummy) use_device_ptr(AA,BBB)
+  call copy3_array_data(c_loc(AA), c_loc(BBB), N)
+  !$omp end target data
+
+  ! allocatable array to use_device_ptr
+  !!$omp target data map(to:CC) map(from:DD) use_device_ptr(CC,DD)
+  !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+  call copy3_array_data(c_loc(CC), c_loc(DD), N)
+  !$omp end target data
+
+  ! fixed-size decriptorless array to use_device_ptr
+  !!$omp target data map(to:EE) map(from:FF) use_device_ptr(EE,FF)
+  !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+  call copy3_array_data(c_loc(EE), c_loc(FF), N)
+  !$omp end target data
+
+  ! deallocate(AA, BBB)  ! Free all pointers, only
+end subroutine omp_device_ptr
+
+
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.aa = &aa" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.aa = aa" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.bbb = &bbb" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.bbb = bbb" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.cc = &cc" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.cc = cc" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.dd = &dd" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.dd = dd" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.ee = ee" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.ff = ff" "optimized" } }
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.ee = &ee" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.ff = &ff" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = aa.data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa\.\[0-9\]+ = aa;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa\.\[0-9\]+\.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = aa\.\[0-9\]+\.data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = bbb.data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb\.\[0-9\]+ = bbb;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb\.\[0-9\]+\.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = bbb\.\[0-9\]+\.data;" 1 "optimized" } }
+
+! '3' because of automatic deallocation
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = cc.data;" 3 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc\.\[0-9\]+ = cc;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc\.\[0-9\]+\.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = cc\.\[0-9\]+\.data;" 1 "optimized" } }
+
+! '3' because of automatic deallocation
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = dd.data;" 3 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd\.\[0-9\]+ = dd;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd\.\[0-9\]+\.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = dd\.\[0-9\]+\.data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = .omp_data_arr.\[0-9\]+.ee;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "ee\.\[0-9\]+_\[0-9\]+ = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = &\\*ee.\[0-9\]+_\[0-9\]+;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = .omp_data_arr.\[0-9\]+.ff;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "ff\.\[0-9\]+_\[0-9\]+ = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = &\\*ff.\[0-9\]+_\[0-9\]+;" 1 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/gomp/use_device_ptr2.f90 b/gcc/testsuite/gfortran.dg/gomp/use_device_ptr2.f90
new file mode 100644
index 0000000..c2b837a
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/use_device_ptr2.f90
@@ -0,0 +1,107 @@
+! { dg-do compile }
+! { dg-options "-fopenmp -fdump-tree-optimized -O0" }
+
+! Check use_device_ptr with nonoptional dummy arguments
+
+module offloading
+  use iso_c_binding
+  implicit none
+  interface
+    subroutine copy3_array_data(from, to, N) bind(C)
+      import :: c_ptr
+      type(c_ptr), value :: from, to
+      integer, value :: N
+    end subroutine copy3_array_data
+  end interface
+end module offloading
+
+subroutine omp_device_ptr(AA, BBB, CC, DD, EE, FF, N)
+  use iso_c_binding
+  use offloading
+  implicit none
+
+  integer, value :: N
+  real(c_double), pointer :: AA(:), BBB(:)
+  real(c_double), allocatable, target :: CC(:), DD(:)
+  real(c_double), target :: EE(N), FF(N), dummy(1)
+
+  ! allocate(AA(N), BBB(N), CC(N), DD(N))  ! make dump more readable
+
+  ! AA = 11.0_c_double
+  ! BBB = 22.0_c_double
+  ! CC = 33.0_c_double
+  ! DD = 44.0_c_double
+  ! EE = 55.0_c_double
+  ! FF = 66.0_c_double
+
+  ! NOTE: OpenMP 5's use_device_addr is (at time of writing) not yet supported
+
+  ! pointer-type array to use_device_ptr
+  ! !$omp target data map(to:AA) map(from:BBB) use_device_ptr(AA,BBB)
+  !$omp target data map(alloc:dummy) use_device_ptr(AA,BBB)
+  call copy3_array_data(c_loc(AA), c_loc(BBB), N)
+  !$omp end target data
+
+  ! allocatable array to use_device_ptr
+  !!$omp target data map(to:CC) map(from:DD) use_device_ptr(CC,DD)
+  !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+  call copy3_array_data(c_loc(CC), c_loc(DD), N)
+  !$omp end target data
+
+  ! fixed-size decriptorless array to use_device_ptr
+  !!$omp target data map(to:EE) map(from:FF) use_device_ptr(EE,FF)
+  !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+  call copy3_array_data(c_loc(EE), c_loc(FF), N)
+  !$omp end target data
+
+  ! deallocate(AA, BBB)  ! Free all pointers, only
+end subroutine omp_device_ptr
+
+! { dg-final { scan-tree-dump-not ".omp_data_arr.aa" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.aa" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.bbb" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.bbb" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.cc" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.cc" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.dd" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.dd" "optimized" } }
+
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.D.\[0-9\]+ = _\[0-9\]+" 4 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = .omp_data_arr.\[0-9\]+.D.\[0-9\]+" 4 "optimized" } }
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.ee = ee_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.ff = ff_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = .omp_data_arr.\[0-9\]+.ee;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = aa_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*aa_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa.\[0-9\]+_\[0-9\]+ = &aa.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = aa.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = bbb_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*bbb_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb.\[0-9\]+_\[0-9\]+ = &bbb.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = bbb.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = cc_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*cc_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc.\[0-9\]+_\[0-9\]+ = &cc.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = cc.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = dd_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*dd_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd.\[0-9\]+_\[0-9\]+ = &dd.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = dd.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "ee.\[0-9\]+_\[0-9\]+ = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = ee.\[0-9\]+_\[0-9\]+" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "ff.\[0-9\]+_\[0-9\]+ = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = ff.\[0-9\]+_\[0-9\]+" 1 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/gomp/use_device_ptr3.f90 b/gcc/testsuite/gfortran.dg/gomp/use_device_ptr3.f90
new file mode 100644
index 0000000..7ce0ca6
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/use_device_ptr3.f90
@@ -0,0 +1,108 @@
+! { dg-do compile }
+! { dg-options "-fopenmp -fdump-tree-optimized -O0" }
+
+! Check use_device_ptr with optional dummy arguments
+
+module offloading
+  use iso_c_binding
+  implicit none
+  interface
+    subroutine copy3_array_data(from, to, N) bind(C)
+      import :: c_ptr
+      type(c_ptr), value :: from, to
+      integer, value :: N
+    end subroutine copy3_array_data
+  end interface
+end module offloading
+
+subroutine omp_device_ptr(AA, BBB, CC, DD, EE, FF, N)
+  use iso_c_binding
+  use offloading
+  implicit none
+
+  integer, value :: N
+  real(c_double), optional, pointer :: AA(:), BBB(:)
+  real(c_double), optional, allocatable, target :: CC(:), DD(:)
+  real(c_double), optional, target :: EE(N), FF(N)
+  real(c_double) :: dummy(1)
+
+  ! allocate(AA(N), BBB(N), CC(N), DD(N))  ! make dump more readable
+
+  ! AA = 11.0_c_double
+  ! BBB = 22.0_c_double
+  ! CC = 33.0_c_double
+  ! DD = 44.0_c_double
+  ! EE = 55.0_c_double
+  ! FF = 66.0_c_double
+
+  ! NOTE: OpenMP 5's use_device_addr is (at time of writing) not yet supported
+
+  ! pointer-type array to use_device_ptr
+  ! !$omp target data map(to:AA) map(from:BBB) use_device_ptr(AA,BBB)
+  !$omp target data map(alloc:dummy) use_device_ptr(AA,BBB)
+  call copy3_array_data(c_loc(AA), c_loc(BBB), N)
+  !$omp end target data
+
+  ! allocatable array to use_device_ptr
+  !!$omp target data map(to:CC) map(from:DD) use_device_ptr(CC,DD)
+  !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+  call copy3_array_data(c_loc(CC), c_loc(DD), N)
+  !$omp end target data
+
+  ! fixed-size decriptorless array to use_device_ptr
+  !!$omp target data map(to:EE) map(from:FF) use_device_ptr(EE,FF)
+  !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+  call copy3_array_data(c_loc(EE), c_loc(FF), N)
+  !$omp end target data
+
+  ! deallocate(AA, BBB)  ! Free all pointers, only
+end subroutine omp_device_ptr
+
+! { dg-final { scan-tree-dump-not ".omp_data_arr.aa" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.aa" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.bbb" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.bbb" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.cc" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.cc" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.dd" "optimized" } }
+! { dg-final { scan-tree-dump-not ".omp_data_arr.\[0-9\]+.dd" "optimized" } }
+
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.D.\[0-9\]+ = _\[0-9\]+" 4 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = .omp_data_arr.\[0-9\]+.D.\[0-9\]+" 4 "optimized" } }
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.ee = ee_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times ".omp_data_arr.\[0-9\]+.ff = ff_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = .omp_data_arr.\[0-9\]+.ee;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = aa_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*aa_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "aa.\[0-9\]+_\[0-9\]+ = &aa.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = aa.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = bbb_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*bbb_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "bbb.\[0-9\]+_\[0-9\]+ = &bbb.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = bbb.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = cc_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*cc_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "cc.\[0-9\]+_\[0-9\]+ = &cc.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = cc.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = dd_\[0-9\]+.D.->data;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "D.\[0-9\]+ = \\*dd_\[0-9\]+.D.;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd.\[0-9\]+ = D.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd.\[0-9\]+.data = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "dd.\[0-9\]+_\[0-9\]+ = &dd.\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = dd.\[0-9\]+_\[0-9\]+->data;" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "ee.\[0-9\]+_\[0-9\]+ = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = ee.\[0-9\]+_\[0-9\]+" 1 "optimized" } }
+
+! { dg-final { scan-tree-dump-times "ff.\[0-9\]+_\[0-9\]+ = _\[0-9\]+;" 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "_\[0-9\]+ = ff.\[0-9\]+_\[0-9\]+" 1 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/openacc-define-3.f90 b/gcc/testsuite/gfortran.dg/openacc-define-3.f90
index b6c296e..dcc52b6 100644
--- a/gcc/testsuite/gfortran.dg/openacc-define-3.f90
+++ b/gcc/testsuite/gfortran.dg/openacc-define-3.f90
@@ -6,6 +6,6 @@
 # error _OPENACC not defined
 #endif
 
-#if _OPENACC != 201306
+#if _OPENACC != 201711
 # error _OPENACC defined to wrong value
 #endif
diff --git a/gcc/tree-core.h b/gcc/tree-core.h
index 41d0529..b51fe32 100644
--- a/gcc/tree-core.h
+++ b/gcc/tree-core.h
@@ -255,7 +255,9 @@
                 placeholder used in OMP_CLAUSE_REDUCTION_{INIT,MERGE}.
      Operand 4: OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER: Another dummy
 		VAR_DECL placeholder, used like the above for C/C++ array
-		reductions.  */
+		reductions.
+     Operand 5: OMP_CLAUSE_REDUCTION_PRIVATE_DECL: A private VAR_DECL of
+                the original DECL associated with the reduction clause.  */
   OMP_CLAUSE_REDUCTION,
 
   /* OpenMP clause: task_reduction (operator:variable_list).  */
@@ -461,6 +463,9 @@
   /* OpenACC clause: vector_length (integer-expression).  */
   OMP_CLAUSE_VECTOR_LENGTH,
 
+  /* OpenACC clause: nohost.  */
+  OMP_CLAUSE_NOHOST,
+
   /* OpenACC clause: tile ( size-expr-list ).  */
   OMP_CLAUSE_TILE,
 
diff --git a/gcc/tree-nested.c b/gcc/tree-nested.c
index 3fe23cc..76071cc 100644
--- a/gcc/tree-nested.c
+++ b/gcc/tree-nested.c
@@ -1350,6 +1350,8 @@
 	case OMP_CLAUSE_FINALIZE:
 	  break;
 
+	  /* OpenACC nohost clause is not yet handled here.  */
+	case OMP_CLAUSE_NOHOST:
 	  /* The following clause belongs to the OpenACC cache directive, which
 	     is discarded during gimplification.  */
 	case OMP_CLAUSE__CACHE_:
@@ -2077,6 +2079,8 @@
 	case OMP_CLAUSE_FINALIZE:
 	  break;
 
+	  /* OpenACC nohost clauses is not yet handled here.  */
+	case OMP_CLAUSE_NOHOST:
 	  /* The following clause belongs to the OpenACC cache directive, which
 	     is discarded during gimplification.  */
 	case OMP_CLAUSE__CACHE_:
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index f6c230b..5a95beb 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -412,10 +412,13 @@
 extern gimple_opt_pass *make_pass_lower_vector (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_vector_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_omp (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_convert_oacc_kernels (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_diagnose_omp_blocks (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_oacc_loop_designation (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_oacc_gimple_workers (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt);
diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c
index ae0a6c1..3a17079 100644
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@@ -765,6 +765,9 @@
 	case GOMP_MAP_POINTER:
 	  pp_string (pp, "alloc");
 	  break;
+	case GOMP_MAP_NO_ALLOC:
+	  pp_string (pp, "no_alloc");
+	  break;
 	case GOMP_MAP_TO:
 	case GOMP_MAP_TO_PSET:
 	  pp_string (pp, "to");
@@ -826,6 +829,51 @@
 	case GOMP_MAP_LINK:
 	  pp_string (pp, "link");
 	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_TO:
+	  pp_string (pp, "to,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_FROM:
+	  pp_string (pp, "from,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_TOFROM:
+	  pp_string (pp, "tofrom,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_FORCE_TO:
+	  pp_string (pp, "force_to,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_FORCE_FROM:
+	  pp_string (pp, "force_from,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_FORCE_TOFROM:
+	  pp_string (pp, "force_tofrom,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_ALLOC:
+	  pp_string (pp, "alloc,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_FORCE_ALLOC:
+	  pp_string (pp, "force_alloc,dynamic_array");
+	  break;
+	case GOMP_MAP_DYNAMIC_ARRAY_FORCE_PRESENT:
+	  pp_string (pp, "force_present,dynamic_array");
+	  break;
+	case GOMP_MAP_DECLARE_ALLOCATE:
+	  pp_string (pp, "declare_allocate");
+	  break;
+	case GOMP_MAP_DECLARE_DEALLOCATE:
+	  pp_string (pp, "declare_deallocate");
+	  break;
+	case GOMP_MAP_ATTACH:
+	  pp_string (pp, "attach");
+	  break;
+	case GOMP_MAP_DETACH:
+	  pp_string (pp, "detach");
+	  break;
+	case GOMP_MAP_FORCE_DETACH:
+	  pp_string (pp, "force_detach");
+	  break;
+	case GOMP_MAP_ATTACH_DETACH:
+	  pp_string (pp, "attach_detach");
+	  break;
 	default:
 	  gcc_unreachable ();
 	}
@@ -847,6 +895,16 @@
 	    case GOMP_MAP_TO_PSET:
 	      pp_string (pp, " [pointer set, len: ");
 	      break;
+	    case GOMP_MAP_DYNAMIC_ARRAY:
+	      gcc_assert (TREE_CODE (OMP_CLAUSE_SIZE (clause)) == TREE_LIST);
+	      pp_string (pp, " [dimensions: ");
+	      break;
+	    case GOMP_MAP_ATTACH:
+	    case GOMP_MAP_DETACH:
+	    case GOMP_MAP_FORCE_DETACH:
+	    case GOMP_MAP_ATTACH_DETACH:
+	      pp_string (pp, " [bias: ");
+	      break;
 	    default:
 	      pp_string (pp, " [len: ");
 	      break;
@@ -1160,6 +1218,9 @@
 			 spc, flags, false);
       pp_right_paren (pp);
       break;
+    case OMP_CLAUSE_NOHOST:
+      pp_string (pp, "nohost");
+      break;
 
     case OMP_CLAUSE__GRIDDIM_:
       pp_string (pp, "_griddim_(");
@@ -3077,6 +3138,10 @@
       pp_string (pp, "#pragma acc kernels");
       goto dump_omp_clauses_body;
 
+    case OACC_SERIAL:
+      pp_string (pp, "#pragma acc serial");
+      goto dump_omp_clauses_body;
+
     case OACC_DATA:
       pp_string (pp, "#pragma acc data");
       dump_omp_clauses (pp, OACC_DATA_CLAUSES (node), spc, flags);
diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index 0ea0b46..2ce070e 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -43,6 +43,8 @@
 #include "stringpool.h"
 #include "attribs.h"
 #include "tree-ssa.h"
+#include "target.h"
+#include "gomp-constants.h"
 
 /* The idea behind this analyzer is to generate set constraints from the
    program, then solve the resulting constraints in order to generate the
@@ -4715,6 +4717,7 @@
 	  if (in_ipa_mode)
 	    {
 	      unsigned int fnpos, argpos;
+	      bool oacc_exploded_parallel = false;
 	      switch (DECL_FUNCTION_CODE (fndecl))
 		{
 		case BUILT_IN_GOMP_PARALLEL:
@@ -4727,11 +4730,27 @@
 					       sizes, kinds, ...).  */
 		  fnpos = 1;
 		  argpos = 3;
+		  if (targetm.goacc.explode_args ())
+		    for (unsigned int i = 6; i < gimple_call_num_args (t); i++)
+		      {
+		        tree arg = gimple_call_arg (t, i);
+			if (TREE_CODE (arg) == INTEGER_CST
+			    && (tree_to_shwi (arg)
+				== GOMP_LAUNCH_PACK (GOMP_LAUNCH_ARGS_EXPLODED,
+						     0, 0)))
+			  {
+			    oacc_exploded_parallel = true;
+			    break;
+			  }
+		      }
 		  break;
 		default:
 		  gcc_unreachable ();
 		}
 
+	      if (oacc_exploded_parallel)
+	        break;
+
 	      tree fnarg = gimple_call_arg (t, fnpos);
 	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
 	      tree fndecl = TREE_OPERAND (fnarg, 0);
@@ -5275,6 +5294,7 @@
 	      unsigned int fnpos, argpos;
 	      unsigned int implicit_use_args[2];
 	      unsigned int num_implicit_use_args = 0;
+	      bool oacc_exploded_parallel = false;
 	      switch (DECL_FUNCTION_CODE (decl))
 		{
 		case BUILT_IN_GOMP_PARALLEL:
@@ -5289,11 +5309,27 @@
 		  argpos = 3;
 		  implicit_use_args[num_implicit_use_args++] = 4;
 		  implicit_use_args[num_implicit_use_args++] = 5;
+		  if (targetm.goacc.explode_args ())
+		    for (unsigned int i = 6; i < gimple_call_num_args (t); i++)
+		      {
+		        tree arg = gimple_call_arg (t, i);
+			if (TREE_CODE (arg) == INTEGER_CST
+			    && (tree_to_shwi (arg)
+				== GOMP_LAUNCH_PACK (GOMP_LAUNCH_ARGS_EXPLODED,
+						     0, 0)))
+			  {
+			    oacc_exploded_parallel = true;
+			    break;
+			  }
+		      }
 		  break;
 		default:
 		  gcc_unreachable ();
 		}
 
+	      if (oacc_exploded_parallel)
+		break;
+
 	      tree fnarg = gimple_call_arg (t, fnpos);
 	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
 	      tree fndecl = TREE_OPERAND (fnarg, 0);
diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index 2140101..8ed4adf 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -2130,6 +2130,16 @@
       tree inner_type = op0_type;
       tree outer_type = type;
 
+      /* Do not trust the range information when converting from a reference
+	 type to a integral type, since the reference might be a type-punned
+	 integer that could take the value zero.  */
+      if (TREE_CODE (inner_type) == REFERENCE_TYPE
+	  && !POINTER_TYPE_P (outer_type))
+	{
+	  vr->set_varying ();
+	  return;
+	}
+
       /* If the expression involves a pointer, we are only interested in
 	 determining if it evaluates to NULL [0, 0] or non-NULL (~[0, 0]).
 
diff --git a/gcc/tree.c b/gcc/tree.c
index 32e94e4..5e73570 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -283,7 +283,7 @@
   1, /* OMP_CLAUSE_SHARED  */
   1, /* OMP_CLAUSE_FIRSTPRIVATE  */
   2, /* OMP_CLAUSE_LASTPRIVATE  */
-  5, /* OMP_CLAUSE_REDUCTION  */
+  6, /* OMP_CLAUSE_REDUCTION  */
   5, /* OMP_CLAUSE_TASK_REDUCTION  */
   5, /* OMP_CLAUSE_IN_REDUCTION  */
   1, /* OMP_CLAUSE_COPYIN  */
@@ -347,6 +347,7 @@
   1, /* OMP_CLAUSE_NUM_GANGS  */
   1, /* OMP_CLAUSE_NUM_WORKERS  */
   1, /* OMP_CLAUSE_VECTOR_LENGTH  */
+  0, /* OMP_CLAUSE_NOHOST  */
   3, /* OMP_CLAUSE_TILE  */
   2, /* OMP_CLAUSE__GRIDDIM_  */
   0, /* OMP_CLAUSE_IF_PRESENT */
@@ -424,6 +425,7 @@
   "num_gangs",
   "num_workers",
   "vector_length",
+  "nohost",
   "tile",
   "_griddim_",
   "if_present",
@@ -12328,6 +12330,7 @@
 	case OMP_CLAUSE_DEFAULTMAP:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_NOHOST:
 	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__SIMT_:
 	case OMP_CLAUSE_IF_PRESENT:
@@ -12363,11 +12366,16 @@
 	  WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
 
 	case OMP_CLAUSE_REDUCTION:
+	  {
+	    for (int i = 0; i < 6; i++)
+	      WALK_SUBTREE (OMP_CLAUSE_OPERAND (*tp, i));
+	    WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
+	  }
+
 	case OMP_CLAUSE_TASK_REDUCTION:
 	case OMP_CLAUSE_IN_REDUCTION:
 	  {
-	    int i;
-	    for (i = 0; i < 5; i++)
+	    for (int i = 0; i < 5; i++)
 	      WALK_SUBTREE (OMP_CLAUSE_OPERAND (*tp, i));
 	    WALK_SUBTREE_TAIL (OMP_CLAUSE_CHAIN (*tp));
 	  }
diff --git a/gcc/tree.def b/gcc/tree.def
index 1d6dded..56cec43 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -1084,6 +1084,12 @@
 
 DEFTREECODE (OACC_KERNELS, "oacc_kernels", tcc_statement, 2)
 
+/* OpenACC - #pragma acc serial [clause1 ... clauseN]
+   Operand 0: OMP_BODY: Code to be executed sequentially.
+   Operand 1: OMP_CLAUSES: List of clauses.  */
+
+DEFTREECODE (OACC_SERIAL, "oacc_serial", tcc_statement, 2)
+
 /* OpenACC - #pragma acc data [clause1 ... clauseN]
    Operand 0: OACC_DATA_BODY: Data construct body.
    Operand 1: OACC_DATA_CLAUSES: List of clauses.  */
diff --git a/gcc/tree.h b/gcc/tree.h
index 2f8e37b..97a499e 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -1634,6 +1634,8 @@
 #define OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER(NODE) \
   OMP_CLAUSE_OPERAND (OMP_CLAUSE_RANGE_CHECK (NODE, OMP_CLAUSE_REDUCTION, \
 					      OMP_CLAUSE_IN_REDUCTION), 4)
+#define OMP_CLAUSE_REDUCTION_PRIVATE_DECL(NODE) \
+  OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_REDUCTION), 5)
 
 /* True if a REDUCTION clause may reference the original list item (omp_orig)
    in its OMP_CLAUSE_REDUCTION_{,GIMPLE_}INIT.  */
diff --git a/include/ChangeLog.omp b/include/ChangeLog.omp
new file mode 100644
index 0000000..2400ca5
--- /dev/null
+++ b/include/ChangeLog.omp
@@ -0,0 +1,53 @@
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+
+	* gomp-constants.h (GOMP_LAUNCH_ARGS_EXPLODED): Define.
+
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* gomp-constants.h (gomp_map_kind): Add GOMP_MAP_ATTACH_DETACH.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* gomp-constants.h (GOMP_DEVICE_CURRENT): New macro.
+	(GOMP_DEVICE_PROPERTY_MEMORY, GOMP_DEVICE_PROPERTY_FREE_MEMORY)
+	(GOMP_DEVICE_PROPERTY_NAME, GOMP_DEVICE_PROPERTY_VENDOR)
+	(GOMP_DEVICE_PROPERTY_DRIVER): Likewise.
+	(GOMP_DEVICE_PROPERTY_STRING_MASK): Likewise.
+
+2018-12-19  Julian Brown  <julian@codesourcery.com>
+            Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* gomp-constants.h (gomp_map_kind): Support GOMP_MAP_NO_ALLOC.
+
+2018-12-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* gomp-constants.h (GOACC_FLAG_HOST_DATA_IF_PRESENT): New constant.
+
+2018-10-04  Cesar Philippidis  <cesar@codesourcery.com>
+            Julian Brown  <julian@codesourcery.com>
+
+	* gomp-constants.h (enum gomp_map_kind): Define
+	GOMP_MAP_DECLARE_{ALLOCATE,DEALLOCATE} and GOMP_MAP_FLAG_SPECIAL_4.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* gomp-constants.h (GOMP_MAP_FLAG_SPECIAL_3): Define.
+	(enum gomp_map_kind): Add GOMP_MAP_DYNAMIC_ARRAY,
+	GOMP_MAP_DYNAMIC_ARRAY_TO, GOMP_MAP_DYNAMIC_ARRAY_FROM,
+	GOMP_MAP_DYNAMIC_ARRAY_TOFROM, GOMP_MAP_DYNAMIC_ARRAY_FORCE_TO,
+	GOMP_MAP_DYNAMIC_ARRAY_FORCE_FROM, GOMP_MAP_DYNAMIC_ARRAY_FORCE_TOFROM,
+	GOMP_MAP_DYNAMIC_ARRAY_ALLOC, GOMP_MAP_DYNAMIC_ARRAY_FORCE_ALLOC,
+	GOMP_MAP_DYNAMIC_ARRAY_FORCE_PRESENT.
+	(GOMP_MAP_DYNAMIC_ARRAY_P): Define.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* gomp-constants.h (GOMP_MAP_DEEP_COPY): Define.
+	(gomp_map_kind): Add GOMP_MAP_ATTACH, GOMP_MAP_DETACH,
+	GOMP_MAP_FORCE_DETACH.
+
+2019-02-26  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* gomp-constants.h (GOMP_ASYNC_DEFAULT): Define.
+	(GOMP_PLUGIN_IF_VERSION): New version symbol for plugin interface.
+	Update surrounding comments.
diff --git a/include/gomp-constants.h b/include/gomp-constants.h
index 8b93634..aae0745 100644
--- a/include/gomp-constants.h
+++ b/include/gomp-constants.h
@@ -40,8 +40,12 @@
 #define GOMP_MAP_FLAG_SPECIAL_0		(1 << 2)
 #define GOMP_MAP_FLAG_SPECIAL_1		(1 << 3)
 #define GOMP_MAP_FLAG_SPECIAL_2		(1 << 4)
+#define GOMP_MAP_FLAG_SPECIAL_3		(1 << 5)
+#define GOMP_MAP_FLAG_SPECIAL_4		(1 << 6)
 #define GOMP_MAP_FLAG_SPECIAL		(GOMP_MAP_FLAG_SPECIAL_1 \
 					 | GOMP_MAP_FLAG_SPECIAL_0)
+#define GOMP_MAP_DEEP_COPY		(GOMP_MAP_FLAG_SPECIAL_4 \
+					 | GOMP_MAP_FLAG_SPECIAL_2)
 /* Flag to force a specific behavior (or else, trigger a run-time error).  */
 #define GOMP_MAP_FLAG_FORCE		(1 << 7)
 
@@ -76,6 +80,8 @@
     GOMP_MAP_DEVICE_RESIDENT =		(GOMP_MAP_FLAG_SPECIAL_1 | 1),
     /* OpenACC link.  */
     GOMP_MAP_LINK =			(GOMP_MAP_FLAG_SPECIAL_1 | 2),
+    /* Use device data if present, fall back to host address otherwise.  */
+    GOMP_MAP_NO_ALLOC =			(GOMP_MAP_FLAG_SPECIAL_1 | 3),
     /* Allocate.  */
     GOMP_MAP_FIRSTPRIVATE =		(GOMP_MAP_FLAG_SPECIAL | 0),
     /* Similarly, but store the value in the pointer rather than
@@ -128,12 +134,48 @@
     /* Decrement usage count and deallocate if zero.  */
     GOMP_MAP_RELEASE =			(GOMP_MAP_FLAG_SPECIAL_2
 					 | GOMP_MAP_DELETE),
+    /* Mapping kinds for dynamic arrays.  */
+    GOMP_MAP_DYNAMIC_ARRAY =		(GOMP_MAP_FLAG_SPECIAL_3),
+    GOMP_MAP_DYNAMIC_ARRAY_TO =		(GOMP_MAP_DYNAMIC_ARRAY
+					 | GOMP_MAP_TO),
+    GOMP_MAP_DYNAMIC_ARRAY_FROM =	(GOMP_MAP_DYNAMIC_ARRAY
+					 | GOMP_MAP_FROM),
+    GOMP_MAP_DYNAMIC_ARRAY_TOFROM =	(GOMP_MAP_DYNAMIC_ARRAY
+					 | GOMP_MAP_TOFROM),
+    GOMP_MAP_DYNAMIC_ARRAY_FORCE_TO =	(GOMP_MAP_DYNAMIC_ARRAY_TO
+					 | GOMP_MAP_FLAG_FORCE),
+    GOMP_MAP_DYNAMIC_ARRAY_FORCE_FROM =		(GOMP_MAP_DYNAMIC_ARRAY_FROM
+						 | GOMP_MAP_FLAG_FORCE),
+    GOMP_MAP_DYNAMIC_ARRAY_FORCE_TOFROM =	(GOMP_MAP_DYNAMIC_ARRAY_TOFROM
+						 | GOMP_MAP_FLAG_FORCE),
+    GOMP_MAP_DYNAMIC_ARRAY_ALLOC =		(GOMP_MAP_DYNAMIC_ARRAY
+						 | GOMP_MAP_ALLOC),
+    GOMP_MAP_DYNAMIC_ARRAY_FORCE_ALLOC =	(GOMP_MAP_DYNAMIC_ARRAY
+						 | GOMP_MAP_FORCE_ALLOC),
+    GOMP_MAP_DYNAMIC_ARRAY_FORCE_PRESENT =	(GOMP_MAP_DYNAMIC_ARRAY
+						 | GOMP_MAP_FORCE_PRESENT),
+    /* Mapping kinds for allocatable arrays.  */
+    GOMP_MAP_DECLARE_ALLOCATE =		(GOMP_MAP_FLAG_SPECIAL_4
+					 | GOMP_MAP_FORCE_TO),
+    GOMP_MAP_DECLARE_DEALLOCATE =	(GOMP_MAP_FLAG_SPECIAL_4
+					 | GOMP_MAP_FORCE_FROM),
+    /* In OpenACC, attach a pointer to a mapped struct field.  */
+    GOMP_MAP_ATTACH =			(GOMP_MAP_DEEP_COPY | 0),
+    /* In OpenACC, detach a pointer to a mapped struct field.  */
+    GOMP_MAP_DETACH =			(GOMP_MAP_DEEP_COPY | 1),
+    /* In OpenACC, detach a pointer to a mapped struct field.  */
+    GOMP_MAP_FORCE_DETACH =		(GOMP_MAP_DEEP_COPY
+					 | GOMP_MAP_FLAG_FORCE | 1),
 
     /* Internal to GCC, not used in libgomp.  */
     /* Do not map, but pointer assign a pointer instead.  */
     GOMP_MAP_FIRSTPRIVATE_POINTER =	(GOMP_MAP_LAST | 1),
     /* Do not map, but pointer assign a reference instead.  */
-    GOMP_MAP_FIRSTPRIVATE_REFERENCE =	(GOMP_MAP_LAST | 2)
+    GOMP_MAP_FIRSTPRIVATE_REFERENCE =	(GOMP_MAP_LAST | 2),
+    /* An attach or detach operation.  Rewritten to the appropriate type during
+       gimplification, depending on directive (i.e. "enter data" or
+       parallel/kernels region vs. "exit data").  */
+    GOMP_MAP_ATTACH_DETACH =		(GOMP_MAP_LAST | 3)
   };
 
 #define GOMP_MAP_COPY_TO_P(X) \
@@ -156,6 +198,8 @@
 #define GOMP_MAP_ALWAYS_P(X) \
   (GOMP_MAP_ALWAYS_TO_P (X) || ((X) == GOMP_MAP_ALWAYS_FROM))
 
+#define GOMP_MAP_DYNAMIC_ARRAY_P(X) \
+  ((X) & GOMP_MAP_DYNAMIC_ARRAY)
 
 /* Asynchronous behavior.  Keep in sync with
    libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_async_t.  */
@@ -175,10 +219,25 @@
 #define GOMP_DEVICE_NVIDIA_PTX		5
 #define GOMP_DEVICE_INTEL_MIC		6
 #define GOMP_DEVICE_HSA			7
+#define GOMP_DEVICE_GCN			8
+#define GOMP_DEVICE_CURRENT		9
 
 #define GOMP_DEVICE_ICV			-1
 #define GOMP_DEVICE_HOST_FALLBACK	-2
 
+/* Device property codes.  Keep in sync with
+   libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_device_property_t
+   as well as libgomp/libgomp-plugin.h.  */
+/* Start from 1 to catch uninitialized use.  */
+#define GOMP_DEVICE_PROPERTY_MEMORY		1
+#define GOMP_DEVICE_PROPERTY_FREE_MEMORY	2
+#define GOMP_DEVICE_PROPERTY_NAME		0x10001
+#define GOMP_DEVICE_PROPERTY_VENDOR		0x10002
+#define GOMP_DEVICE_PROPERTY_DRIVER		0x10003
+
+/* Internal property mask to tell numeric and string values apart.  */
+#define GOMP_DEVICE_PROPERTY_STRING_MASK	0x10000
+
 /* GOMP_task/GOMP_taskloop* flags argument.  */
 #define GOMP_TASK_FLAG_UNTIED		(1 << 0)
 #define GOMP_TASK_FLAG_FINAL		(1 << 1)
@@ -202,6 +261,8 @@
 
 /* Force host fallback execution.  */
 #define GOACC_FLAG_HOST_FALLBACK	(1 << 0)
+/* "if_present" semantics for OpenACC "host_data" constructs.  */
+#define GOACC_FLAG_HOST_DATA_IF_PRESENT	(1 << 1)
 
 /* For legacy reasons, in the ABI, the GOACC_FLAGs are encoded as an inverted
    bitmask.  */
@@ -216,6 +277,7 @@
 #define GOMP_VERSION_NVIDIA_PTX 1
 #define GOMP_VERSION_INTEL_MIC 0
 #define GOMP_VERSION_HSA 0
+#define GOMP_VERSION_GCN 0
 
 #define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
 #define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
@@ -231,6 +293,7 @@
 #define GOMP_LAUNCH_DIM		1  /* Launch dimensions, op = mask */
 #define GOMP_LAUNCH_ASYNC	2  /* Async, op = cst val if not MAX  */
 #define GOMP_LAUNCH_WAIT	3  /* Waits, op = num waits.  */
+#define GOMP_LAUNCH_ARGS_EXPLODED 4 /* Exploded args, op ignored.  */
 #define GOMP_LAUNCH_CODE_SHIFT	28
 #define GOMP_LAUNCH_DEVICE_SHIFT 16
 #define GOMP_LAUNCH_OP_SHIFT 0
diff --git a/libatomic/acinclude.m4 b/libatomic/acinclude.m4
index ab1d838..6c5712b 100644
--- a/libatomic/acinclude.m4
+++ b/libatomic/acinclude.m4
@@ -336,7 +336,7 @@
   fi
   changequote(,)
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
   changequote([,])
   libat_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) [$]3=0; print ([$]1*100+[$]2)*100+[$]3 }'`
@@ -438,7 +438,7 @@
   fi
   changequote(,)
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
   changequote([,])
   libat_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) [$]3=0; print ([$]1*100+[$]2)*100+[$]3 }'`
diff --git a/libatomic/configure b/libatomic/configure
index e7076a0..e75dcf8 100755
--- a/libatomic/configure
+++ b/libatomic/configure
@@ -15177,7 +15177,7 @@
   fi
 
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
   libat_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
diff --git a/libffi/ChangeLog.omp b/libffi/ChangeLog.omp
new file mode 100644
index 0000000..5dbf853
--- /dev/null
+++ b/libffi/ChangeLog.omp
@@ -0,0 +1,8 @@
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* configure.ac: Disable AC_PROG_CXX.
+	* configure: Regenerate.
+	* Makefile.in: Regenerate.
+	* include/Makefile.in: Regenerate.
+	* man/Makefile.in: Regenerate.
+	* testsuite/Makefile.in: Regenerate.
diff --git a/libffi/Makefile.in b/libffi/Makefile.in
index 72f8678..914a4a7 100644
--- a/libffi/Makefile.in
+++ b/libffi/Makefile.in
@@ -320,10 +320,6 @@
 CFLAGS = @CFLAGS@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
@@ -385,7 +381,6 @@
 abs_top_builddir = @abs_top_builddir@
 abs_top_srcdir = @abs_top_srcdir@
 ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
 ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
diff --git a/libffi/acinclude.m4 b/libffi/acinclude.m4
index 270dadf..d45654e 100644
--- a/libffi/acinclude.m4
+++ b/libffi/acinclude.m4
@@ -178,7 +178,7 @@
   fi
   changequote(,)
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
   changequote([,])
   libat_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) [$]3=0; print ([$]1*100+[$]2)*100+[$]3 }'`
@@ -280,7 +280,7 @@
   fi
   changequote(,)
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
   changequote([,])
   libat_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) [$]3=0; print ([$]1*100+[$]2)*100+[$]3 }'`
diff --git a/libffi/configure b/libffi/configure
index 2324890..e41ff6b 100755
--- a/libffi/configure
+++ b/libffi/configure
@@ -664,7 +664,6 @@
 MAINT
 MAINTAINER_MODE_FALSE
 MAINTAINER_MODE_TRUE
-CXXCPP
 CPP
 OTOOL64
 OTOOL
@@ -689,12 +688,6 @@
 CCASDEPMODE
 CCASFLAGS
 CCAS
-am__fastdepCXX_FALSE
-am__fastdepCXX_TRUE
-CXXDEPMODE
-ac_ct_CXX
-CXXFLAGS
-CXX
 am__fastdepCC_FALSE
 am__fastdepCC_TRUE
 CCDEPMODE
@@ -822,8 +815,7 @@
 CCAS
 CCASFLAGS
 CPP
-CPPFLAGS
-CXXCPP'
+CPPFLAGS'
 
 
 # Initialize some variables set by options.
@@ -1487,12 +1479,9 @@
   LIBS        libraries to pass to the linker, e.g. -l<library>
   CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
               you have headers in a nonstandard directory <include dir>
-  CXX         C++ compiler command
-  CXXFLAGS    C++ compiler flags
   CCAS        assembler compiler command (defaults to CC)
   CCASFLAGS   assembler compiler flags (defaults to CFLAGS)
   CPP         C preprocessor
-  CXXCPP      C++ preprocessor
 
 Use these variables to override the choices made by `configure' or to help
 it to find libraries and programs with nonstandard names/locations.
@@ -1612,44 +1601,6 @@
 
 } # ac_fn_c_try_compile
 
-# ac_fn_cxx_try_compile LINENO
-# ----------------------------
-# Try to compile conftest.$ac_ext, and return whether this succeeded.
-ac_fn_cxx_try_compile ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext
-  if { { ac_try="$ac_compile"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compile") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && {
-	 test -z "$ac_cxx_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest.$ac_objext; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_retval=1
-fi
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-  as_fn_set_status $ac_retval
-
-} # ac_fn_cxx_try_compile
-
 # ac_fn_c_try_link LINENO
 # -----------------------
 # Try to link conftest.$ac_ext, and return whether this succeeded.
@@ -1873,89 +1824,6 @@
 
 } # ac_fn_c_check_func
 
-# ac_fn_cxx_try_cpp LINENO
-# ------------------------
-# Try to preprocess conftest.$ac_ext, and return whether this succeeded.
-ac_fn_cxx_try_cpp ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if { { ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } > conftest.i && {
-	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
-	 test ! -s conftest.err
-       }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-    ac_retval=1
-fi
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-  as_fn_set_status $ac_retval
-
-} # ac_fn_cxx_try_cpp
-
-# ac_fn_cxx_try_link LINENO
-# -------------------------
-# Try to link conftest.$ac_ext, and return whether this succeeded.
-ac_fn_cxx_try_link ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext conftest$ac_exeext
-  if { { ac_try="$ac_link"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_link") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && {
-	 test -z "$ac_cxx_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest$ac_exeext && {
-	 test "$cross_compiling" = yes ||
-	 test -x conftest$ac_exeext
-       }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_retval=1
-fi
-  # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information
-  # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would
-  # interfere with the next link command; also delete a directory that is
-  # left behind by Apple's compiler.  We do this before executing the actions.
-  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-  as_fn_set_status $ac_retval
-
-} # ac_fn_cxx_try_link
-
 # ac_fn_c_compute_int LINENO EXPR VAR INCLUDES
 # --------------------------------------------
 # Tries to find the compile-time value of EXPR in a program that includes
@@ -4554,391 +4422,6 @@
 fi
 
 
-ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
-if test -z "$CXX"; then
-  if test -n "$CCC"; then
-    CXX=$CCC
-  else
-    if test -n "$ac_tool_prefix"; then
-  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
-  do
-    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
-set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$CXX"; then
-  ac_cv_prog_CXX="$CXX" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-CXX=$ac_cv_prog_CXX
-if test -n "$CXX"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
-$as_echo "$CXX" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-    test -n "$CXX" && break
-  done
-fi
-if test -z "$CXX"; then
-  ac_ct_CXX=$CXX
-  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_CXX"; then
-  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_ac_ct_CXX="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
-if test -n "$ac_ct_CXX"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5
-$as_echo "$ac_ct_CXX" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-  test -n "$ac_ct_CXX" && break
-done
-
-  if test "x$ac_ct_CXX" = x; then
-    CXX="g++"
-  else
-    case $cross_compiling:$ac_tool_warned in
-yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
-ac_tool_warned=yes ;;
-esac
-    CXX=$ac_ct_CXX
-  fi
-fi
-
-  fi
-fi
-# Provide some information about the compiler.
-$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5
-set X $ac_compile
-ac_compiler=$2
-for ac_option in --version -v -V -qversion; do
-  { { ac_try="$ac_compiler $ac_option >&5"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    sed '10a\
-... rest of stderr output deleted ...
-         10q' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-  fi
-  rm -f conftest.er1 conftest.err
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-done
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5
-$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; }
-if ${ac_cv_cxx_compiler_gnu+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-#ifndef __GNUC__
-       choke me
-#endif
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-  ac_compiler_gnu=yes
-else
-  ac_compiler_gnu=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5
-$as_echo "$ac_cv_cxx_compiler_gnu" >&6; }
-if test $ac_compiler_gnu = yes; then
-  GXX=yes
-else
-  GXX=
-fi
-ac_test_CXXFLAGS=${CXXFLAGS+set}
-ac_save_CXXFLAGS=$CXXFLAGS
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5
-$as_echo_n "checking whether $CXX accepts -g... " >&6; }
-if ${ac_cv_prog_cxx_g+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_save_cxx_werror_flag=$ac_cxx_werror_flag
-   ac_cxx_werror_flag=yes
-   ac_cv_prog_cxx_g=no
-   CXXFLAGS="-g"
-   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-  ac_cv_prog_cxx_g=yes
-else
-  CXXFLAGS=""
-      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-
-else
-  ac_cxx_werror_flag=$ac_save_cxx_werror_flag
-	 CXXFLAGS="-g"
-	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-  ac_cv_prog_cxx_g=yes
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-   ac_cxx_werror_flag=$ac_save_cxx_werror_flag
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5
-$as_echo "$ac_cv_prog_cxx_g" >&6; }
-if test "$ac_test_CXXFLAGS" = set; then
-  CXXFLAGS=$ac_save_CXXFLAGS
-elif test $ac_cv_prog_cxx_g = yes; then
-  if test "$GXX" = yes; then
-    CXXFLAGS="-g -O2"
-  else
-    CXXFLAGS="-g"
-  fi
-else
-  if test "$GXX" = yes; then
-    CXXFLAGS="-O2"
-  else
-    CXXFLAGS=
-  fi
-fi
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-
-depcc="$CXX"  am_compiler_list=
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
-$as_echo_n "checking dependency style of $depcc... " >&6; }
-if ${am_cv_CXX_dependencies_compiler_type+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
-  # We make a subdir and do the tests there.  Otherwise we can end up
-  # making bogus files that we don't know about and never remove.  For
-  # instance it was reported that on HP-UX the gcc test will end up
-  # making a dummy file named 'D' -- because '-MD' means "put the output
-  # in D".
-  rm -rf conftest.dir
-  mkdir conftest.dir
-  # Copy depcomp to subdir because otherwise we won't find it if we're
-  # using a relative directory.
-  cp "$am_depcomp" conftest.dir
-  cd conftest.dir
-  # We will build objects and dependencies in a subdirectory because
-  # it helps to detect inapplicable dependency modes.  For instance
-  # both Tru64's cc and ICC support -MD to output dependencies as a
-  # side effect of compilation, but ICC will put the dependencies in
-  # the current directory while Tru64 will put them in the object
-  # directory.
-  mkdir sub
-
-  am_cv_CXX_dependencies_compiler_type=none
-  if test "$am_compiler_list" = ""; then
-     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
-  fi
-  am__universal=false
-  case " $depcc " in #(
-     *\ -arch\ *\ -arch\ *) am__universal=true ;;
-     esac
-
-  for depmode in $am_compiler_list; do
-    # Setup a source with many dependencies, because some compilers
-    # like to wrap large dependency lists on column 80 (with \), and
-    # we should not choose a depcomp mode which is confused by this.
-    #
-    # We need to recreate these files for each test, as the compiler may
-    # overwrite some of them when testing with obscure command lines.
-    # This happens at least with the AIX C compiler.
-    : > sub/conftest.c
-    for i in 1 2 3 4 5 6; do
-      echo '#include "conftst'$i'.h"' >> sub/conftest.c
-      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
-      # Solaris 10 /bin/sh.
-      echo '/* dummy */' > sub/conftst$i.h
-    done
-    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
-
-    # We check with '-c' and '-o' for the sake of the "dashmstdout"
-    # mode.  It turns out that the SunPro C++ compiler does not properly
-    # handle '-M -o', and we need to detect this.  Also, some Intel
-    # versions had trouble with output in subdirs.
-    am__obj=sub/conftest.${OBJEXT-o}
-    am__minus_obj="-o $am__obj"
-    case $depmode in
-    gcc)
-      # This depmode causes a compiler race in universal mode.
-      test "$am__universal" = false || continue
-      ;;
-    nosideeffect)
-      # After this tag, mechanisms are not by side-effect, so they'll
-      # only be used when explicitly requested.
-      if test "x$enable_dependency_tracking" = xyes; then
-	continue
-      else
-	break
-      fi
-      ;;
-    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
-      # This compiler won't grok '-c -o', but also, the minuso test has
-      # not run yet.  These depmodes are late enough in the game, and
-      # so weak that their functioning should not be impacted.
-      am__obj=conftest.${OBJEXT-o}
-      am__minus_obj=
-      ;;
-    none) break ;;
-    esac
-    if depmode=$depmode \
-       source=sub/conftest.c object=$am__obj \
-       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
-       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
-         >/dev/null 2>conftest.err &&
-       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
-       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
-       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
-      # icc doesn't choke on unknown options, it will just issue warnings
-      # or remarks (even with -Werror).  So we grep stderr for any message
-      # that says an option was ignored or not supported.
-      # When given -MP, icc 7.0 and 7.1 complain thusly:
-      #   icc: Command line warning: ignoring option '-M'; no argument required
-      # The diagnosis changed in icc 8.0:
-      #   icc: Command line remark: option '-MP' not supported
-      if (grep 'ignoring option' conftest.err ||
-          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
-        am_cv_CXX_dependencies_compiler_type=$depmode
-        break
-      fi
-    fi
-  done
-
-  cd ..
-  rm -rf conftest.dir
-else
-  am_cv_CXX_dependencies_compiler_type=none
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5
-$as_echo "$am_cv_CXX_dependencies_compiler_type" >&6; }
-CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type
-
- if
-  test "x$enable_dependency_tracking" != xno \
-  && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then
-  am__fastdepCXX_TRUE=
-  am__fastdepCXX_FALSE='#'
-else
-  am__fastdepCXX_TRUE='#'
-  am__fastdepCXX_FALSE=
-fi
-
-
 CFLAGS=$save_CFLAGS
 
 
@@ -8077,7 +7560,6 @@
 
 
 
-
 # Set options
 
 
@@ -11552,7 +11034,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11555 "configure"
+#line 11037 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -11658,7 +11140,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11661 "configure"
+#line 11143 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -11875,3022 +11357,6 @@
 
 CC="$lt_save_CC"
 
-      if test -n "$CXX" && ( test "X$CXX" != "Xno" &&
-    ( (test "X$CXX" = "Xg++" && `g++ -v >/dev/null 2>&1` ) ||
-    (test "X$CXX" != "Xg++"))) ; then
-  ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C++ preprocessor" >&5
-$as_echo_n "checking how to run the C++ preprocessor... " >&6; }
-if test -z "$CXXCPP"; then
-  if ${ac_cv_prog_CXXCPP+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-      # Double quotes because CXXCPP needs to be expanded
-    for CXXCPP in "$CXX -E" "/lib/cpp"
-    do
-      ac_preproc_ok=false
-for ac_cxx_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-
-else
-  # Broken: fails on valid input.
-continue
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-  # Broken: success on invalid input.
-continue
-else
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
-  break
-fi
-
-    done
-    ac_cv_prog_CXXCPP=$CXXCPP
-
-fi
-  CXXCPP=$ac_cv_prog_CXXCPP
-else
-  ac_cv_prog_CXXCPP=$CXXCPP
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXXCPP" >&5
-$as_echo "$CXXCPP" >&6; }
-ac_preproc_ok=false
-for ac_cxx_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-
-else
-  # Broken: fails on valid input.
-continue
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-  # Broken: success on invalid input.
-continue
-else
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
-
-else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "C++ preprocessor \"$CXXCPP\" fails sanity check
-See \`config.log' for more details" "$LINENO" 5; }
-fi
-
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-
-else
-  _lt_caught_CXX_error=yes
-fi
-
-ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
-
-archive_cmds_need_lc_CXX=no
-allow_undefined_flag_CXX=
-always_export_symbols_CXX=no
-archive_expsym_cmds_CXX=
-compiler_needs_object_CXX=no
-export_dynamic_flag_spec_CXX=
-hardcode_direct_CXX=no
-hardcode_direct_absolute_CXX=no
-hardcode_libdir_flag_spec_CXX=
-hardcode_libdir_flag_spec_ld_CXX=
-hardcode_libdir_separator_CXX=
-hardcode_minus_L_CXX=no
-hardcode_shlibpath_var_CXX=unsupported
-hardcode_automatic_CXX=no
-inherit_rpath_CXX=no
-module_cmds_CXX=
-module_expsym_cmds_CXX=
-link_all_deplibs_CXX=unknown
-old_archive_cmds_CXX=$old_archive_cmds
-reload_flag_CXX=$reload_flag
-reload_cmds_CXX=$reload_cmds
-no_undefined_flag_CXX=
-whole_archive_flag_spec_CXX=
-enable_shared_with_static_runtimes_CXX=no
-
-# Source file extension for C++ test sources.
-ac_ext=cpp
-
-# Object file extension for compiled C++ test sources.
-objext=o
-objext_CXX=$objext
-
-# No sense in running all these tests if we already determined that
-# the CXX compiler isn't working.  Some variables (like enable_shared)
-# are currently assumed to apply to all compilers on this platform,
-# and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_caught_CXX_error" != yes; then
-  # Code to be used in simple compile tests
-  lt_simple_compile_test_code="int some_variable = 0;"
-
-  # Code to be used in simple link tests
-  lt_simple_link_test_code='int main(int, char *[]) { return(0); }'
-
-  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
-
-
-
-
-
-
-# If no C compiler was specified, use CC.
-LTCC=${LTCC-"$CC"}
-
-# If no C compiler flags were specified, use CFLAGS.
-LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
-
-# Allow CC to be a program name with arguments.
-compiler=$CC
-
-
-  # save warnings/boilerplate of simple test code
-  ac_outfile=conftest.$ac_objext
-echo "$lt_simple_compile_test_code" >conftest.$ac_ext
-eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
-_lt_compiler_boilerplate=`cat conftest.err`
-$RM conftest*
-
-  ac_outfile=conftest.$ac_objext
-echo "$lt_simple_link_test_code" >conftest.$ac_ext
-eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
-_lt_linker_boilerplate=`cat conftest.err`
-$RM -r conftest*
-
-
-  # Allow CC to be a program name with arguments.
-  lt_save_CC=$CC
-  lt_save_LD=$LD
-  lt_save_GCC=$GCC
-  GCC=$GXX
-  lt_save_with_gnu_ld=$with_gnu_ld
-  lt_save_path_LD=$lt_cv_path_LD
-  if test -n "${lt_cv_prog_gnu_ldcxx+set}"; then
-    lt_cv_prog_gnu_ld=$lt_cv_prog_gnu_ldcxx
-  else
-    $as_unset lt_cv_prog_gnu_ld
-  fi
-  if test -n "${lt_cv_path_LDCXX+set}"; then
-    lt_cv_path_LD=$lt_cv_path_LDCXX
-  else
-    $as_unset lt_cv_path_LD
-  fi
-  test -z "${LDCXX+set}" || LD=$LDCXX
-  CC=${CXX-"c++"}
-  compiler=$CC
-  compiler_CXX=$CC
-  for cc_temp in $compiler""; do
-  case $cc_temp in
-    compile | *[\\/]compile | ccache | *[\\/]ccache ) ;;
-    distcc | *[\\/]distcc | purify | *[\\/]purify ) ;;
-    \-*) ;;
-    *) break;;
-  esac
-done
-cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
-
-
-  if test -n "$compiler"; then
-    # We don't want -fno-exception when compiling C++ code, so set the
-    # no_builtin_flag separately
-    if test "$GXX" = yes; then
-      lt_prog_compiler_no_builtin_flag_CXX=' -fno-builtin'
-    else
-      lt_prog_compiler_no_builtin_flag_CXX=
-    fi
-
-    if test "$GXX" = yes; then
-      # Set up default GNU C++ configuration
-
-
-
-# Check whether --with-gnu-ld was given.
-if test "${with_gnu_ld+set}" = set; then :
-  withval=$with_gnu_ld; test "$withval" = no || with_gnu_ld=yes
-else
-  with_gnu_ld=no
-fi
-
-ac_prog=ld
-if test "$GCC" = yes; then
-  # Check if gcc -print-prog-name=ld gives a path.
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ld used by $CC" >&5
-$as_echo_n "checking for ld used by $CC... " >&6; }
-  case $host in
-  *-*-mingw*)
-    # gcc leaves a trailing carriage return which upsets mingw
-    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
-  *)
-    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
-  esac
-  case $ac_prog in
-    # Accept absolute paths.
-    [\\/]* | ?:[\\/]*)
-      re_direlt='/[^/][^/]*/\.\./'
-      # Canonicalize the pathname of ld
-      ac_prog=`$ECHO "$ac_prog"| $SED 's%\\\\%/%g'`
-      while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do
-	ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"`
-      done
-      test -z "$LD" && LD="$ac_prog"
-      ;;
-  "")
-    # If it fails, then pretend we aren't using GCC.
-    ac_prog=ld
-    ;;
-  *)
-    # If it is relative, then search for the first ld in PATH.
-    with_gnu_ld=unknown
-    ;;
-  esac
-elif test "$with_gnu_ld" = yes; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU ld" >&5
-$as_echo_n "checking for GNU ld... " >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for non-GNU ld" >&5
-$as_echo_n "checking for non-GNU ld... " >&6; }
-fi
-if ${lt_cv_path_LD+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -z "$LD"; then
-  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
-  for ac_dir in $PATH; do
-    IFS="$lt_save_ifs"
-    test -z "$ac_dir" && ac_dir=.
-    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
-      lt_cv_path_LD="$ac_dir/$ac_prog"
-      # Check to see if the program is GNU ld.  I'd rather use --version,
-      # but apparently some variants of GNU ld only accept -v.
-      # Break only if it was the GNU/non-GNU ld that we prefer.
-      case `"$lt_cv_path_LD" -v 2>&1 </dev/null` in
-      *GNU* | *'with BFD'*)
-	test "$with_gnu_ld" != no && break
-	;;
-      *)
-	test "$with_gnu_ld" != yes && break
-	;;
-      esac
-    fi
-  done
-  IFS="$lt_save_ifs"
-else
-  lt_cv_path_LD="$LD" # Let the user override the test with a path.
-fi
-fi
-
-LD="$lt_cv_path_LD"
-if test -n "$LD"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LD" >&5
-$as_echo "$LD" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-test -z "$LD" && as_fn_error $? "no acceptable ld found in \$PATH" "$LINENO" 5
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if the linker ($LD) is GNU ld" >&5
-$as_echo_n "checking if the linker ($LD) is GNU ld... " >&6; }
-if ${lt_cv_prog_gnu_ld+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  # I'd rather use --version here, but apparently some GNU lds only accept -v.
-case `$LD -v 2>&1 </dev/null` in
-*GNU* | *'with BFD'*)
-  lt_cv_prog_gnu_ld=yes
-  ;;
-*)
-  lt_cv_prog_gnu_ld=no
-  ;;
-esac
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_gnu_ld" >&5
-$as_echo "$lt_cv_prog_gnu_ld" >&6; }
-with_gnu_ld=$lt_cv_prog_gnu_ld
-
-
-
-
-
-
-
-      # Check if GNU C++ uses GNU ld as the underlying linker, since the
-      # archiving commands below assume that GNU ld is being used.
-      if test "$with_gnu_ld" = yes; then
-        archive_cmds_CXX='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-        archive_expsym_cmds_CXX='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-
-        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-        export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-
-        # If archive_cmds runs LD, not CC, wlarc should be empty
-        # XXX I think wlarc can be eliminated in ltcf-cxx, but I need to
-        #     investigate it a little bit more. (MM)
-        wlarc='${wl}'
-
-        # ancient GNU ld didn't support --whole-archive et. al.
-        if eval "`$CC -print-prog-name=ld` --help 2>&1" |
-	  $GREP 'no-whole-archive' > /dev/null; then
-          whole_archive_flag_spec_CXX="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
-        else
-          whole_archive_flag_spec_CXX=
-        fi
-      else
-        with_gnu_ld=no
-        wlarc=
-
-        # A generic and very simple default shared library creation
-        # command for GNU C++ for the case where it uses the native
-        # linker, instead of GNU ld.  If possible, this setting should
-        # overridden to take advantage of the native linker features on
-        # the platform it is being used on.
-        archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
-      fi
-
-      # Commands to make compiler produce verbose output that lists
-      # what "hidden" libraries, object files and flags are used when
-      # linking a shared library.
-      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-
-    else
-      GXX=no
-      with_gnu_ld=no
-      wlarc=
-    fi
-
-    # PORTME: fill in a description of your system's C++ link characteristics
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
-$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
-    ld_shlibs_CXX=yes
-    case $host_os in
-      aix3*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
-      aix[4-9]*)
-        if test "$host_cpu" = ia64; then
-          # On IA64, the linker does run time linking by default, so we don't
-          # have to do anything special.
-          aix_use_runtimelinking=no
-          exp_sym_flag='-Bexport'
-          no_entry_flag=""
-        else
-          aix_use_runtimelinking=no
-
-          # Test if we are trying to use run time linking or normal
-          # AIX style linking. If -brtl is somewhere in LDFLAGS, we
-          # need to do runtime linking.
-          case $host_os in aix4.[23]|aix4.[23].*|aix[5-9]*)
-	    for ld_flag in $LDFLAGS; do
-	      case $ld_flag in
-	      *-brtl*)
-	        aix_use_runtimelinking=yes
-	        break
-	        ;;
-	      esac
-	    done
-	    ;;
-          esac
-
-          exp_sym_flag='-bexport'
-          no_entry_flag='-bnoentry'
-        fi
-
-        # When large executables or shared objects are built, AIX ld can
-        # have problems creating the table of contents.  If linking a library
-        # or program results in "error TOC overflow" add -mminimal-toc to
-        # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
-        # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
-
-        archive_cmds_CXX=''
-        hardcode_direct_CXX=yes
-        hardcode_direct_absolute_CXX=yes
-        hardcode_libdir_separator_CXX=':'
-        link_all_deplibs_CXX=yes
-        file_list_spec_CXX='${wl}-f,'
-
-        if test "$GXX" = yes; then
-          case $host_os in aix4.[012]|aix4.[012].*)
-          # We only want to do this on AIX 4.2 and lower, the check
-          # below for broken collect2 doesn't work under 4.3+
-	  collect2name=`${CC} -print-prog-name=collect2`
-	  if test -f "$collect2name" &&
-	     strings "$collect2name" | $GREP resolve_lib_name >/dev/null
-	  then
-	    # We have reworked collect2
-	    :
-	  else
-	    # We have old collect2
-	    hardcode_direct_CXX=unsupported
-	    # It fails to find uninstalled libraries when the uninstalled
-	    # path is not listed in the libpath.  Setting hardcode_minus_L
-	    # to unsupported forces relinking
-	    hardcode_minus_L_CXX=yes
-	    hardcode_libdir_flag_spec_CXX='-L$libdir'
-	    hardcode_libdir_separator_CXX=
-	  fi
-          esac
-          shared_flag='-shared'
-	  if test "$aix_use_runtimelinking" = yes; then
-	    shared_flag="$shared_flag "'${wl}-G'
-	  fi
-        else
-          # not using gcc
-          if test "$host_cpu" = ia64; then
-	  # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
-	  # chokes on -Wl,-G. The following line is correct:
-	  shared_flag='-G'
-          else
-	    if test "$aix_use_runtimelinking" = yes; then
-	      shared_flag='${wl}-G'
-	    else
-	      shared_flag='${wl}-bM:SRE'
-	    fi
-          fi
-        fi
-
-        export_dynamic_flag_spec_CXX='${wl}-bexpall'
-        # It seems that -bexpall does not export symbols beginning with
-        # underscore (_), so it is better to generate a list of symbols to
-	# export.
-        always_export_symbols_CXX=yes
-        if test "$aix_use_runtimelinking" = yes; then
-          # Warning - without using the other runtime loading flags (-brtl),
-          # -berok will link without error, but may produce a broken library.
-          allow_undefined_flag_CXX='-berok'
-          # Determine the default libpath from the value encoded in an empty
-          # executable.
-          cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_link "$LINENO"; then :
-
-lt_aix_libpath_sed='
-    /Import File Strings/,/^$/ {
-	/^0/ {
-	    s/^0  *\(.*\)$/\1/
-	    p
-	}
-    }'
-aix_libpath=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-# Check for a 64-bit object if we didn't find anything.
-if test -z "$aix_libpath"; then
-  aix_libpath=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-fi
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-if test -z "$aix_libpath"; then aix_libpath="/usr/lib:/lib"; fi
-
-          hardcode_libdir_flag_spec_CXX='${wl}-blibpath:$libdir:'"$aix_libpath"
-
-          archive_expsym_cmds_CXX='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
-        else
-          if test "$host_cpu" = ia64; then
-	    hardcode_libdir_flag_spec_CXX='${wl}-R $libdir:/usr/lib:/lib'
-	    allow_undefined_flag_CXX="-z nodefs"
-	    archive_expsym_cmds_CXX="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
-          else
-	    # Determine the default libpath from the value encoded in an
-	    # empty executable.
-	    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_link "$LINENO"; then :
-
-lt_aix_libpath_sed='
-    /Import File Strings/,/^$/ {
-	/^0/ {
-	    s/^0  *\(.*\)$/\1/
-	    p
-	}
-    }'
-aix_libpath=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-# Check for a 64-bit object if we didn't find anything.
-if test -z "$aix_libpath"; then
-  aix_libpath=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-fi
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-if test -z "$aix_libpath"; then aix_libpath="/usr/lib:/lib"; fi
-
-	    hardcode_libdir_flag_spec_CXX='${wl}-blibpath:$libdir:'"$aix_libpath"
-	    # Warning - without using the other run time loading flags,
-	    # -berok will link without error, but may produce a broken library.
-	    no_undefined_flag_CXX=' ${wl}-bernotok'
-	    allow_undefined_flag_CXX=' ${wl}-berok'
-	    if test "$with_gnu_ld" = yes; then
-	      # We only use this code for GNU lds that support --whole-archive.
-	      whole_archive_flag_spec_CXX='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
-	    else
-	      # Exported symbols can be pulled into shared objects from archives
-	      whole_archive_flag_spec_CXX='$convenience'
-	    fi
-	    archive_cmds_need_lc_CXX=yes
-	    # This is similar to how AIX traditionally builds its shared
-	    # libraries.
-	    archive_expsym_cmds_CXX="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
-          fi
-        fi
-        ;;
-
-      beos*)
-	if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	  allow_undefined_flag_CXX=unsupported
-	  # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
-	  # support --undefined.  This deserves some investigation.  FIXME
-	  archive_cmds_CXX='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	else
-	  ld_shlibs_CXX=no
-	fi
-	;;
-
-      chorus*)
-        case $cc_basename in
-          *)
-	  # FIXME: insert proper C++ library support
-	  ld_shlibs_CXX=no
-	  ;;
-        esac
-        ;;
-
-      cygwin* | mingw* | pw32* | cegcc*)
-        # _LT_TAGVAR(hardcode_libdir_flag_spec, CXX) is actually meaningless,
-        # as there is no search path for DLLs.
-        hardcode_libdir_flag_spec_CXX='-L$libdir'
-        export_dynamic_flag_spec_CXX='${wl}--export-all-symbols'
-        allow_undefined_flag_CXX=unsupported
-        always_export_symbols_CXX=no
-        enable_shared_with_static_runtimes_CXX=yes
-
-        if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
-          archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-          # If the export-symbols file already is a .def file (1st line
-          # is EXPORTS), use it as is; otherwise, prepend...
-          archive_expsym_cmds_CXX='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	    cp $export_symbols $output_objdir/$soname.def;
-          else
-	    echo EXPORTS > $output_objdir/$soname.def;
-	    cat $export_symbols >> $output_objdir/$soname.def;
-          fi~
-          $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-        else
-          ld_shlibs_CXX=no
-        fi
-        ;;
-      darwin* | rhapsody*)
-
-
-  archive_cmds_need_lc_CXX=no
-  hardcode_direct_CXX=no
-  hardcode_automatic_CXX=yes
-  hardcode_shlibpath_var_CXX=unsupported
-  if test "$lt_cv_ld_force_load" = "yes"; then
-    whole_archive_flag_spec_CXX='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
-  else
-    whole_archive_flag_spec_CXX=''
-  fi
-  link_all_deplibs_CXX=yes
-  allow_undefined_flag_CXX="$_lt_dar_allow_undefined"
-  case $cc_basename in
-     ifort*) _lt_dar_can_shared=yes ;;
-     *) _lt_dar_can_shared=$GCC ;;
-  esac
-  if test "$_lt_dar_can_shared" = "yes"; then
-    output_verbose_link_cmd=func_echo_all
-    archive_cmds_CXX="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
-    module_cmds_CXX="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
-    archive_expsym_cmds_CXX="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
-    module_expsym_cmds_CXX="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
-       if test "$lt_cv_apple_cc_single_mod" != "yes"; then
-      archive_cmds_CXX="\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dsymutil}"
-      archive_expsym_cmds_CXX="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dar_export_syms}${_lt_dsymutil}"
-    fi
-
-  else
-  ld_shlibs_CXX=no
-  fi
-
-	;;
-
-      dgux*)
-        case $cc_basename in
-          ec++*)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          ghcx*)
-	    # Green Hills C++ Compiler
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-        esac
-        ;;
-
-      freebsd2.*)
-        # C++ shared libraries reported to be fairly broken before
-	# switch to ELF
-        ld_shlibs_CXX=no
-        ;;
-
-      freebsd-elf*)
-        archive_cmds_need_lc_CXX=no
-        ;;
-
-      freebsd* | dragonfly*)
-        # FreeBSD 3 and later use GNU C++ and GNU ld with standard ELF
-        # conventions
-        ld_shlibs_CXX=yes
-        ;;
-
-      gnu*)
-        ;;
-
-      haiku*)
-        archive_cmds_CXX='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-        link_all_deplibs_CXX=yes
-        ;;
-
-      hpux9*)
-        hardcode_libdir_flag_spec_CXX='${wl}+b ${wl}$libdir'
-        hardcode_libdir_separator_CXX=:
-        export_dynamic_flag_spec_CXX='${wl}-E'
-        hardcode_direct_CXX=yes
-        hardcode_minus_L_CXX=yes # Not in the search PATH,
-				             # but as the default
-				             # location of the library.
-
-        case $cc_basename in
-          CC*)
-            # FIXME: insert proper C++ library support
-            ld_shlibs_CXX=no
-            ;;
-          aCC*)
-            archive_cmds_CXX='$RM $output_objdir/$soname~$CC -b ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-            # Commands to make compiler produce verbose output that lists
-            # what "hidden" libraries, object files and flags are used when
-            # linking a shared library.
-            #
-            # There doesn't appear to be a way to prevent this compiler from
-            # explicitly linking system object files so we need to strip them
-            # from the output so that they don't get included in the library
-            # dependencies.
-            output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-            ;;
-          *)
-            if test "$GXX" = yes; then
-              archive_cmds_CXX='$RM $output_objdir/$soname~$CC -shared -nostdlib -fPIC ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-            else
-              # FIXME: insert proper C++ library support
-              ld_shlibs_CXX=no
-            fi
-            ;;
-        esac
-        ;;
-
-      hpux10*|hpux11*)
-        if test $with_gnu_ld = no; then
-	  hardcode_libdir_flag_spec_CXX='${wl}+b ${wl}$libdir'
-	  hardcode_libdir_separator_CXX=:
-
-          case $host_cpu in
-            hppa*64*|ia64*)
-              ;;
-            *)
-	      export_dynamic_flag_spec_CXX='${wl}-E'
-              ;;
-          esac
-        fi
-        case $host_cpu in
-          hppa*64*|ia64*)
-            hardcode_direct_CXX=no
-            hardcode_shlibpath_var_CXX=no
-            ;;
-          *)
-            hardcode_direct_CXX=yes
-            hardcode_direct_absolute_CXX=yes
-            hardcode_minus_L_CXX=yes # Not in the search PATH,
-					         # but as the default
-					         # location of the library.
-            ;;
-        esac
-
-        case $cc_basename in
-          CC*)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          aCC*)
-	    case $host_cpu in
-	      hppa*64*)
-	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	      ia64*)
-	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	      *)
-	        archive_cmds_CXX='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	    esac
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-	    ;;
-          *)
-	    if test "$GXX" = yes; then
-	      if test $with_gnu_ld = no; then
-	        case $host_cpu in
-	          hppa*64*)
-	            archive_cmds_CXX='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	          ia64*)
-	            archive_cmds_CXX='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	          *)
-	            archive_cmds_CXX='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	        esac
-	      fi
-	    else
-	      # FIXME: insert proper C++ library support
-	      ld_shlibs_CXX=no
-	    fi
-	    ;;
-        esac
-        ;;
-
-      interix[3-9]*)
-	hardcode_direct_CXX=no
-	hardcode_shlibpath_var_CXX=no
-	hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	export_dynamic_flag_spec_CXX='${wl}-E'
-	# Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
-	# Instead, shared libraries are loaded at an image base (0x10000000 by
-	# default) and relocated if they conflict, which is a slow very memory
-	# consuming and fragmenting process.  To avoid this, we pick a random,
-	# 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
-	# time.  Moving up from 0x10000000 also allows more sbrk(2) space.
-	archive_cmds_CXX='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-	archive_expsym_cmds_CXX='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-	;;
-      irix5* | irix6*)
-        case $cc_basename in
-          CC*)
-	    # SGI C++
-	    archive_cmds_CXX='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -ar", where "CC" is the IRIX C++ compiler.  This is
-	    # necessary to make sure instantiated templates are included
-	    # in the archive.
-	    old_archive_cmds_CXX='$CC -ar -WR,-u -o $oldlib $oldobjs'
-	    ;;
-          *)
-	    if test "$GXX" = yes; then
-	      if test "$with_gnu_ld" = no; then
-	        archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-	      else
-	        archive_cmds_CXX='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` -o $lib'
-	      fi
-	    fi
-	    link_all_deplibs_CXX=yes
-	    ;;
-        esac
-        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-        hardcode_libdir_separator_CXX=:
-        inherit_rpath_CXX=yes
-        ;;
-
-      linux* | k*bsd*-gnu | kopensolaris*-gnu)
-        case $cc_basename in
-          KCC*)
-	    # Kuck and Associates, Inc. (KAI) C++ Compiler
-
-	    # KCC will only create a shared library if the output file
-	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
-	    # to its proper name (with version) after linking.
-	    archive_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
-	    archive_expsym_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib ${wl}-retain-symbols-file,$export_symbols; mv \$templib $lib'
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -Bstatic", where "CC" is the KAI C++ compiler.
-	    old_archive_cmds_CXX='$CC -Bstatic -o $oldlib $oldobjs'
-	    ;;
-	  icpc* | ecpc* )
-	    # Intel C++
-	    with_gnu_ld=yes
-	    # version 8.0 and above of icpc choke on multiply defined symbols
-	    # if we add $predep_objects and $postdep_objects, however 7.1 and
-	    # earlier do not add the objects themselves.
-	    case `$CC -V 2>&1` in
-	      *"Version 7."*)
-	        archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		archive_expsym_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-		;;
-	      *)  # Version 8.0 or newer
-	        tmp_idyn=
-	        case $host_cpu in
-		  ia64*) tmp_idyn=' -i_dynamic';;
-		esac
-	        archive_cmds_CXX='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		archive_expsym_cmds_CXX='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-		;;
-	    esac
-	    archive_cmds_need_lc_CXX=no
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-	    whole_archive_flag_spec_CXX='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
-	    ;;
-          pgCC* | pgcpp*)
-            # Portland Group C++ compiler
-	    case `$CC -V` in
-	    *pgCC\ [1-5].* | *pgcpp\ [1-5].*)
-	      prelink_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~
-		compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"'
-	      old_archive_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~
-		$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~
-		$RANLIB $oldlib'
-	      archive_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
-	      archive_expsym_cmds_CXX='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
-	      ;;
-	    *) # Version 6 and above use weak symbols
-	      archive_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
-	      archive_expsym_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
-	      ;;
-	    esac
-
-	    hardcode_libdir_flag_spec_CXX='${wl}--rpath ${wl}$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-	    whole_archive_flag_spec_CXX='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-            ;;
-	  cxx*)
-	    # Compaq C++
-	    archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    archive_expsym_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname  -o $lib ${wl}-retain-symbols-file $wl$export_symbols'
-
-	    runpath_var=LD_RUN_PATH
-	    hardcode_libdir_flag_spec_CXX='-rpath $libdir'
-	    hardcode_libdir_separator_CXX=:
-
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed'
-	    ;;
-	  xl* | mpixl* | bgxl*)
-	    # IBM XL 8.0 on PPC, with GNU ld
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-	    export_dynamic_flag_spec_CXX='${wl}--export-dynamic'
-	    archive_cmds_CXX='$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    if test "x$supports_anon_versioning" = xyes; then
-	      archive_expsym_cmds_CXX='echo "{ global:" > $output_objdir/$libname.ver~
-		cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-		echo "local: *; };" >> $output_objdir/$libname.ver~
-		$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
-	    fi
-	    ;;
-	  *)
-	    case `$CC -V 2>&1 | sed 5q` in
-	    *Sun\ C*)
-	      # Sun C++ 5.9
-	      no_undefined_flag_CXX=' -zdefs'
-	      archive_cmds_CXX='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	      archive_expsym_cmds_CXX='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file ${wl}$export_symbols'
-	      hardcode_libdir_flag_spec_CXX='-R$libdir'
-	      whole_archive_flag_spec_CXX='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-	      compiler_needs_object_CXX=yes
-
-	      # Not sure whether something based on
-	      # $CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1
-	      # would be better.
-	      output_verbose_link_cmd='func_echo_all'
-
-	      # Archives containing C++ object files must be created using
-	      # "CC -xar", where "CC" is the Sun C++ compiler.  This is
-	      # necessary to make sure instantiated templates are included
-	      # in the archive.
-	      old_archive_cmds_CXX='$CC -xar -o $oldlib $oldobjs'
-	      ;;
-	    esac
-	    ;;
-	esac
-	;;
-
-      lynxos*)
-        # FIXME: insert proper C++ library support
-	ld_shlibs_CXX=no
-	;;
-
-      m88k*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-	;;
-
-      mvs*)
-        case $cc_basename in
-          cxx*)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-	  *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-	esac
-	;;
-
-      netbsd*)
-        if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-	  archive_cmds_CXX='$LD -Bshareable  -o $lib $predep_objects $libobjs $deplibs $postdep_objects $linker_flags'
-	  wlarc=
-	  hardcode_libdir_flag_spec_CXX='-R$libdir'
-	  hardcode_direct_CXX=yes
-	  hardcode_shlibpath_var_CXX=no
-	fi
-	# Workaround some broken pre-1.5 toolchains
-	output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP conftest.$objext | $SED -e "s:-lgcc -lc -lgcc::"'
-	;;
-
-      *nto* | *qnx*)
-        ld_shlibs_CXX=yes
-	;;
-
-      openbsd2*)
-        # C++ shared libraries are fairly broken
-	ld_shlibs_CXX=no
-	;;
-
-      openbsd*)
-	if test -f /usr/libexec/ld.so; then
-	  hardcode_direct_CXX=yes
-	  hardcode_shlibpath_var_CXX=no
-	  hardcode_direct_absolute_CXX=yes
-	  archive_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
-	  hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-	    archive_expsym_cmds_CXX='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file,$export_symbols -o $lib'
-	    export_dynamic_flag_spec_CXX='${wl}-E'
-	    whole_archive_flag_spec_CXX="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
-	  fi
-	  output_verbose_link_cmd=func_echo_all
-	else
-	  ld_shlibs_CXX=no
-	fi
-	;;
-
-      osf3* | osf4* | osf5*)
-        case $cc_basename in
-          KCC*)
-	    # Kuck and Associates, Inc. (KAI) C++ Compiler
-
-	    # KCC will only create a shared library if the output file
-	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
-	    # to its proper name (with version) after linking.
-	    archive_cmds_CXX='tempext=`echo $shared_ext | $SED -e '\''s/\([^()0-9A-Za-z{}]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
-
-	    hardcode_libdir_flag_spec_CXX='${wl}-rpath,$libdir'
-	    hardcode_libdir_separator_CXX=:
-
-	    # Archives containing C++ object files must be created using
-	    # the KAI C++ compiler.
-	    case $host in
-	      osf3*) old_archive_cmds_CXX='$CC -Bstatic -o $oldlib $oldobjs' ;;
-	      *) old_archive_cmds_CXX='$CC -o $oldlib $oldobjs' ;;
-	    esac
-	    ;;
-          RCC*)
-	    # Rational C++ 2.4.1
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          cxx*)
-	    case $host in
-	      osf3*)
-	        allow_undefined_flag_CXX=' ${wl}-expect_unresolved ${wl}\*'
-	        archive_cmds_CXX='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $soname `test -n "$verstring" && func_echo_all "${wl}-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	        hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-		;;
-	      *)
-	        allow_undefined_flag_CXX=' -expect_unresolved \*'
-	        archive_cmds_CXX='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	        archive_expsym_cmds_CXX='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done~
-	          echo "-hidden">> $lib.exp~
-	          $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname ${wl}-input ${wl}$lib.exp  `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~
-	          $RM $lib.exp'
-	        hardcode_libdir_flag_spec_CXX='-rpath $libdir'
-		;;
-	    esac
-
-	    hardcode_libdir_separator_CXX=:
-
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-	    ;;
-	  *)
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      allow_undefined_flag_CXX=' ${wl}-expect_unresolved ${wl}\*'
-	      case $host in
-	        osf3*)
-	          archive_cmds_CXX='$CC -shared -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-		  ;;
-	        *)
-	          archive_cmds_CXX='$CC -shared -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-		  ;;
-	      esac
-
-	      hardcode_libdir_flag_spec_CXX='${wl}-rpath ${wl}$libdir'
-	      hardcode_libdir_separator_CXX=:
-
-	      # Commands to make compiler produce verbose output that lists
-	      # what "hidden" libraries, object files and flags are used when
-	      # linking a shared library.
-	      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-
-	    else
-	      # FIXME: insert proper C++ library support
-	      ld_shlibs_CXX=no
-	    fi
-	    ;;
-        esac
-        ;;
-
-      psos*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
-
-      sunos4*)
-        case $cc_basename in
-          CC*)
-	    # Sun C++ 4.x
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          lcc*)
-	    # Lucid
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-        esac
-        ;;
-
-      solaris*)
-        case $cc_basename in
-          CC*)
-	    # Sun C++ 4.2, 5.x and Centerline C++
-            archive_cmds_need_lc_CXX=yes
-	    no_undefined_flag_CXX=' -zdefs'
-	    archive_cmds_CXX='$CC -G${allow_undefined_flag}  -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	    archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	      $CC -G${allow_undefined_flag} ${wl}-M ${wl}$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	    hardcode_libdir_flag_spec_CXX='-R$libdir'
-	    hardcode_shlibpath_var_CXX=no
-	    case $host_os in
-	      solaris2.[0-5] | solaris2.[0-5].*) ;;
-	      *)
-		# The compiler driver will combine and reorder linker options,
-		# but understands `-z linker_flag'.
-	        # Supported since Solaris 2.6 (maybe 2.5.1?)
-		whole_archive_flag_spec_CXX='-z allextract$convenience -z defaultextract'
-	        ;;
-	    esac
-	    link_all_deplibs_CXX=yes
-
-	    output_verbose_link_cmd='func_echo_all'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -xar", where "CC" is the Sun C++ compiler.  This is
-	    # necessary to make sure instantiated templates are included
-	    # in the archive.
-	    old_archive_cmds_CXX='$CC -xar -o $oldlib $oldobjs'
-	    ;;
-          gcx*)
-	    # Green Hills C++ Compiler
-	    archive_cmds_CXX='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-
-	    # The C++ compiler must be used to create the archive.
-	    old_archive_cmds_CXX='$CC $LDFLAGS -archive -o $oldlib $oldobjs'
-	    ;;
-          *)
-	    # GNU C++ compiler with Solaris linker
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      no_undefined_flag_CXX=' ${wl}-z ${wl}defs'
-	      if $CC --version | $GREP -v '^2\.7' > /dev/null; then
-	        archive_cmds_CXX='$CC -shared -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-	        archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -shared -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	        # Commands to make compiler produce verbose output that lists
-	        # what "hidden" libraries, object files and flags are used when
-	        # linking a shared library.
-	        output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-	      else
-	        # g++ 2.7 appears to require `-G' NOT `-shared' on this
-	        # platform.
-	        archive_cmds_CXX='$CC -G -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-	        archive_expsym_cmds_CXX='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -G -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	        # Commands to make compiler produce verbose output that lists
-	        # what "hidden" libraries, object files and flags are used when
-	        # linking a shared library.
-	        output_verbose_link_cmd='$CC -G $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-	      fi
-
-	      hardcode_libdir_flag_spec_CXX='${wl}-R $wl$libdir'
-	      case $host_os in
-		solaris2.[0-5] | solaris2.[0-5].*) ;;
-		*)
-		  whole_archive_flag_spec_CXX='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
-		  ;;
-	      esac
-	    fi
-	    ;;
-        esac
-        ;;
-
-    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[01].[10]* | unixware7* | sco3.2v5.0.[024]*)
-      no_undefined_flag_CXX='${wl}-z,text'
-      archive_cmds_need_lc_CXX=no
-      hardcode_shlibpath_var_CXX=no
-      runpath_var='LD_RUN_PATH'
-
-      case $cc_basename in
-        CC*)
-	  archive_cmds_CXX='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  archive_expsym_cmds_CXX='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	*)
-	  archive_cmds_CXX='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  archive_expsym_cmds_CXX='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-      esac
-      ;;
-
-      sysv5* | sco3.2v5* | sco5v6*)
-	# Note: We can NOT use -z defs as we might desire, because we do not
-	# link with -lc, and that would cause any symbols used from libc to
-	# always be unresolved, which means just about no library would
-	# ever link correctly.  If we're not using GNU ld we use -z text
-	# though, which does catch some bad symbols but isn't as heavy-handed
-	# as -z defs.
-	no_undefined_flag_CXX='${wl}-z,text'
-	allow_undefined_flag_CXX='${wl}-z,nodefs'
-	archive_cmds_need_lc_CXX=no
-	hardcode_shlibpath_var_CXX=no
-	hardcode_libdir_flag_spec_CXX='${wl}-R,$libdir'
-	hardcode_libdir_separator_CXX=':'
-	link_all_deplibs_CXX=yes
-	export_dynamic_flag_spec_CXX='${wl}-Bexport'
-	runpath_var='LD_RUN_PATH'
-
-	case $cc_basename in
-          CC*)
-	    archive_cmds_CXX='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    archive_expsym_cmds_CXX='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    old_archive_cmds_CXX='$CC -Tprelink_objects $oldobjs~
-	      '"$old_archive_cmds_CXX"
-	    reload_cmds_CXX='$CC -Tprelink_objects $reload_objs~
-	      '"$reload_cmds_CXX"
-	    ;;
-	  *)
-	    archive_cmds_CXX='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    archive_expsym_cmds_CXX='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    ;;
-	esac
-      ;;
-
-      tandem*)
-        case $cc_basename in
-          NCC*)
-	    # NonStop-UX NCC 3.20
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    ld_shlibs_CXX=no
-	    ;;
-        esac
-        ;;
-
-      vxworks*)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
-
-      *)
-        # FIXME: insert proper C++ library support
-        ld_shlibs_CXX=no
-        ;;
-    esac
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs_CXX" >&5
-$as_echo "$ld_shlibs_CXX" >&6; }
-    test "$ld_shlibs_CXX" = no && can_build_shared=no
-
-    GCC_CXX="$GXX"
-    LD_CXX="$LD"
-
-    ## CAVEAT EMPTOR:
-    ## There is no encapsulation within the following macros, do not change
-    ## the running order or otherwise move them around unless you know exactly
-    ## what you are doing...
-    # Dependencies to place before and after the object being linked:
-predep_objects_CXX=
-postdep_objects_CXX=
-predeps_CXX=
-postdeps_CXX=
-compiler_lib_search_path_CXX=
-
-cat > conftest.$ac_ext <<_LT_EOF
-class Foo
-{
-public:
-  Foo (void) { a = 0; }
-private:
-  int a;
-};
-_LT_EOF
-
-if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
-  (eval $ac_compile) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then
-  # Parse the compiler output and extract the necessary
-  # objects, libraries and library flags.
-
-  # Sentinel used to keep track of whether or not we are before
-  # the conftest object file.
-  pre_test_object_deps_done=no
-
-  for p in `eval "$output_verbose_link_cmd"`; do
-    case $p in
-
-    -L* | -R* | -l*)
-       # Some compilers place space between "-{L,R}" and the path.
-       # Remove the space.
-       if test $p = "-L" ||
-          test $p = "-R"; then
-	 prev=$p
-	 continue
-       else
-	 prev=
-       fi
-
-       if test "$pre_test_object_deps_done" = no; then
-	 case $p in
-	 -L* | -R*)
-	   # Internal compiler library paths should come after those
-	   # provided the user.  The postdeps already come after the
-	   # user supplied libs so there is no need to process them.
-	   if test -z "$compiler_lib_search_path_CXX"; then
-	     compiler_lib_search_path_CXX="${prev}${p}"
-	   else
-	     compiler_lib_search_path_CXX="${compiler_lib_search_path_CXX} ${prev}${p}"
-	   fi
-	   ;;
-	 # The "-l" case would never come before the object being
-	 # linked, so don't bother handling this case.
-	 esac
-       else
-	 if test -z "$postdeps_CXX"; then
-	   postdeps_CXX="${prev}${p}"
-	 else
-	   postdeps_CXX="${postdeps_CXX} ${prev}${p}"
-	 fi
-       fi
-       ;;
-
-    *.$objext)
-       # This assumes that the test object file only shows up
-       # once in the compiler output.
-       if test "$p" = "conftest.$objext"; then
-	 pre_test_object_deps_done=yes
-	 continue
-       fi
-
-       if test "$pre_test_object_deps_done" = no; then
-	 if test -z "$predep_objects_CXX"; then
-	   predep_objects_CXX="$p"
-	 else
-	   predep_objects_CXX="$predep_objects_CXX $p"
-	 fi
-       else
-	 if test -z "$postdep_objects_CXX"; then
-	   postdep_objects_CXX="$p"
-	 else
-	   postdep_objects_CXX="$postdep_objects_CXX $p"
-	 fi
-       fi
-       ;;
-
-    *) ;; # Ignore the rest.
-
-    esac
-  done
-
-  # Clean up.
-  rm -f a.out a.exe
-else
-  echo "libtool.m4: error: problem compiling CXX test program"
-fi
-
-$RM -f confest.$objext
-
-# PORTME: override above test on systems where it is broken
-case $host_os in
-interix[3-9]*)
-  # Interix 3.5 installs completely hosed .la files for C++, so rather than
-  # hack all around it, let's just trust "g++" to DTRT.
-  predep_objects_CXX=
-  postdep_objects_CXX=
-  postdeps_CXX=
-  ;;
-
-linux*)
-  case `$CC -V 2>&1 | sed 5q` in
-  *Sun\ C*)
-    # Sun C++ 5.9
-
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    if test "$solaris_use_stlport4" != yes; then
-      postdeps_CXX='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
-
-solaris*)
-  case $cc_basename in
-  CC*)
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    # Adding this requires a known-good setup of shared libraries for
-    # Sun compiler versions before 5.6, else PIC objects from an old
-    # archive will be linked into the output, leading to subtle bugs.
-    if test "$solaris_use_stlport4" != yes; then
-      postdeps_CXX='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
-esac
-
-
-case " $postdeps_CXX " in
-*" -lc "*) archive_cmds_need_lc_CXX=no ;;
-esac
- compiler_lib_search_dirs_CXX=
-if test -n "${compiler_lib_search_path_CXX}"; then
- compiler_lib_search_dirs_CXX=`echo " ${compiler_lib_search_path_CXX}" | ${SED} -e 's! -L! !g' -e 's!^ !!'`
-fi
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    lt_prog_compiler_wl_CXX=
-lt_prog_compiler_pic_CXX=
-lt_prog_compiler_static_CXX=
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $compiler option to produce PIC" >&5
-$as_echo_n "checking for $compiler option to produce PIC... " >&6; }
-
-  # C++ specific cases for pic, static, wl, etc.
-  if test "$GXX" = yes; then
-    lt_prog_compiler_wl_CXX='-Wl,'
-    lt_prog_compiler_static_CXX='-static'
-
-    case $host_os in
-    aix*)
-      # All AIX code is PIC.
-      if test "$host_cpu" = ia64; then
-	# AIX 5 now supports IA64 processor
-	lt_prog_compiler_static_CXX='-Bstatic'
-      fi
-      lt_prog_compiler_pic_CXX='-fPIC'
-      ;;
-
-    amigaos*)
-      case $host_cpu in
-      powerpc)
-            # see comment about AmigaOS4 .so support
-            lt_prog_compiler_pic_CXX='-fPIC'
-        ;;
-      m68k)
-            # FIXME: we need at least 68020 code to build shared libraries, but
-            # adding the `-m68020' flag to GCC prevents building anything better,
-            # like `-m68040'.
-            lt_prog_compiler_pic_CXX='-m68020 -resident32 -malways-restore-a4'
-        ;;
-      esac
-      ;;
-
-    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
-      # PIC is the default for these OSes.
-      ;;
-    mingw* | cygwin* | os2* | pw32* | cegcc*)
-      # This hack is so that the source file can tell whether it is being
-      # built for inclusion in a dll (and should export symbols for example).
-      # Although the cygwin gcc ignores -fPIC, still need this for old-style
-      # (--disable-auto-import) libraries
-      lt_prog_compiler_pic_CXX='-DDLL_EXPORT'
-      ;;
-    darwin* | rhapsody*)
-      # PIC is the default on this platform
-      # Common symbols not allowed in MH_DYLIB files
-      lt_prog_compiler_pic_CXX='-fno-common'
-      ;;
-    *djgpp*)
-      # DJGPP does not support shared libraries at all
-      lt_prog_compiler_pic_CXX=
-      ;;
-    haiku*)
-      # PIC is the default for Haiku.
-      # The "-static" flag exists, but is broken.
-      lt_prog_compiler_static_CXX=
-      ;;
-    interix[3-9]*)
-      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
-      # Instead, we relocate shared libraries at runtime.
-      ;;
-    sysv4*MP*)
-      if test -d /usr/nec; then
-	lt_prog_compiler_pic_CXX=-Kconform_pic
-      fi
-      ;;
-    hpux*)
-      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
-      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
-      # sets the default TLS model and affects inlining.
-      case $host_cpu in
-      hppa*64*)
-	;;
-      *)
-	lt_prog_compiler_pic_CXX='-fPIC'
-	;;
-      esac
-      ;;
-    *qnx* | *nto*)
-      # QNX uses GNU C++, but need to define -shared option too, otherwise
-      # it will coredump.
-      lt_prog_compiler_pic_CXX='-fPIC -shared'
-      ;;
-    *)
-      lt_prog_compiler_pic_CXX='-fPIC'
-      ;;
-    esac
-  else
-    case $host_os in
-      aix[4-9]*)
-	# All AIX code is PIC.
-	if test "$host_cpu" = ia64; then
-	  # AIX 5 now supports IA64 processor
-	  lt_prog_compiler_static_CXX='-Bstatic'
-	else
-	  lt_prog_compiler_static_CXX='-bnso -bI:/lib/syscalls.exp'
-	fi
-	;;
-      chorus*)
-	case $cc_basename in
-	cxch68*)
-	  # Green Hills C++ Compiler
-	  # _LT_TAGVAR(lt_prog_compiler_static, CXX)="--no_auto_instantiation -u __main -u __premain -u _abort -r $COOL_DIR/lib/libOrb.a $MVME_DIR/lib/CC/libC.a $MVME_DIR/lib/classix/libcx.s.a"
-	  ;;
-	esac
-	;;
-      dgux*)
-	case $cc_basename in
-	  ec++*)
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    ;;
-	  ghcx*)
-	    # Green Hills C++ Compiler
-	    lt_prog_compiler_pic_CXX='-pic'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      freebsd* | dragonfly*)
-	# FreeBSD uses GNU C++
-	;;
-      hpux9* | hpux10* | hpux11*)
-	case $cc_basename in
-	  CC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_static_CXX='${wl}-a ${wl}archive'
-	    if test "$host_cpu" != ia64; then
-	      lt_prog_compiler_pic_CXX='+Z'
-	    fi
-	    ;;
-	  aCC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_static_CXX='${wl}-a ${wl}archive'
-	    case $host_cpu in
-	    hppa*64*|ia64*)
-	      # +Z the default
-	      ;;
-	    *)
-	      lt_prog_compiler_pic_CXX='+Z'
-	      ;;
-	    esac
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      interix*)
-	# This is c89, which is MS Visual C++ (no shared libs)
-	# Anyone wants to do a port?
-	;;
-      irix5* | irix6* | nonstopux*)
-	case $cc_basename in
-	  CC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_static_CXX='-non_shared'
-	    # CC pic flag -KPIC is the default.
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      linux* | k*bsd*-gnu | kopensolaris*-gnu)
-	case $cc_basename in
-	  KCC*)
-	    # KAI C++ Compiler
-	    lt_prog_compiler_wl_CXX='--backend -Wl,'
-	    lt_prog_compiler_pic_CXX='-fPIC'
-	    ;;
-	  ecpc* )
-	    # old Intel C++ for x86_64 which still supported -KPIC.
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    lt_prog_compiler_static_CXX='-static'
-	    ;;
-	  icpc* )
-	    # Intel C++, used to be incompatible with GCC.
-	    # ICC 10 doesn't accept -KPIC any more.
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-fPIC'
-	    lt_prog_compiler_static_CXX='-static'
-	    ;;
-	  pgCC* | pgcpp*)
-	    # Portland Group C++ compiler
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-fpic'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    ;;
-	  cxx*)
-	    # Compaq C++
-	    # Make sure the PIC flag is empty.  It appears that all Alpha
-	    # Linux and Compaq Tru64 Unix objects are PIC.
-	    lt_prog_compiler_pic_CXX=
-	    lt_prog_compiler_static_CXX='-non_shared'
-	    ;;
-	  xlc* | xlC* | bgxl[cC]* | mpixl[cC]*)
-	    # IBM XL 8.0, 9.0 on PPC and BlueGene
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-qpic'
-	    lt_prog_compiler_static_CXX='-qstaticlink'
-	    ;;
-	  *)
-	    case `$CC -V 2>&1 | sed 5q` in
-	    *Sun\ C*)
-	      # Sun C++ 5.9
-	      lt_prog_compiler_pic_CXX='-KPIC'
-	      lt_prog_compiler_static_CXX='-Bstatic'
-	      lt_prog_compiler_wl_CXX='-Qoption ld '
-	      ;;
-	    esac
-	    ;;
-	esac
-	;;
-      lynxos*)
-	;;
-      m88k*)
-	;;
-      mvs*)
-	case $cc_basename in
-	  cxx*)
-	    lt_prog_compiler_pic_CXX='-W c,exportall'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      netbsd*)
-	;;
-      *qnx* | *nto*)
-        # QNX uses GNU C++, but need to define -shared option too, otherwise
-        # it will coredump.
-        lt_prog_compiler_pic_CXX='-fPIC -shared'
-        ;;
-      osf3* | osf4* | osf5*)
-	case $cc_basename in
-	  KCC*)
-	    lt_prog_compiler_wl_CXX='--backend -Wl,'
-	    ;;
-	  RCC*)
-	    # Rational C++ 2.4.1
-	    lt_prog_compiler_pic_CXX='-pic'
-	    ;;
-	  cxx*)
-	    # Digital/Compaq C++
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    # Make sure the PIC flag is empty.  It appears that all Alpha
-	    # Linux and Compaq Tru64 Unix objects are PIC.
-	    lt_prog_compiler_pic_CXX=
-	    lt_prog_compiler_static_CXX='-non_shared'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      psos*)
-	;;
-      solaris*)
-	case $cc_basename in
-	  CC*)
-	    # Sun C++ 4.2, 5.x and Centerline C++
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    lt_prog_compiler_wl_CXX='-Qoption ld '
-	    ;;
-	  gcx*)
-	    # Green Hills C++ Compiler
-	    lt_prog_compiler_pic_CXX='-PIC'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      sunos4*)
-	case $cc_basename in
-	  CC*)
-	    # Sun C++ 4.x
-	    lt_prog_compiler_pic_CXX='-pic'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    ;;
-	  lcc*)
-	    # Lucid
-	    lt_prog_compiler_pic_CXX='-pic'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
-	case $cc_basename in
-	  CC*)
-	    lt_prog_compiler_wl_CXX='-Wl,'
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    lt_prog_compiler_static_CXX='-Bstatic'
-	    ;;
-	esac
-	;;
-      tandem*)
-	case $cc_basename in
-	  NCC*)
-	    # NonStop-UX NCC 3.20
-	    lt_prog_compiler_pic_CXX='-KPIC'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      vxworks*)
-	;;
-      *)
-	lt_prog_compiler_can_build_shared_CXX=no
-	;;
-    esac
-  fi
-
-case $host_os in
-  # For platforms which do not support PIC, -DPIC is meaningless:
-  *djgpp*)
-    lt_prog_compiler_pic_CXX=
-    ;;
-  *)
-    lt_prog_compiler_pic_CXX="$lt_prog_compiler_pic_CXX -DPIC"
-    ;;
-esac
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_prog_compiler_pic_CXX" >&5
-$as_echo "$lt_prog_compiler_pic_CXX" >&6; }
-
-
-
-#
-# Check to make sure the PIC flag actually works.
-#
-if test -n "$lt_prog_compiler_pic_CXX"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler PIC flag $lt_prog_compiler_pic_CXX works" >&5
-$as_echo_n "checking if $compiler PIC flag $lt_prog_compiler_pic_CXX works... " >&6; }
-if ${lt_cv_prog_compiler_pic_works_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_pic_works_CXX=no
-   ac_outfile=conftest.$ac_objext
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-   lt_compiler_flag="$lt_prog_compiler_pic_CXX -DPIC"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   # The option is referenced via a variable to avoid confusing sed.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
-   (eval "$lt_compile" 2>conftest.err)
-   ac_status=$?
-   cat conftest.err >&5
-   echo "$as_me:$LINENO: \$? = $ac_status" >&5
-   if (exit $ac_status) && test -s "$ac_outfile"; then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings other than the usual output.
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
-     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
-     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
-       lt_cv_prog_compiler_pic_works_CXX=yes
-     fi
-   fi
-   $RM conftest*
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_pic_works_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_pic_works_CXX" >&6; }
-
-if test x"$lt_cv_prog_compiler_pic_works_CXX" = xyes; then
-    case $lt_prog_compiler_pic_CXX in
-     "" | " "*) ;;
-     *) lt_prog_compiler_pic_CXX=" $lt_prog_compiler_pic_CXX" ;;
-     esac
-else
-    lt_prog_compiler_pic_CXX=
-     lt_prog_compiler_can_build_shared_CXX=no
-fi
-
-fi
-
-
-
-#
-# Check to make sure the static flag actually works.
-#
-wl=$lt_prog_compiler_wl_CXX eval lt_tmp_static_flag=\"$lt_prog_compiler_static_CXX\"
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler static flag $lt_tmp_static_flag works" >&5
-$as_echo_n "checking if $compiler static flag $lt_tmp_static_flag works... " >&6; }
-if ${lt_cv_prog_compiler_static_works_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_static_works_CXX=no
-   save_LDFLAGS="$LDFLAGS"
-   LDFLAGS="$LDFLAGS $lt_tmp_static_flag"
-   echo "$lt_simple_link_test_code" > conftest.$ac_ext
-   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
-     # The linker can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     if test -s conftest.err; then
-       # Append any errors to the config.log.
-       cat conftest.err 1>&5
-       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
-       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
-       if diff conftest.exp conftest.er2 >/dev/null; then
-         lt_cv_prog_compiler_static_works_CXX=yes
-       fi
-     else
-       lt_cv_prog_compiler_static_works_CXX=yes
-     fi
-   fi
-   $RM -r conftest*
-   LDFLAGS="$save_LDFLAGS"
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_static_works_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_static_works_CXX" >&6; }
-
-if test x"$lt_cv_prog_compiler_static_works_CXX" = xyes; then
-    :
-else
-    lt_prog_compiler_static_CXX=
-fi
-
-
-
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
-$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
-if ${lt_cv_prog_compiler_c_o_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_c_o_CXX=no
-   $RM -r conftest 2>/dev/null
-   mkdir conftest
-   cd conftest
-   mkdir out
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-
-   lt_compiler_flag="-o out/conftest2.$ac_objext"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
-   (eval "$lt_compile" 2>out/conftest.err)
-   ac_status=$?
-   cat out/conftest.err >&5
-   echo "$as_me:$LINENO: \$? = $ac_status" >&5
-   if (exit $ac_status) && test -s out/conftest2.$ac_objext
-   then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
-     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
-     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
-       lt_cv_prog_compiler_c_o_CXX=yes
-     fi
-   fi
-   chmod u+w . 2>&5
-   $RM conftest*
-   # SGI C++ compiler will create directory out/ii_files/ for
-   # template instantiation
-   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
-   $RM out/* && rmdir out
-   cd ..
-   $RM -r conftest
-   $RM conftest*
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_c_o_CXX" >&6; }
-
-
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking if $compiler supports -c -o file.$ac_objext" >&5
-$as_echo_n "checking if $compiler supports -c -o file.$ac_objext... " >&6; }
-if ${lt_cv_prog_compiler_c_o_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_prog_compiler_c_o_CXX=no
-   $RM -r conftest 2>/dev/null
-   mkdir conftest
-   cd conftest
-   mkdir out
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-
-   lt_compiler_flag="-o out/conftest2.$ac_objext"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&5)
-   (eval "$lt_compile" 2>out/conftest.err)
-   ac_status=$?
-   cat out/conftest.err >&5
-   echo "$as_me:$LINENO: \$? = $ac_status" >&5
-   if (exit $ac_status) && test -s out/conftest2.$ac_objext
-   then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
-     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
-     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
-       lt_cv_prog_compiler_c_o_CXX=yes
-     fi
-   fi
-   chmod u+w . 2>&5
-   $RM conftest*
-   # SGI C++ compiler will create directory out/ii_files/ for
-   # template instantiation
-   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
-   $RM out/* && rmdir out
-   cd ..
-   $RM -r conftest
-   $RM conftest*
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_prog_compiler_c_o_CXX" >&5
-$as_echo "$lt_cv_prog_compiler_c_o_CXX" >&6; }
-
-
-
-
-hard_links="nottested"
-if test "$lt_cv_prog_compiler_c_o_CXX" = no && test "$need_locks" != no; then
-  # do not overwrite the value of need_locks provided by the user
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking if we can lock with hard links" >&5
-$as_echo_n "checking if we can lock with hard links... " >&6; }
-  hard_links=yes
-  $RM conftest*
-  ln conftest.a conftest.b 2>/dev/null && hard_links=no
-  touch conftest.a
-  ln conftest.a conftest.b 2>&5 || hard_links=no
-  ln conftest.a conftest.b 2>/dev/null && hard_links=no
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $hard_links" >&5
-$as_echo "$hard_links" >&6; }
-  if test "$hard_links" = no; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&5
-$as_echo "$as_me: WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2;}
-    need_locks=warn
-  fi
-else
-  need_locks=no
-fi
-
-
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the $compiler linker ($LD) supports shared libraries" >&5
-$as_echo_n "checking whether the $compiler linker ($LD) supports shared libraries... " >&6; }
-
-  export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
-  case $host_os in
-  aix[4-9]*)
-    # If we're using GNU nm, then we don't want the "-C" option.
-    # -C means demangle to AIX nm, but means don't demangle with GNU nm
-    # Also, AIX nm treats weak defined symbols like other global defined
-    # symbols, whereas GNU nm marks them as "W".
-    if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
-      export_symbols_cmds_CXX='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-    else
-      export_symbols_cmds_CXX='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "L")) && (substr(\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-    fi
-    ;;
-  pw32*)
-    export_symbols_cmds_CXX="$ltdll_cmds"
-  ;;
-  cygwin* | mingw* | cegcc*)
-    export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[BCDGRS][ ]/s/.*[ ]\([^ ]*\)/\1 DATA/;/^.*[ ]__nm__/s/^.*[ ]__nm__\([^ ]*\)[ ][^ ]*/\1 DATA/;/^I[ ]/d;/^[AITW][ ]/s/.* //'\'' | sort | uniq > $export_symbols'
-  ;;
-  *)
-    export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
-  ;;
-  esac
-  exclude_expsyms_CXX='_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*'
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ld_shlibs_CXX" >&5
-$as_echo "$ld_shlibs_CXX" >&6; }
-test "$ld_shlibs_CXX" = no && can_build_shared=no
-
-with_gnu_ld_CXX=$with_gnu_ld
-
-
-
-
-
-
-#
-# Do we need to explicitly link libc?
-#
-case "x$archive_cmds_need_lc_CXX" in
-x|xyes)
-  # Assume -lc should be added
-  archive_cmds_need_lc_CXX=yes
-
-  if test "$enable_shared" = yes && test "$GCC" = yes; then
-    case $archive_cmds_CXX in
-    *'~'*)
-      # FIXME: we may have to deal with multi-command sequences.
-      ;;
-    '$CC '*)
-      # Test whether the compiler implicitly links with -lc since on some
-      # systems, -lgcc has to come before -lc. If gcc already passes -lc
-      # to ld, don't add -lc before -lgcc.
-      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether -lc should be explicitly linked in" >&5
-$as_echo_n "checking whether -lc should be explicitly linked in... " >&6; }
-if ${lt_cv_archive_cmds_need_lc_CXX+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  $RM conftest*
-	echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-
-	if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_compile\""; } >&5
-  (eval $ac_compile) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } 2>conftest.err; then
-	  soname=conftest
-	  lib=conftest
-	  libobjs=conftest.$ac_objext
-	  deplibs=
-	  wl=$lt_prog_compiler_wl_CXX
-	  pic_flag=$lt_prog_compiler_pic_CXX
-	  compiler_flags=-v
-	  linker_flags=-v
-	  verstring=
-	  output_objdir=.
-	  libname=conftest
-	  lt_save_allow_undefined_flag=$allow_undefined_flag_CXX
-	  allow_undefined_flag_CXX=
-	  if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$archive_cmds_CXX 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1\""; } >&5
-  (eval $archive_cmds_CXX 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-	  then
-	    lt_cv_archive_cmds_need_lc_CXX=no
-	  else
-	    lt_cv_archive_cmds_need_lc_CXX=yes
-	  fi
-	  allow_undefined_flag_CXX=$lt_save_allow_undefined_flag
-	else
-	  cat conftest.err 1>&5
-	fi
-	$RM conftest*
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $lt_cv_archive_cmds_need_lc_CXX" >&5
-$as_echo "$lt_cv_archive_cmds_need_lc_CXX" >&6; }
-      archive_cmds_need_lc_CXX=$lt_cv_archive_cmds_need_lc_CXX
-      ;;
-    esac
-  fi
-  ;;
-esac
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking dynamic linker characteristics" >&5
-$as_echo_n "checking dynamic linker characteristics... " >&6; }
-
-library_names_spec=
-libname_spec='lib$name'
-soname_spec=
-shrext_cmds=".so"
-postinstall_cmds=
-postuninstall_cmds=
-finish_cmds=
-finish_eval=
-shlibpath_var=
-shlibpath_overrides_runpath=unknown
-version_type=none
-dynamic_linker="$host_os ld.so"
-sys_lib_dlsearch_path_spec="/lib /usr/lib"
-need_lib_prefix=unknown
-hardcode_into_libs=no
-
-# when you set need_version to no, make sure it does not cause -set_version
-# flags to be left without arguments
-need_version=unknown
-
-case $host_os in
-aix3*)
-  version_type=linux
-  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
-  shlibpath_var=LIBPATH
-
-  # AIX 3 has no versioning support, so we append a major version to the name.
-  soname_spec='${libname}${release}${shared_ext}$major'
-  ;;
-
-aix[4-9]*)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  hardcode_into_libs=yes
-  if test "$host_cpu" = ia64; then
-    # AIX 5 supports IA64
-    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
-    shlibpath_var=LD_LIBRARY_PATH
-  else
-    # With GCC up to 2.95.x, collect2 would create an import file
-    # for dependence libraries.  The import file would start with
-    # the line `#! .'.  This would cause the generated library to
-    # depend on `.', always an invalid library.  This was fixed in
-    # development snapshots of GCC prior to 3.0.
-    case $host_os in
-      aix4 | aix4.[01] | aix4.[01].*)
-      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
-	   echo ' yes '
-	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
-	:
-      else
-	can_build_shared=no
-      fi
-      ;;
-    esac
-    # AIX (on Power*) has no versioning support, so currently we can not hardcode correct
-    # soname into executable. Probably we can add versioning support to
-    # collect2, so additional links can be useful in future.
-    if test "$aix_use_runtimelinking" = yes; then
-      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
-      # instead of lib<name>.a to let people know that these are not
-      # typical AIX shared libraries.
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    else
-      # We preserve .a as extension for shared libraries through AIX4.2
-      # and later when we are not doing run time linking.
-      library_names_spec='${libname}${release}.a $libname.a'
-      soname_spec='${libname}${release}${shared_ext}$major'
-    fi
-    shlibpath_var=LIBPATH
-  fi
-  ;;
-
-amigaos*)
-  case $host_cpu in
-  powerpc)
-    # Since July 2007 AmigaOS4 officially supports .so libraries.
-    # When compiling the executable, add -use-dynld -Lsobjs: to the compileline.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    ;;
-  m68k)
-    library_names_spec='$libname.ixlibrary $libname.a'
-    # Create ${libname}_ixlibrary.a entries in /sys/libs.
-    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
-    ;;
-  esac
-  ;;
-
-beos*)
-  library_names_spec='${libname}${shared_ext}'
-  dynamic_linker="$host_os ld.so"
-  shlibpath_var=LIBRARY_PATH
-  ;;
-
-bsdi[45]*)
-  version_type=linux
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
-  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
-  # the default ld.so.conf also contains /usr/contrib/lib and
-  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
-  # libtool to hard-code these into programs
-  ;;
-
-cygwin* | mingw* | pw32* | cegcc*)
-  version_type=windows
-  shrext_cmds=".dll"
-  need_version=no
-  need_lib_prefix=no
-
-  case $GCC,$host_os in
-  yes,cygwin* | yes,mingw* | yes,pw32* | yes,cegcc*)
-    library_names_spec='$libname.dll.a'
-    # DLL is installed to $(libdir)/../bin by postinstall_cmds
-    postinstall_cmds='base_file=`basename \${file}`~
-      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
-      dldir=$destdir/`dirname \$dlpath`~
-      test -d \$dldir || mkdir -p \$dldir~
-      $install_prog $dir/$dlname \$dldir/$dlname~
-      chmod a+x \$dldir/$dlname~
-      if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
-        eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
-      fi'
-    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
-      dlpath=$dir/\$dldll~
-       $RM \$dlpath'
-    shlibpath_overrides_runpath=yes
-
-    case $host_os in
-    cygwin*)
-      # Cygwin DLLs use 'cyg' prefix rather than 'lib'
-      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
-
-      ;;
-    mingw* | cegcc*)
-      # MinGW DLLs use traditional 'lib' prefix
-      soname_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
-      ;;
-    pw32*)
-      # pw32 DLLs use 'pw' prefix rather than 'lib'
-      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext}'
-      ;;
-    esac
-    ;;
-
-  *)
-    library_names_spec='${libname}`echo ${release} | $SED -e 's/[.]/-/g'`${versuffix}${shared_ext} $libname.lib'
-    ;;
-  esac
-  dynamic_linker='Win32 ld.exe'
-  # FIXME: first we should search . and the directory the executable is in
-  shlibpath_var=PATH
-  ;;
-
-darwin* | rhapsody*)
-  dynamic_linker="$host_os dyld"
-  version_type=darwin
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
-  soname_spec='${libname}${release}${major}$shared_ext'
-  shlibpath_overrides_runpath=yes
-  shlibpath_var=DYLD_LIBRARY_PATH
-  shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
-
-  sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib'
-  ;;
-
-dgux*)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-freebsd* | dragonfly*)
-  # DragonFly does not have aout.  When/if they implement a new
-  # versioning mechanism, adjust this.
-  if test -x /usr/bin/objformat; then
-    objformat=`/usr/bin/objformat`
-  else
-    case $host_os in
-    freebsd[23].*) objformat=aout ;;
-    *) objformat=elf ;;
-    esac
-  fi
-  version_type=freebsd-$objformat
-  case $version_type in
-    freebsd-elf*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
-      need_version=no
-      need_lib_prefix=no
-      ;;
-    freebsd-*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
-      need_version=yes
-      ;;
-  esac
-  shlibpath_var=LD_LIBRARY_PATH
-  case $host_os in
-  freebsd2.*)
-    shlibpath_overrides_runpath=yes
-    ;;
-  freebsd3.[01]* | freebsdelf3.[01]*)
-    shlibpath_overrides_runpath=yes
-    hardcode_into_libs=yes
-    ;;
-  freebsd3.[2-9]* | freebsdelf3.[2-9]* | \
-  freebsd4.[0-5] | freebsdelf4.[0-5] | freebsd4.1.1 | freebsdelf4.1.1)
-    shlibpath_overrides_runpath=no
-    hardcode_into_libs=yes
-    ;;
-  *) # from 4.6 on, and DragonFly
-    shlibpath_overrides_runpath=yes
-    hardcode_into_libs=yes
-    ;;
-  esac
-  ;;
-
-gnu*)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  hardcode_into_libs=yes
-  ;;
-
-haiku*)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  dynamic_linker="$host_os runtime_loader"
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/beos/system/lib'
-  hardcode_into_libs=yes
-  ;;
-
-hpux9* | hpux10* | hpux11*)
-  # Give a soname corresponding to the major version so that dld.sl refuses to
-  # link against other versions.
-  version_type=sunos
-  need_lib_prefix=no
-  need_version=no
-  case $host_cpu in
-  ia64*)
-    shrext_cmds='.so'
-    hardcode_into_libs=yes
-    dynamic_linker="$host_os dld.so"
-    shlibpath_var=LD_LIBRARY_PATH
-    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    if test "X$HPUX_IA64_MODE" = X32; then
-      sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
-    else
-      sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
-    fi
-    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
-    ;;
-  hppa*64*)
-    shrext_cmds='.sl'
-    hardcode_into_libs=yes
-    dynamic_linker="$host_os dld.sl"
-    shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
-    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
-    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
-    ;;
-  *)
-    shrext_cmds='.sl'
-    dynamic_linker="$host_os dld.sl"
-    shlibpath_var=SHLIB_PATH
-    shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    ;;
-  esac
-  # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
-  postinstall_cmds='chmod 555 $lib'
-  # or fails outright, so override atomically:
-  install_override_mode=555
-  ;;
-
-interix[3-9]*)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  ;;
-
-irix5* | irix6* | nonstopux*)
-  case $host_os in
-    nonstopux*) version_type=nonstopux ;;
-    *)
-	if test "$lt_cv_prog_gnu_ld" = yes; then
-		version_type=linux
-	else
-		version_type=irix
-	fi ;;
-  esac
-  need_lib_prefix=no
-  need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
-  case $host_os in
-  irix5* | nonstopux*)
-    libsuff= shlibsuff=
-    ;;
-  *)
-    case $LD in # libtool.m4 will add one of these switches to LD
-    *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ")
-      libsuff= shlibsuff= libmagic=32-bit;;
-    *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ")
-      libsuff=32 shlibsuff=N32 libmagic=N32;;
-    *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ")
-      libsuff=64 shlibsuff=64 libmagic=64-bit;;
-    *) libsuff= shlibsuff= libmagic=never-match;;
-    esac
-    ;;
-  esac
-  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
-  shlibpath_overrides_runpath=no
-  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
-  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
-  hardcode_into_libs=yes
-  ;;
-
-# No shared lib support for Linux oldld, aout, or coff.
-linux*oldld* | linux*aout* | linux*coff*)
-  dynamic_linker=no
-  ;;
-
-# This must be Linux ELF.
-linux* | k*bsd*-gnu | kopensolaris*-gnu)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-
-  # Some binutils ld are patched to set DT_RUNPATH
-  if ${lt_cv_shlibpath_overrides_runpath+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  lt_cv_shlibpath_overrides_runpath=no
-    save_LDFLAGS=$LDFLAGS
-    save_libdir=$libdir
-    eval "libdir=/foo; wl=\"$lt_prog_compiler_wl_CXX\"; \
-	 LDFLAGS=\"\$LDFLAGS $hardcode_libdir_flag_spec_CXX\""
-    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_cxx_try_link "$LINENO"; then :
-  if  ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null; then :
-  lt_cv_shlibpath_overrides_runpath=yes
-fi
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-    LDFLAGS=$save_LDFLAGS
-    libdir=$save_libdir
-
-fi
-
-  shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath
-
-  # This implies no fast_install, which is unacceptable.
-  # Some rework will be needed to allow for fast_install
-  # before this can be enabled.
-  hardcode_into_libs=yes
-
-  # Append ld.so.conf contents to the search path
-  if test -f /etc/ld.so.conf; then
-    lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \$2)); skip = 1; } { if (!skip) print \$0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
-    sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra"
-  fi
-
-  # We used to test for /lib/ld.so.1 and disable shared libraries on
-  # powerpc, because MkLinux only supported shared libraries with the
-  # GNU dynamic linker.  Since this was broken with cross compilers,
-  # most powerpc-linux boxes support dynamic linking these days and
-  # people can always --disable-shared, the test was removed, and we
-  # assume the GNU/Linux dynamic linker is in use.
-  dynamic_linker='GNU/Linux ld.so'
-  ;;
-
-netbsd*)
-  version_type=sunos
-  need_lib_prefix=no
-  need_version=no
-  if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
-    dynamic_linker='NetBSD (a.out) ld.so'
-  else
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    dynamic_linker='NetBSD ld.elf_so'
-  fi
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  ;;
-
-newsos6)
-  version_type=linux
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  ;;
-
-*nto* | *qnx*)
-  version_type=qnx
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  dynamic_linker='ldqnx.so'
-  ;;
-
-openbsd*)
-  version_type=sunos
-  sys_lib_dlsearch_path_spec="/usr/lib"
-  need_lib_prefix=no
-  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
-  case $host_os in
-    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
-    *)				need_version=no  ;;
-  esac
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-    case $host_os in
-      openbsd2.[89] | openbsd2.[89].*)
-	shlibpath_overrides_runpath=no
-	;;
-      *)
-	shlibpath_overrides_runpath=yes
-	;;
-      esac
-  else
-    shlibpath_overrides_runpath=yes
-  fi
-  ;;
-
-os2*)
-  libname_spec='$name'
-  shrext_cmds=".dll"
-  need_lib_prefix=no
-  library_names_spec='$libname${shared_ext} $libname.a'
-  dynamic_linker='OS/2 ld.exe'
-  shlibpath_var=LIBPATH
-  ;;
-
-osf3* | osf4* | osf5*)
-  version_type=osf
-  need_lib_prefix=no
-  need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
-  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
-  ;;
-
-rdos*)
-  dynamic_linker=no
-  ;;
-
-solaris*)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  # ldd complains unless libraries are executable
-  postinstall_cmds='chmod +x $lib'
-  ;;
-
-sunos4*)
-  version_type=sunos
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  if test "$with_gnu_ld" = yes; then
-    need_lib_prefix=no
-  fi
-  need_version=yes
-  ;;
-
-sysv4 | sysv4.3*)
-  version_type=linux
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  case $host_vendor in
-    sni)
-      shlibpath_overrides_runpath=no
-      need_lib_prefix=no
-      runpath_var=LD_RUN_PATH
-      ;;
-    siemens)
-      need_lib_prefix=no
-      ;;
-    motorola)
-      need_lib_prefix=no
-      need_version=no
-      shlibpath_overrides_runpath=no
-      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
-      ;;
-  esac
-  ;;
-
-sysv4*MP*)
-  if test -d /usr/nec ;then
-    version_type=linux
-    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
-    soname_spec='$libname${shared_ext}.$major'
-    shlibpath_var=LD_LIBRARY_PATH
-  fi
-  ;;
-
-sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
-  version_type=freebsd-elf
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  if test "$with_gnu_ld" = yes; then
-    sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
-  else
-    sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
-    case $host_os in
-      sco3.2v5*)
-        sys_lib_search_path_spec="$sys_lib_search_path_spec /lib"
-	;;
-    esac
-  fi
-  sys_lib_dlsearch_path_spec='/usr/lib'
-  ;;
-
-tpf*)
-  # TPF is a cross-target only.  Preferred cross-host = GNU/Linux.
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  ;;
-
-uts4*)
-  version_type=linux
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-*)
-  dynamic_linker=no
-  ;;
-esac
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $dynamic_linker" >&5
-$as_echo "$dynamic_linker" >&6; }
-test "$dynamic_linker" = no && can_build_shared=no
-
-variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
-if test "$GCC" = yes; then
-  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
-fi
-
-if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
-  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
-fi
-if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
-  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
-fi
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking how to hardcode library paths into programs" >&5
-$as_echo_n "checking how to hardcode library paths into programs... " >&6; }
-hardcode_action_CXX=
-if test -n "$hardcode_libdir_flag_spec_CXX" ||
-   test -n "$runpath_var_CXX" ||
-   test "X$hardcode_automatic_CXX" = "Xyes" ; then
-
-  # We can hardcode non-existent directories.
-  if test "$hardcode_direct_CXX" != no &&
-     # If the only mechanism to avoid hardcoding is shlibpath_var, we
-     # have to relink, otherwise we might link with an installed library
-     # when we should be linking with a yet-to-be-installed one
-     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, CXX)" != no &&
-     test "$hardcode_minus_L_CXX" != no; then
-    # Linking always hardcodes the temporary library directory.
-    hardcode_action_CXX=relink
-  else
-    # We can link without hardcoding, and we can hardcode nonexisting dirs.
-    hardcode_action_CXX=immediate
-  fi
-else
-  # We cannot hardcode anything, or else we can only hardcode existing
-  # directories.
-  hardcode_action_CXX=unsupported
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $hardcode_action_CXX" >&5
-$as_echo "$hardcode_action_CXX" >&6; }
-
-if test "$hardcode_action_CXX" = relink ||
-   test "$inherit_rpath_CXX" = yes; then
-  # Fast installation is not supported
-  enable_fast_install=no
-elif test "$shlibpath_overrides_runpath" = yes ||
-     test "$enable_shared" = no; then
-  # Fast installation is not necessary
-  enable_fast_install=needless
-fi
-
-
-
-
-
-
-
-  fi # test -n "$compiler"
-
-  CC=$lt_save_CC
-  LDCXX=$LD
-  LD=$lt_save_LD
-  GCC=$lt_save_GCC
-  with_gnu_ld=$lt_save_with_gnu_ld
-  lt_cv_path_LDCXX=$lt_cv_path_LD
-  lt_cv_path_LD=$lt_save_path_LD
-  lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld
-  lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld
-fi # test "$_lt_caught_CXX_error" != yes
-
-ac_ext=c
-ac_cpp='$CPP $CPPFLAGS'
-ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_c_compiler_gnu
-
 
 
 
@@ -16332,7 +12798,7 @@
   fi
 
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
   libat_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -16769,10 +13235,6 @@
   as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then
-  as_fn_error $? "conditional \"am__fastdepCXX\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then
   as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -17540,61 +14002,6 @@
 enable_dlopen_self_static='`$ECHO "$enable_dlopen_self_static" | $SED "$delay_single_quote_subst"`'
 old_striplib='`$ECHO "$old_striplib" | $SED "$delay_single_quote_subst"`'
 striplib='`$ECHO "$striplib" | $SED "$delay_single_quote_subst"`'
-compiler_lib_search_dirs='`$ECHO "$compiler_lib_search_dirs" | $SED "$delay_single_quote_subst"`'
-predep_objects='`$ECHO "$predep_objects" | $SED "$delay_single_quote_subst"`'
-postdep_objects='`$ECHO "$postdep_objects" | $SED "$delay_single_quote_subst"`'
-predeps='`$ECHO "$predeps" | $SED "$delay_single_quote_subst"`'
-postdeps='`$ECHO "$postdeps" | $SED "$delay_single_quote_subst"`'
-compiler_lib_search_path='`$ECHO "$compiler_lib_search_path" | $SED "$delay_single_quote_subst"`'
-LD_CXX='`$ECHO "$LD_CXX" | $SED "$delay_single_quote_subst"`'
-reload_flag_CXX='`$ECHO "$reload_flag_CXX" | $SED "$delay_single_quote_subst"`'
-reload_cmds_CXX='`$ECHO "$reload_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-old_archive_cmds_CXX='`$ECHO "$old_archive_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-compiler_CXX='`$ECHO "$compiler_CXX" | $SED "$delay_single_quote_subst"`'
-GCC_CXX='`$ECHO "$GCC_CXX" | $SED "$delay_single_quote_subst"`'
-lt_prog_compiler_no_builtin_flag_CXX='`$ECHO "$lt_prog_compiler_no_builtin_flag_CXX" | $SED "$delay_single_quote_subst"`'
-lt_prog_compiler_wl_CXX='`$ECHO "$lt_prog_compiler_wl_CXX" | $SED "$delay_single_quote_subst"`'
-lt_prog_compiler_pic_CXX='`$ECHO "$lt_prog_compiler_pic_CXX" | $SED "$delay_single_quote_subst"`'
-lt_prog_compiler_static_CXX='`$ECHO "$lt_prog_compiler_static_CXX" | $SED "$delay_single_quote_subst"`'
-lt_cv_prog_compiler_c_o_CXX='`$ECHO "$lt_cv_prog_compiler_c_o_CXX" | $SED "$delay_single_quote_subst"`'
-archive_cmds_need_lc_CXX='`$ECHO "$archive_cmds_need_lc_CXX" | $SED "$delay_single_quote_subst"`'
-enable_shared_with_static_runtimes_CXX='`$ECHO "$enable_shared_with_static_runtimes_CXX" | $SED "$delay_single_quote_subst"`'
-export_dynamic_flag_spec_CXX='`$ECHO "$export_dynamic_flag_spec_CXX" | $SED "$delay_single_quote_subst"`'
-whole_archive_flag_spec_CXX='`$ECHO "$whole_archive_flag_spec_CXX" | $SED "$delay_single_quote_subst"`'
-compiler_needs_object_CXX='`$ECHO "$compiler_needs_object_CXX" | $SED "$delay_single_quote_subst"`'
-old_archive_from_new_cmds_CXX='`$ECHO "$old_archive_from_new_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-old_archive_from_expsyms_cmds_CXX='`$ECHO "$old_archive_from_expsyms_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-archive_cmds_CXX='`$ECHO "$archive_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-archive_expsym_cmds_CXX='`$ECHO "$archive_expsym_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-module_cmds_CXX='`$ECHO "$module_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-module_expsym_cmds_CXX='`$ECHO "$module_expsym_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-with_gnu_ld_CXX='`$ECHO "$with_gnu_ld_CXX" | $SED "$delay_single_quote_subst"`'
-allow_undefined_flag_CXX='`$ECHO "$allow_undefined_flag_CXX" | $SED "$delay_single_quote_subst"`'
-no_undefined_flag_CXX='`$ECHO "$no_undefined_flag_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_libdir_flag_spec_CXX='`$ECHO "$hardcode_libdir_flag_spec_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_libdir_flag_spec_ld_CXX='`$ECHO "$hardcode_libdir_flag_spec_ld_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_libdir_separator_CXX='`$ECHO "$hardcode_libdir_separator_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_direct_CXX='`$ECHO "$hardcode_direct_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_direct_absolute_CXX='`$ECHO "$hardcode_direct_absolute_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_minus_L_CXX='`$ECHO "$hardcode_minus_L_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_shlibpath_var_CXX='`$ECHO "$hardcode_shlibpath_var_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_automatic_CXX='`$ECHO "$hardcode_automatic_CXX" | $SED "$delay_single_quote_subst"`'
-inherit_rpath_CXX='`$ECHO "$inherit_rpath_CXX" | $SED "$delay_single_quote_subst"`'
-link_all_deplibs_CXX='`$ECHO "$link_all_deplibs_CXX" | $SED "$delay_single_quote_subst"`'
-fix_srcfile_path_CXX='`$ECHO "$fix_srcfile_path_CXX" | $SED "$delay_single_quote_subst"`'
-always_export_symbols_CXX='`$ECHO "$always_export_symbols_CXX" | $SED "$delay_single_quote_subst"`'
-export_symbols_cmds_CXX='`$ECHO "$export_symbols_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-exclude_expsyms_CXX='`$ECHO "$exclude_expsyms_CXX" | $SED "$delay_single_quote_subst"`'
-include_expsyms_CXX='`$ECHO "$include_expsyms_CXX" | $SED "$delay_single_quote_subst"`'
-prelink_cmds_CXX='`$ECHO "$prelink_cmds_CXX" | $SED "$delay_single_quote_subst"`'
-file_list_spec_CXX='`$ECHO "$file_list_spec_CXX" | $SED "$delay_single_quote_subst"`'
-hardcode_action_CXX='`$ECHO "$hardcode_action_CXX" | $SED "$delay_single_quote_subst"`'
-compiler_lib_search_dirs_CXX='`$ECHO "$compiler_lib_search_dirs_CXX" | $SED "$delay_single_quote_subst"`'
-predep_objects_CXX='`$ECHO "$predep_objects_CXX" | $SED "$delay_single_quote_subst"`'
-postdep_objects_CXX='`$ECHO "$postdep_objects_CXX" | $SED "$delay_single_quote_subst"`'
-predeps_CXX='`$ECHO "$predeps_CXX" | $SED "$delay_single_quote_subst"`'
-postdeps_CXX='`$ECHO "$postdeps_CXX" | $SED "$delay_single_quote_subst"`'
-compiler_lib_search_path_CXX='`$ECHO "$compiler_lib_search_path_CXX" | $SED "$delay_single_quote_subst"`'
 
 LTCC='$LTCC'
 LTCFLAGS='$LTCFLAGS'
@@ -17667,40 +14074,7 @@
 install_override_mode \
 finish_eval \
 old_striplib \
-striplib \
-compiler_lib_search_dirs \
-predep_objects \
-postdep_objects \
-predeps \
-postdeps \
-compiler_lib_search_path \
-LD_CXX \
-reload_flag_CXX \
-compiler_CXX \
-lt_prog_compiler_no_builtin_flag_CXX \
-lt_prog_compiler_wl_CXX \
-lt_prog_compiler_pic_CXX \
-lt_prog_compiler_static_CXX \
-lt_cv_prog_compiler_c_o_CXX \
-export_dynamic_flag_spec_CXX \
-whole_archive_flag_spec_CXX \
-compiler_needs_object_CXX \
-with_gnu_ld_CXX \
-allow_undefined_flag_CXX \
-no_undefined_flag_CXX \
-hardcode_libdir_flag_spec_CXX \
-hardcode_libdir_flag_spec_ld_CXX \
-hardcode_libdir_separator_CXX \
-fix_srcfile_path_CXX \
-exclude_expsyms_CXX \
-include_expsyms_CXX \
-file_list_spec_CXX \
-compiler_lib_search_dirs_CXX \
-predep_objects_CXX \
-postdep_objects_CXX \
-predeps_CXX \
-postdeps_CXX \
-compiler_lib_search_path_CXX; do
+striplib; do
     case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
     *[\\\\\\\`\\"\\\$]*)
       eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED \\"\\\$sed_quote_subst\\"\\\`\\\\\\""
@@ -17729,17 +14103,7 @@
 postuninstall_cmds \
 finish_cmds \
 sys_lib_search_path_spec \
-sys_lib_dlsearch_path_spec \
-reload_cmds_CXX \
-old_archive_cmds_CXX \
-old_archive_from_new_cmds_CXX \
-old_archive_from_expsyms_cmds_CXX \
-archive_cmds_CXX \
-archive_expsym_cmds_CXX \
-module_cmds_CXX \
-module_expsym_cmds_CXX \
-export_symbols_cmds_CXX \
-prelink_cmds_CXX; do
+sys_lib_dlsearch_path_spec; do
     case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
     *[\\\\\\\`\\"\\\$]*)
       eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED -e \\"\\\$double_quote_subst\\" -e \\"\\\$sed_quote_subst\\" -e \\"\\\$delay_variable_subst\\"\\\`\\\\\\""
@@ -17769,8 +14133,6 @@
 
 
 
-
-
 TARGETDIR="$TARGETDIR"
 
 _ACEOF
@@ -18571,7 +14933,7 @@
 
 
 # The names of the tagged configurations supported by this script.
-available_tags="CXX "
+available_tags=""
 
 # ### BEGIN LIBTOOL CONFIG
 
@@ -18927,20 +15289,6 @@
 # How to hardcode a shared library path into an executable.
 hardcode_action=$hardcode_action
 
-# The directories searched by this compiler when creating a shared library.
-compiler_lib_search_dirs=$lt_compiler_lib_search_dirs
-
-# Dependencies to place before and after the objects being linked to
-# create a shared library.
-predep_objects=$lt_predep_objects
-postdep_objects=$lt_postdep_objects
-predeps=$lt_predeps
-postdeps=$lt_postdeps
-
-# The library search path used internally by the compiler when linking
-# a shared library.
-compiler_lib_search_path=$lt_compiler_lib_search_path
-
 # ### END LIBTOOL CONFIG
 
 _LT_EOF
@@ -19174,163 +15522,6 @@
     (rm -f "$ofile" && cp "$cfgfile" "$ofile" && rm -f "$cfgfile")
   chmod +x "$ofile"
 
-
-    cat <<_LT_EOF >> "$ofile"
-
-# ### BEGIN LIBTOOL TAG CONFIG: CXX
-
-# The linker used to build libraries.
-LD=$lt_LD_CXX
-
-# How to create reloadable object files.
-reload_flag=$lt_reload_flag_CXX
-reload_cmds=$lt_reload_cmds_CXX
-
-# Commands used to build an old-style archive.
-old_archive_cmds=$lt_old_archive_cmds_CXX
-
-# A language specific compiler.
-CC=$lt_compiler_CXX
-
-# Is the compiler the GNU compiler?
-with_gcc=$GCC_CXX
-
-# Compiler flag to turn off builtin functions.
-no_builtin_flag=$lt_lt_prog_compiler_no_builtin_flag_CXX
-
-# How to pass a linker flag through the compiler.
-wl=$lt_lt_prog_compiler_wl_CXX
-
-# Additional compiler flags for building library objects.
-pic_flag=$lt_lt_prog_compiler_pic_CXX
-
-# Compiler flag to prevent dynamic linking.
-link_static_flag=$lt_lt_prog_compiler_static_CXX
-
-# Does compiler simultaneously support -c and -o options?
-compiler_c_o=$lt_lt_cv_prog_compiler_c_o_CXX
-
-# Whether or not to add -lc for building shared libraries.
-build_libtool_need_lc=$archive_cmds_need_lc_CXX
-
-# Whether or not to disallow shared libs when runtime libs are static.
-allow_libtool_libs_with_static_runtimes=$enable_shared_with_static_runtimes_CXX
-
-# Compiler flag to allow reflexive dlopens.
-export_dynamic_flag_spec=$lt_export_dynamic_flag_spec_CXX
-
-# Compiler flag to generate shared objects directly from archives.
-whole_archive_flag_spec=$lt_whole_archive_flag_spec_CXX
-
-# Whether the compiler copes with passing no objects directly.
-compiler_needs_object=$lt_compiler_needs_object_CXX
-
-# Create an old-style archive from a shared archive.
-old_archive_from_new_cmds=$lt_old_archive_from_new_cmds_CXX
-
-# Create a temporary old-style archive to link instead of a shared archive.
-old_archive_from_expsyms_cmds=$lt_old_archive_from_expsyms_cmds_CXX
-
-# Commands used to build a shared archive.
-archive_cmds=$lt_archive_cmds_CXX
-archive_expsym_cmds=$lt_archive_expsym_cmds_CXX
-
-# Commands used to build a loadable module if different from building
-# a shared archive.
-module_cmds=$lt_module_cmds_CXX
-module_expsym_cmds=$lt_module_expsym_cmds_CXX
-
-# Whether we are building with GNU ld or not.
-with_gnu_ld=$lt_with_gnu_ld_CXX
-
-# Flag that allows shared libraries with undefined symbols to be built.
-allow_undefined_flag=$lt_allow_undefined_flag_CXX
-
-# Flag that enforces no undefined symbols.
-no_undefined_flag=$lt_no_undefined_flag_CXX
-
-# Flag to hardcode \$libdir into a binary during linking.
-# This must work even if \$libdir does not exist
-hardcode_libdir_flag_spec=$lt_hardcode_libdir_flag_spec_CXX
-
-# If ld is used when linking, flag to hardcode \$libdir into a binary
-# during linking.  This must work even if \$libdir does not exist.
-hardcode_libdir_flag_spec_ld=$lt_hardcode_libdir_flag_spec_ld_CXX
-
-# Whether we need a single "-rpath" flag with a separated argument.
-hardcode_libdir_separator=$lt_hardcode_libdir_separator_CXX
-
-# Set to "yes" if using DIR/libNAME\${shared_ext} during linking hardcodes
-# DIR into the resulting binary.
-hardcode_direct=$hardcode_direct_CXX
-
-# Set to "yes" if using DIR/libNAME\${shared_ext} during linking hardcodes
-# DIR into the resulting binary and the resulting library dependency is
-# "absolute",i.e impossible to change by setting \${shlibpath_var} if the
-# library is relocated.
-hardcode_direct_absolute=$hardcode_direct_absolute_CXX
-
-# Set to "yes" if using the -LDIR flag during linking hardcodes DIR
-# into the resulting binary.
-hardcode_minus_L=$hardcode_minus_L_CXX
-
-# Set to "yes" if using SHLIBPATH_VAR=DIR during linking hardcodes DIR
-# into the resulting binary.
-hardcode_shlibpath_var=$hardcode_shlibpath_var_CXX
-
-# Set to "yes" if building a shared library automatically hardcodes DIR
-# into the library and all subsequent libraries and executables linked
-# against it.
-hardcode_automatic=$hardcode_automatic_CXX
-
-# Set to yes if linker adds runtime paths of dependent libraries
-# to runtime path list.
-inherit_rpath=$inherit_rpath_CXX
-
-# Whether libtool must link a program against all its dependency libraries.
-link_all_deplibs=$link_all_deplibs_CXX
-
-# Fix the shell variable \$srcfile for the compiler.
-fix_srcfile_path=$lt_fix_srcfile_path_CXX
-
-# Set to "yes" if exported symbols are required.
-always_export_symbols=$always_export_symbols_CXX
-
-# The commands to list exported symbols.
-export_symbols_cmds=$lt_export_symbols_cmds_CXX
-
-# Symbols that should not be listed in the preloaded symbols.
-exclude_expsyms=$lt_exclude_expsyms_CXX
-
-# Symbols that must always be exported.
-include_expsyms=$lt_include_expsyms_CXX
-
-# Commands necessary for linking programs (against libraries) with templates.
-prelink_cmds=$lt_prelink_cmds_CXX
-
-# Specify filename containing input files.
-file_list_spec=$lt_file_list_spec_CXX
-
-# How to hardcode a shared library path into an executable.
-hardcode_action=$hardcode_action_CXX
-
-# The directories searched by this compiler when creating a shared library.
-compiler_lib_search_dirs=$lt_compiler_lib_search_dirs_CXX
-
-# Dependencies to place before and after the objects being linked to
-# create a shared library.
-predep_objects=$lt_predep_objects_CXX
-postdep_objects=$lt_postdep_objects_CXX
-predeps=$lt_predeps_CXX
-postdeps=$lt_postdeps_CXX
-
-# The library search path used internally by the compiler when linking
-# a shared library.
-compiler_lib_search_path=$lt_compiler_lib_search_path_CXX
-
-# ### END LIBTOOL TAG CONFIG: CXX
-_LT_EOF
-
  ;;
     "include":C) test -d include || mkdir include ;;
     "src":C)
diff --git a/libffi/configure.ac b/libffi/configure.ac
index d28981b..693c65b 100644
--- a/libffi/configure.ac
+++ b/libffi/configure.ac
@@ -47,7 +47,10 @@
 m4_define([_AC_ARG_VAR_PRECIOUS],[])
 save_CFLAGS=$CFLAGS
 AC_PROG_CC
-AC_PROG_CXX
+dnl Disable libstdc++ dependency for libffi as it's only needed for
+dnl the library's test suite and it can cause a dependency loop when
+dnl libgomp, needed by libstdc++, uses libffi.
+dnl AC_PROG_CXX
 CFLAGS=$save_CFLAGS
 m4_undefine([_AC_ARG_VAR_PRECIOUS])
 m4_rename_force([real_PRECIOUS],[_AC_ARG_VAR_PRECIOUS])
diff --git a/libffi/include/Makefile.in b/libffi/include/Makefile.in
index 5dabdff..3be1563 100644
--- a/libffi/include/Makefile.in
+++ b/libffi/include/Makefile.in
@@ -194,10 +194,6 @@
 CFLAGS = @CFLAGS@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
@@ -259,7 +255,6 @@
 abs_top_builddir = @abs_top_builddir@
 abs_top_srcdir = @abs_top_srcdir@
 ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
 ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
diff --git a/libffi/man/Makefile.in b/libffi/man/Makefile.in
index 1a21f38..327947a 100644
--- a/libffi/man/Makefile.in
+++ b/libffi/man/Makefile.in
@@ -176,10 +176,6 @@
 CFLAGS = @CFLAGS@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
@@ -241,7 +237,6 @@
 abs_top_builddir = @abs_top_builddir@
 abs_top_srcdir = @abs_top_srcdir@
 ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
 ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
diff --git a/libffi/testsuite/Makefile.in b/libffi/testsuite/Makefile.in
index 59f09bb..a90618c 100644
--- a/libffi/testsuite/Makefile.in
+++ b/libffi/testsuite/Makefile.in
@@ -147,10 +147,6 @@
 CFLAGS = @CFLAGS@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
 CYGPATH_W = @CYGPATH_W@
 DEFS = @DEFS@
 DEPDIR = @DEPDIR@
@@ -212,7 +208,6 @@
 abs_top_builddir = @abs_top_builddir@
 abs_top_srcdir = @abs_top_srcdir@
 ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
 ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
diff --git a/libgcc/ChangeLog.omp b/libgcc/ChangeLog.omp
new file mode 100644
index 0000000..a237a70
--- /dev/null
+++ b/libgcc/ChangeLog.omp
@@ -0,0 +1,25 @@
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+	    Andrew Stubbs  <ams@codesourcery.com>
+
+	* Makefile.in: Allow disabling of emutls.
+	* config/gcn/gomp_print.c: New.
+	* config/gcn/reduction.c: New.
+	* config/gcn/t-amdgcn (LIB2ADD): Add gomp_print.c and reduction.c.
+	Disable emutls.c.
+	* config/gcn/t-gcn-hsa: New.
+
+2019-06-25  Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	* config/gcn/t-amdgcn (LIB2ADD): Add unwind-gcn.c.
+	* config/gcn/unwind-gcn.c: New file.
+
+2019-06-25  Kwok Cheung Yeung  <kcy@codesourcery.com>
+            Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	* configure: Regenerate.
+	* config/gcn/gthr-gcn.h: New.
+
diff --git a/libgcc/Makefile.in b/libgcc/Makefile.in
index ea390a5..3261ab0 100644
--- a/libgcc/Makefile.in
+++ b/libgcc/Makefile.in
@@ -430,9 +430,11 @@
 # While emutls.c has nothing to do with EH, it is in LIB2ADDEH*
 # instead of LIB2ADD because that's the way to be sure on some targets
 # (e.g. *-*-darwin*) only one copy of it is linked.
+ifneq ($(enable_emutls),no)
 LIB2ADDEH += $(srcdir)/emutls.c
 LIB2ADDEHSTATIC += $(srcdir)/emutls.c
 LIB2ADDEHSHARED += $(srcdir)/emutls.c
+endif
 
 # Library members defined in libgcc2.c.
 lib2funcs = _muldi3 _negdi2 _lshrdi3 _ashldi3 _ashrdi3 _cmpdi2 _ucmpdi2	   \
diff --git a/libgcc/config/gcn/crt0.c b/libgcc/config/gcn/crt0.c
index c0b9353..9b58029 100644
--- a/libgcc/config/gcn/crt0.c
+++ b/libgcc/config/gcn/crt0.c
@@ -19,5 +19,61 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+typedef long long size_t;
+
 /* Provide an entry point symbol to silence a linker warning.  */
 void _start() {}
+
+#ifdef USE_NEWLIB_INITFINI
+
+extern void __libc_init_array (void) __attribute__((weak));
+extern void __libc_fini_array (void) __attribute__((weak));
+
+__attribute__((amdgpu_hsa_kernel ()))
+void _init_array()
+{
+  __libc_init_array ();
+}
+
+__attribute__((amdgpu_hsa_kernel ()))
+void _fini_array()
+{
+  __libc_fini_array ();
+}
+
+#endif
+
+/* These magic symbols are provided by the linker.  */
+extern void (*__preinit_array_start []) (void) __attribute__((weak));
+extern void (*__preinit_array_end []) (void) __attribute__((weak));
+extern void (*__init_array_start []) (void) __attribute__((weak));
+extern void (*__init_array_end []) (void) __attribute__((weak));
+extern void (*__fini_array_start []) (void) __attribute__((weak));
+extern void (*__fini_array_end []) (void) __attribute__((weak));
+
+__attribute__((amdgpu_hsa_kernel ()))
+void _init_array()
+{
+  /* Iterate over all the init routines.  */
+  size_t count;
+  size_t i;
+
+  count = __preinit_array_end - __preinit_array_start;
+  for (i = 0; i < count; i++)
+    __preinit_array_start[i] ();
+
+  count = __init_array_end - __init_array_start;
+  for (i = 0; i < count; i++)
+    __init_array_start[i] ();
+}
+
+__attribute__((amdgpu_hsa_kernel ()))
+void _fini_array()
+{
+  size_t count;
+  size_t i;
+
+  count = __fini_array_end - __fini_array_start;
+  for (i = count; i > 0; i--)
+    __fini_array_start[i-1] ();
+}
diff --git a/libgcc/config/gcn/gomp_print.c b/libgcc/config/gcn/gomp_print.c
new file mode 100644
index 0000000..5d1a3fc
--- /dev/null
+++ b/libgcc/config/gcn/gomp_print.c
@@ -0,0 +1,101 @@
+/* Newlib may not have been built yet.  */
+typedef long int64_t;
+typedef long size_t;
+extern char *strncpy (char *dst, const char *src, size_t length);
+extern void exit(int);
+
+void gomp_print_string (const char *msg, const char *value);
+void gomp_print_integer (const char *msg, int64_t value);
+void gomp_print_double (const char *msg, double value);
+
+/* This struct must match the one used by gcn-run and libgomp.
+   It holds all the data output from a kernel (besides mapping data).
+
+   The base address pointer can be found at kernargs+16.
+
+   The next_output counter must be atomically incremented for each
+   print output.  Only when the print data is fully written can the
+   "written" flag be set.  */
+struct output {
+  int return_value;
+  unsigned int next_output;
+  struct printf_data {
+    int written;
+    char msg[128];
+    int type;
+    union {
+      int64_t ivalue;
+      double dvalue;
+      char text[128];
+    };
+  } queue[1024];
+  unsigned int consumed;
+};
+
+static struct printf_data *
+reserve_print_slot (void) {
+  /* The kernargs pointer is in s[8:9].
+     This will break if the enable_sgpr_* flags are ever changed.  */
+  char *kernargs;
+  asm ("s_mov_b64 %0, s[8:9]" : "=Sg"(kernargs));
+
+  /* The output data is at kernargs[2].  */
+  struct output *data = *(struct output **)(kernargs + 16);
+
+  /* Reserve the slot.  */
+  unsigned int index = __atomic_fetch_add (&data->next_output, 1,
+					   __ATOMIC_ACQUIRE);
+
+  /* Spinlock while the host catches up.  */
+  if (index >= 1024)
+    while (__atomic_load_n (&data->consumed, __ATOMIC_ACQUIRE)
+	   <= (index - 1024))
+      asm ("s_sleep 64");
+
+  if ((unsigned int)(index + 1) < data->consumed)
+    {
+      /* Overflow.  */
+      exit (1);
+    }
+  return &(data->queue[index%1024]);
+}
+
+void
+gomp_print_string (const char *msg, const char *value)
+{
+  struct printf_data *output = reserve_print_slot ();
+  output->type = 2; /* String.  */
+
+  strncpy (output->msg, msg, 127);
+  output->msg[127] = '\0';
+  strncpy (output->text, value, 127);
+  output->text[127] = '\0';
+
+  __atomic_store_n (&output->written, 1, __ATOMIC_RELEASE);
+}
+
+void
+gomp_print_integer (const char *msg, int64_t value)
+{
+  struct printf_data *output = reserve_print_slot ();
+  output->type = 0; /* Integer.  */
+
+  strncpy (output->msg, msg, 127);
+  output->msg[127] = '\0';
+  output->ivalue = value;
+
+  __atomic_store_n (&output->written, 1, __ATOMIC_RELEASE);
+}
+
+void
+gomp_print_double (const char *msg, double value)
+{
+  struct printf_data *output = reserve_print_slot ();
+  output->type = 1; /* Double.  */
+
+  strncpy (output->msg, msg, 127);
+  output->msg[127] = '\0';
+  output->dvalue = value;
+
+  __atomic_store_n (&output->written, 1, __ATOMIC_RELEASE);
+}
diff --git a/libgcc/config/gcn/gthr-gcn.h b/libgcc/config/gcn/gthr-gcn.h
new file mode 100644
index 0000000..4227b51
--- /dev/null
+++ b/libgcc/config/gcn/gthr-gcn.h
@@ -0,0 +1,163 @@
+/* Threads compatibility routines for libgcc2 and libobjc.  */
+/* Compile this one with gcc.  */
+/* Copyright (C) 2019 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* AMD GCN does not support dynamic creation of threads.  There may be many
+   hardware threads, but they're all created simultaneously at launch time.
+
+   This implementation is intended to provide mutexes for libgfortran, etc.
+   It is not intended to provide a TLS implementation at this time,
+   although that may be added later if needed.
+
+   __gthread_active_p returns "1" to ensure that mutexes are used, and that
+   programs attempting to use emutls will fail with the appropriate abort.
+   It is expected that the TLS tests will fail.  */
+
+#ifndef GCC_GTHR_GCN_H
+#define GCC_GTHR_GCN_H
+
+#define __GTHREADS 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _LIBOBJC
+#error "Objective C is not supported on AMD GCN"
+#else
+
+static inline int
+__gthread_active_p (void)
+{
+  return 1;
+}
+
+typedef int __gthread_key_t;
+typedef int __gthread_once_t;
+typedef int __gthread_mutex_t;
+typedef int __gthread_recursive_mutex_t;
+
+#define __GTHREAD_ONCE_INIT 0
+#define __GTHREAD_MUTEX_INIT 0
+#define __GTHREAD_RECURSIVE_MUTEX_INIT 0
+
+static inline int
+__gthread_once (__gthread_once_t *__once __attribute__((unused)),
+		void (*__func) (void) __attribute__((unused)))
+{
+  return 0;
+}
+
+static inline int
+__gthread_key_create (__gthread_key_t *__key __attribute__((unused)),
+		      void (*__dtor) (void *) __attribute__((unused)))
+{
+  /* Operation is not supported.  */
+  return -1;
+}
+
+static inline int
+__gthread_key_delete (__gthread_key_t __key __attribute__ ((__unused__)))
+{
+  /* Operation is not supported.  */
+  return -1;
+}
+
+static inline void *
+__gthread_getspecific (__gthread_key_t __key __attribute__((unused)))
+{
+  return NULL;
+}
+
+static inline int
+__gthread_setspecific (__gthread_key_t __key __attribute__((unused)),
+		       const void *__ptr __attribute__((unused)))
+{
+  /* Operation is not supported.  */
+  return -1;
+}
+
+static inline int
+__gthread_mutex_destroy (__gthread_mutex_t *__mutex __attribute__((unused)))
+{
+  return 0;
+}
+
+static inline int
+__gthread_recursive_mutex_destroy (__gthread_recursive_mutex_t *__mutex __attribute__((unused)))
+{
+  return 0;
+}
+
+
+static inline int
+__gthread_mutex_lock (__gthread_mutex_t *__mutex)
+{
+  while (__sync_lock_test_and_set (__mutex, 1))
+    asm volatile ("s_sleep\t1" ::: "memory");
+
+  return 0;
+}
+
+static inline int
+__gthread_mutex_trylock (__gthread_mutex_t *__mutex)
+{
+  return __sync_lock_test_and_set (__mutex, 1);
+}
+
+static inline int
+__gthread_mutex_unlock (__gthread_mutex_t *__mutex)
+{
+  __sync_lock_release (__mutex);
+
+  return 0;
+}
+
+static inline int
+__gthread_recursive_mutex_lock (__gthread_recursive_mutex_t *__mutex __attribute__((unused)))
+{
+  /* Operation is not supported.  */
+  return -1;
+}
+
+static inline int
+__gthread_recursive_mutex_trylock (__gthread_recursive_mutex_t *__mutex __attribute__((unused)))
+{
+  /* Operation is not supported.  */
+  return -1;
+}
+
+static inline int
+__gthread_recursive_mutex_unlock (__gthread_recursive_mutex_t *__mutex __attribute__((unused)))
+{
+  /* Operation is not supported.  */
+  return -1;
+}
+#endif /* _LIBOBJC */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ! GCC_GTHR_GCN_H */
diff --git a/libgcc/config/gcn/reduction.c b/libgcc/config/gcn/reduction.c
new file mode 100644
index 0000000..eafcee8
--- /dev/null
+++ b/libgcc/config/gcn/reduction.c
@@ -0,0 +1,30 @@
+/* Oversized reductions lock variable
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Graphics.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* We use a global lock variable for reductions on objects larger than
+   64 bits.  Until and unless proven that lock contention for
+   different reductions is a problem, a single lock will suffice.  */
+
+unsigned volatile __reduction_lock = 0;
diff --git a/libgcc/config/gcn/t-amdgcn b/libgcc/config/gcn/t-amdgcn
index 8687c9f..c04de3c 100644
--- a/libgcc/config/gcn/t-amdgcn
+++ b/libgcc/config/gcn/t-amdgcn
@@ -1,5 +1,10 @@
+LIB2ADD += $(srcdir)/config/gcn/gomp_print.c
+
 LIB2ADD += $(srcdir)/config/gcn/lib2-divmod.c \
-	   $(srcdir)/config/gcn/lib2-divmod-hi.c
+	   $(srcdir)/config/gcn/lib2-divmod-hi.c \
+	   $(srcdir)/config/gcn/unwind-gcn.c
+
+LIB2ADD += $(srcdir)/config/gcn/reduction.c
 
 LIB2ADDEH=
 LIB2FUNCS_EXCLUDE=__main
@@ -12,5 +17,10 @@
 crt0.o: $(srcdir)/config/gcn/crt0.c
 	$(crt_compile) -c $<
 
-# Prevent building "advanced" stuff (for example, gcov support).
+# Prevent building "advanced" stuff (for example, gcov support).  We don't
+# support it, and it may cause the build to fail, because of alloca usage, for
+# example.
 INHIBIT_LIBC_CFLAGS = -Dinhibit_libc
+
+# Disable emutls.c (temporarily?)
+enable_emutls = no
diff --git a/libgcc/config/gcn/t-gcn-hsa b/libgcc/config/gcn/t-gcn-hsa
new file mode 100644
index 0000000..1600a58
--- /dev/null
+++ b/libgcc/config/gcn/t-gcn-hsa
@@ -0,0 +1,52 @@
+#  Copyright (C) 2016-2019 Free Software Foundation, Inc.
+#
+#  This file is free software; you can redistribute it and/or modify it under
+#  the terms of the GNU General Public License as published by the Free
+#  Software Foundation; either version 3 of the License, or (at your option)
+#  any later version.
+#
+#  This file is distributed in the hope that it will be useful, but WITHOUT
+#  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+#  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with GCC; see the file COPYING3.  If not see
+#  <http://www.gnu.org/licenses/>.
+
+GTM_H += $(HASH_TABLE_H)
+
+driver-gcn.o: $(srcdir)/config/gcn/driver-gcn.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+CFLAGS-mkoffload.o += $(DRIVER_DEFINES) \
+	-DGCC_INSTALL_NAME=\"$(GCC_INSTALL_NAME)\"
+mkoffload.o: $(srcdir)/config/gcn/mkoffload.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+ALL_HOST_OBJS += mkoffload.o
+
+mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a \
+		      $(LIBIBERTY) $(LIBDEPS)
+	+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \
+	  mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS)
+
+CFLAGS-gcn-run.o += -DVERSION_STRING=$(PKGVERSION_s)
+COMPILE-gcn-run.o = $(filter-out -fno-rtti,$(COMPILE))
+gcn-run.o: $(srcdir)/config/gcn/gcn-run.c
+	$(COMPILE-gcn-run.o) -x c -std=gnu11 -Wno-error=pedantic $<
+	$(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-run.o
+
+gcn-run$(exeext): gcn-run.o
+	+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl
+
+MULTILIB_OPTIONS = march=gfx900 march=gfx906
+MULTILIB_DIRNAMES = gfx900 gfx906
+
+PASSES_EXTRA += $(srcdir)/config/gcn/gcn-passes.def
+gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-tree.o
diff --git a/libgcc/config/gcn/unwind-gcn.c b/libgcc/config/gcn/unwind-gcn.c
new file mode 100644
index 0000000..bfb7028
--- /dev/null
+++ b/libgcc/config/gcn/unwind-gcn.c
@@ -0,0 +1,37 @@
+/* Stub unwinding implementation.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Mentor Graphics
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "unwind.h"
+
+_Unwind_Reason_Code
+_Unwind_Backtrace (_Unwind_Trace_Fn trace, void *trace_argument)
+{
+  return 0;
+}
+
+_Unwind_Ptr
+_Unwind_GetIPInfo (struct _Unwind_Context *c, int *ip_before_insn)
+{
+  return 0;
+}
diff --git a/libgcc/configure b/libgcc/configure
index 36dbbc1..1cf3408 100644
--- a/libgcc/configure
+++ b/libgcc/configure
@@ -5542,6 +5542,7 @@
 case $target_thread_file in
     aix)	thread_header=config/rs6000/gthr-aix.h ;;
     dce)	thread_header=config/pa/gthr-dce.h ;;
+    gcn)	thread_header=config/gcn/gthr-gcn.h ;;
     lynx)	thread_header=config/gthr-lynx.h ;;
     mipssde)	thread_header=config/mips/gthr-mipssde.h ;;
     posix)	thread_header=gthr-posix.h ;;
diff --git a/libgfortran/ChangeLog.omp b/libgfortran/ChangeLog.omp
new file mode 100644
index 0000000..9836164
--- /dev/null
+++ b/libgfortran/ChangeLog.omp
@@ -0,0 +1,9 @@
+2019-06-25  Kwok Cheung Yeung  <kcy@codesourcery.com>
+	    Andrew Stubbs  <ams@codesourcery.com>
+
+	Backport from mainline:
+
+	libgfortran/
+	* configure: Regenerate.
+	* configure.ac (LIBGFOR_MINIMAL): Do not use on AMD GCN.
+
diff --git a/libgfortran/configure b/libgfortran/configure
index 60867b9..15c9307f 100755
--- a/libgfortran/configure
+++ b/libgfortran/configure
@@ -6179,8 +6179,7 @@
 # * C library support for other features such as signal, environment
 #   variables, time functions
 
- if test "x${target_cpu}" = xnvptx \
-				 || test "x${target_cpu}" = xamdgcn; then
+ if test "x${target_cpu}" = xnvptx; then
   LIBGFOR_MINIMAL_TRUE=
   LIBGFOR_MINIMAL_FALSE='#'
 else
diff --git a/libgfortran/configure.ac b/libgfortran/configure.ac
index 7cfce28..8dcc860 100644
--- a/libgfortran/configure.ac
+++ b/libgfortran/configure.ac
@@ -205,8 +205,7 @@
 # * C library support for other features such as signal, environment
 #   variables, time functions
 
-AM_CONDITIONAL(LIBGFOR_MINIMAL, [test "x${target_cpu}" = xnvptx \
-				 || test "x${target_cpu}" = xamdgcn])
+AM_CONDITIONAL(LIBGFOR_MINIMAL, [test "x${target_cpu}" = xnvptx])
 
 # Figure out whether the compiler supports "-ffunction-sections -fdata-sections",
 # similarly to how libstdc++ does it
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
new file mode 100644
index 0000000..8895786
--- /dev/null
+++ b/libgomp/ChangeLog.omp
@@ -0,0 +1,1443 @@
+2019-11-22  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* testsuite/libgomp.oacc-fortran/lib-16.f90: Fix async-safety issue.
+
+2019-10-16  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90: Use
+	oaccdevlow dump and update scanned output.
+	* testsuite/libgomp.oacc-fortran/gangprivate-attrib-2.f90: Likewise.
+	Add missing atomic to force worker partitioning for test variable.
+
+2019-10-16  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Support AMD GCN.
+
+2019-10-09  Tobias Burnus  <tobias@codesourcery.com>
+
+	* testsuite/libgomp.fortran/use_device_ptr1.f90: New.
+
+2019-09-20  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-fortran/privatized-ref-1.f95: New test.
+	* testsuite/libgomp.oacc-c++/privatized-ref-2.C: New test.
+	* testsuite/libgomp.oacc-c++/privatized-ref-3.C: New test.
+
+2019-09-19  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_host2dev):
+	Add EPHEMERAL parameter, and FIXME function comment.
+
+2019-09-18  Tobias Burnus  <tobias@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Fix dg-warning
+	line numbers.
+	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Likewise.
+
+2019-09-18  Tobias Burnus  <tobias@codesourcery.com>
+
+	* linux/gomp_print.c (gomp_print_integer): Use PRId64 if available,
+	otherwise cast for %ld.
+
+2019-09-17  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_host2dev): Update
+	prototype.
+	* libgomp.h (gomp_copy_host2dev): Update prototype.
+	* oacc-host.c (host_openacc_async_host2dev): Add ephemeral parameter.
+	* oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev.
+	(update_dev_host): Likewise.
+	* oacc-parallel.c (GOACC_enter_exit_data): Call async versions of
+	acc_attach/acc_detach/acc_detach_finalize functions.
+	* plugin/plugin-gcn.c (wait_for_queue_nonfull): Don't lock/unlock
+	aq->mutex here.
+	(queue_push_launch): Lock aq->mutex before calling
+	wait_for_queue_nonfull.
+	(queue_push_callback): Likewise.
+	(queue_push_asyncwait): Likewise.
+	(queue_push_placeholder): Likewise.
+	(GOMP_OFFLOAD_openacc_async_host2dev): Add ephemeral parameter.  Copy
+	source data to temporary space immediately if true, and pass to
+	queue_push_copy.
+	(goacc_device_copy_async): Remove.
+	(gomp_copy_host2dev): Add ephemeral parameter. Update function comment.
+	Call async host2dev plugin hook directly.
+	(gomp_copy_dev2host): Call async dev2host plugin hook directly.
+	(gomp_map_vars_existing, gomp_map_pointer, gomp_attach_pointer,
+	gomp_detach_pointer): Update calls to gomp_copy_host2dev.
+	(gomp_map_vars_internal): Don't use coalescing buffer for asynchronous
+	copies. Update calls to gomp_copy_host2dev.
+	(gomp_update): Update calls to gomp_copy_host2dev.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c (main): Fix
+	async-safety issue. Increase number of iterations.
+	* testsuite/libgomp.oacc-fortran/lib-16-2.f90: Fix async-safety issue.
+
+2019-09-17  Julian Brown  <julian@codesourcery.com>
+
+	* oacc-host.c (host_openacc_async_queue_callback): Invoke callback
+	function immediately.
+	* oacc-parallel.c (struct async_prof_callback_info, async_prof_dispatch,
+	queue_async_prof_dispatch): New.
+	(GOACC_parallel_keyed): Call queue_async_prof_dispatch for asynchronous
+	profile-event dispatches.
+	(GOACC_enter_exit_data): Likewise.
+	(GOACC_update): Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
+	(cb_compute_construct_start): Remove/fix TODO.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+	(cb_exit_data_start): Tweak expected state values.
+	(cb_exit_data_end): Likewise.
+	(cb_compute_construct_start): Remove/fix TODO.
+	(cb_compute_construct_end): Don't do adjustments for
+	acc_ev_enqueue_launch_start/acc_ev_enqueue_launch_end callbacks.
+	(cb_compute_construct_end): Tweak expected state values.
+	(cb_enqueue_launch_start, cb_enqueue_launch_end): Don't expect
+	launch-enqueue operations to happen synchronously with respect to
+	profiling events on async streams.
+	(main): Tweak expected state values.
+	* testsuite/libgomp.oacc-c-c++-common/lib-94.c (main): Reorder
+	operations for async-safety.
+
+2019-09-17  Julian Brown  <julian@codesourcery.com>
+
+	* target.c (gomp_map_vars_internal): Remove read of uninitialised
+	data.
+
+2019-09-17  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Only run
+	NVidia-specific test on NVidia hardware.
+	* testsuite/libgomp.oacc-c-c++-common/asyncwait-nop-1.c (main):
+	Initialise for acc_device_gcn if testing on AMD GCN.
+	* testsuite/libgomp.oacc-c-c++-common/function-not-offloaded.c: Support
+	AMD GCN.
+	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (check): Skip
+	vector dimension test for AMD GCN.
+
+2019-09-13  Tobias Burnus  <tobias@codesourcery.com>
+
+	* plugin/plugin-gcn.c (hsa_warn, hsa_fatal, hsa_error): Ensure
+	string is initialized.
+
+2019-09-10  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (GOMP_hsa_kernel_dispatch): Remove
+	omp_data_memory, kernel_dispatch_count, debug, omp_level,
+	children_dispatches and omp_num_threads fields.
+	(hsa_kernel_description): Remove omp_data_size, gridified_kernel_p,
+	kernel_dependencies_count, kernel_dependencies fields to match
+	mkoffload output.
+	(kernel_info): Remove omp_data_size, dependencies, dependencies_count,
+	max_omp_data_size and gridified_kernel_p fields.
+	(init_basic_kernel_info): Don't copy newly-deleted fields.
+	(create_single_kernel_dispatch): Remove omp_data_size parameter.
+	Remove write-only initialization of deleted GOMP_hsa_kernel_dispatch
+	fields.
+	(release_kernel_dispatch): Update debug output.  Don't free deleted
+	omp_data_memory field.
+	(init_single_kernel): Remove max_omp_data_size parameter. Remove deleted
+	fields from debug output.
+	(print_kernel_dispatch): Don't print deleted fields.
+	(create_kernel_dispatch): Remove omp_data_size parameter.
+	(init_kernel): Update calls to init_single_kernel and
+	create_kernel_dispatch.
+
+2019-09-10  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (struct placeholder, struct asyncwait_info,
+	enum entry_type): New.
+	(queue_entry): Use entry_type enum for tag.  Add asyncwait and
+	placeholder event type fields.
+	(wait_for_queue_nonfull): New function.
+	(queue_push_launch): Use above function instead of raising a fatal
+	error on queue-full condition.  Use KERNEL_LAUNCH instead of hardwired
+	0.
+	(queue_push_callback): Use wait_for_queue_nonfull instead of open-coded
+	wait sequence.  Use CALLBACK instead of hardwired 1.
+	(queue_push_asyncwait, queue_push_placeholder): New.
+	(execute_queue_entry): Implement ASYNC_WAIT and ASYNC_PLACEHOLDER event
+	types.
+	(GOMP_OFFLOAD_openacc_async_serialize): Use queue_push_placeholder and
+	queue_push_asyncwait instead of host-synchronized wait_queue calls.
+	* testsuite/libgomp.oacc-c-c++-common/data-2-lib.c (main): Add missing
+	asynchronous waits.
+	* testsuite/libgomp.oacc-c-c++-common/data-2.c (main): Likewise.
+
+2019-09-10  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (GOMP_OFFLOAD_openacc_async_host2dev): Enqueue
+	copy from src_copy not src.
+
+2019-09-10  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* config/gcn/team.c (gomp_gcn_exit_kernel): Free GCN thread list.
+
+2019-09-10  Andrew Stubbs  <ams@codesourcery.com>
+
+	* plugin/plugin-gcn.c (HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT): Define.
+	(dump_hsa_agent_info): Dump compute unit count.
+	(get_cu_count): New function.
+	(parse_target_attributes): Use get_cu_count for default gdims.
+	(gcn_exec): Likewise.
+
+2019-09-10  Andrew Stubbs  <ams@codesourcery.com>
+
+	* plugin/plugin-gcn.c (obstack_chunk_alloc): Delete.
+	(obstack_chunk_free): Delete.
+	(obstack.h): Remove include.
+	(create_and_finalize_hsa_program): Remove all unmodified_sections_os
+	and use sections directly from the issue.
+	Use "or 0x80" instead of SHT_NOTE to hide relocations, and then
+	simply recognise that ourselves.
+
+2019-09-09  Andrew Stubbs  <ams@codesourcery.com>
+
+	* plugin/plugin-gcn.c (struct hsa_runtime_fn_info): Add
+	hsa_memory_assign_agent_fn.
+	(struct agent_info): Add data_region.
+	(init_hsa_runtime_functions): Initialize hsa_memory_assign_agent.
+	(get_kernarg_memory_region): Move contents to new function ...
+	(get_memory_region): ... here.
+	(get_data_memory_region): New function.
+	(GOMP_OFFLOAD_get_property): Use data_region, not kernarg_region.
+	(GOMP_OFFLOAD_init_device): Initialize data_region.
+	(create_and_finalize_hsa_program): Use data_region, not
+	kernarg_region, and assign heap to device agent.
+	(GOMP_OFFLOAD_alloc_by_agent): Likewise.
+	(image_address_p): Delete function.
+	(struct copy_data): Remove use_hsa_memory_copy.
+	(copy_data): Always use hsa_memory_copy.
+	(queue_push_copy): Remove use_hsa_memory_copy.
+	(GOMP_OFFLOAD_dev2host): Always use hsa_memory_copy.
+	(GOMP_OFFLOAD_host2dev): Likewise.
+	(GOMP_OFFLOAD_dev2dev): Likewise.
+	(gcn_exec): Use hsa_memory_copy.
+	(GOMP_OFFLOAD_openacc_async_host2dev): Always use hsa_memory_copy.
+	(GOMP_OFFLOAD_openacc_async_dev2host): Likewise.
+
+2019-09-06  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (GOMP_OFFLOAD_alloc_by_agent,
+	GOMP_OFFLOAD_free, gcn_exec): Add profiling support.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Add GCN
+	support.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c: Likewise.
+
+2019-09-06  Julian Brown  <julian@codesourcery.com>
+
+	* config/gcn/target.c (omp_pause_resource, omp_pause_resource_all): New
+	functions, plus ialiases.
+
+2019-09-05  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (gcn_exec): Change default number of workers to
+	16.
+
+2019-09-05  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-fortran/lib-13.f90: End data region after
+	wait API calls.
+
+2019-08-13  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (queue_push_callback): Wait on queue-full
+	condition.
+
+2019-08-13  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (struct copy_data): Add using_src_copy field.
+	(copy_data): Free temporary buffer if using.
+	(queue_push_copy): Add using_src_copy parameter.
+	(GOMP_OFFLOAD_dev2dev, GOMP_OFFLOAD_async_dev2host): Update calls to
+	queue_push_copy.
+	(GOMP_OFFLOAD_async_host2dev): Likewise.  Allocate temporary buffer and
+	copy source data to it immediately.
+	* target.c (gomp_copy_host2dev): Update function comment.
+	(copy_host2dev_immediate): Remove.
+	(gomp_map_pointer, gomp_map_vars_internal): Replace calls to
+	copy_host2dev_immediate with calls to gomp_copy_host2dev.
+
+2019-08-08  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (gcn_exec): Use 1 for the default number of
+	workers.
+
+2019-08-08  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/configfrag.ac (amdgcn): Set tgt_plugin.
+	* testsuite/lib/libgomp.exp (offload_target_to_openacc_device_type):
+	Add AMD GCN support.
+	(check_effective_target_openacc_amdgcn_accel_selected): Test
+	offload_target instead of offload_target_openacc.
+	* testsuite/libgomp.oacc-c++/c++.exp (amdgcn*): Rename stanza to...
+	(gcn): ...this. Don't set tagopt redundantly here.
+	* testsuite/libgomp.oacc-c/c.exp (amdgcn*, gcn): Likewise.
+	* testsuite/libgomp.oacc-fortran/fortran.exp (amdgcn*, gcn): Likewise.
+	* configure: Regenerated.
+
+2019-08-08  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (GOMP_OFFLOAD_openacc_exec_params,
+	GOMP_OFFLOAD_openacc_async_exec_params): New functions.
+
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Use relative
+	line numbers for warning.
+	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Likewise.
+
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+
+	* config/nvptx/gomp_print.c (gomp_print_string, gomp_print_integer,
+	gomp_print_double): New.
+	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_construct): Add
+	dummy device parameter.
+
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp.map (GOMP_2.0.GOMP_4_BRANCH): Remove GOACC_parallel_keyed_v2.
+	* libgomp_g.h (GOACC_parallel_keyed_v2): Remove prototype.
+	* oacc-parallel.c (GOACC_parallel_keyed_internal): Rename to...
+	(GOACC_parallel_keyed): ...this.  Handle GOMP_LAUNCH_ARGS_EXPLODED
+	launch tag.  Remove previous wrapper functions.
+	(GOACC_parallel_keyed_v2): Remove.
+
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+	    Andrew Stubbs  <ams@codesourcery.com>
+
+	* Makefile.am (libgomp_la_SOURCES): Add gomp_print.c.
+	* Makefile.in: Regenerate.
+	* affinity-fmt.c: Rename calls to gomp_write_string from
+	gomp_print_string.
+	* config.h.in (PLUGIN_GCN): Add #undef.
+	* config/nvptx/libgomp-plugin.c: Rename to...
+	* config/accel/libgomp-plugin.c: ...this.
+	* config/nvptx/lock.c: Rename to...
+	* config/accel/lock.c: ...this.
+	* config/nvptx/mutex.c: Rename to...
+	* config/accel/mutex.c: ...this.
+	* config/nvptx/mutex.h: Rename to...
+	* config/accel/mutex.h: ...this.
+	* config/nvptx/oacc-async.c: Rename to...
+	* config/accel/oacc-async.c: ...this.
+	* config/nvptx/oacc-cuda.c: Rename to...
+	* config/accel/oacc-cuda.c: ...this.
+	* config/nvptx/oacc-host.c: Rename to...
+	* config/accel/oacc-host.c: ...this.
+	* config/nvptx/oacc-init.c: Rename to...
+	* config/accel/oacc-init.c: ...this.
+	* config/nvptx/oacc-mem.c: Rename to...
+	* config/accel/oacc-mem.c: ...this.
+	* config/nvptx/oacc-plugin.c: Rename to...
+	* config/accel/oacc-plugin.c: ...this.
+	* config/nvptx/omp-lock.h: Rename to...
+	* config/accel/omp-lock.h: ...this.
+	* config/nvptx/openacc.f90: Rename to...
+	* config/accel/openacc.f90: ...this.  Add acc_device_hsa and
+	acc_device_gcn.
+	* config/nvptx/pool.h: Rename to...
+	* config/accel/pool.h: ...this.
+	* config/nvptx/proc.c: Rename to...
+	* config/accel/proc.c: ...this.  Add omp_get_num_procs alias.
+	* config/nvptx/ptrlock.c: Rename to...
+	* config/accel/ptrlock.c: ...this.
+	* config/nvptx/ptrlock.h: Rename to...
+	* config/accel/ptrlock.h: ...this.
+	* config/nvptx/sem.c: Rename to...
+	* config/accel/sem.c: ...this.
+	* config/nvptx/sem.h: Rename to...
+	* config/accel/sem.h: ...this.
+	* config/nvptx/thread-stacksize.h: Rename to...
+	* config/accel/thread-stacksize.h: ...this.
+	* config/gcn/affinity-fmt.c: New.
+	* config/gcn/bar.c: New.
+	* config/gcn/bar.h: New.
+	* config/gcn/doacross.h: New.
+	* config/gcn/gomp_print.c: New.
+	* config/gcn/icv-device.c: New.
+	* config/gcn/simple-bar.h: New.
+	* config/gcn/target.c: New.
+	* config/gcn/task.c: New.
+	* config/gcn/team.c: New.
+	* config/gcn/time.c: New.
+	* config/linux/gomp_print.c: New.
+	* configure.ac (amdgcn*-*-*): Disable pthreads.
+	* configure: Regenerated.
+	* configure.tgt (nvptx*-*-*): Add 'accel' config_path.
+	(amdgcn*-*-*): Set config_path.
+	* fortran.c (omp_display_affinity_): Rename calls to gomp_write_string
+	from gomp_print_string.
+	* libgomp-plugin.h (enum offload_target_type): Add
+	OFFLOAD_TARGET_TYPE_GCN.
+	(GOMP_OFFLOAD_openacc_async_construct): Change parameter type to int.
+	* libgomp.h (gcn_thrs, set_gcn_thrs, gomp_thread): Add for __AMDGCN__.
+	(gomp_print_string): Rename to...
+	(gomp_write_string): ...this.
+	* libgomp.map (GOMP_4.5): Add gomp_rpint_string, gomp_print_integer,
+	gomp_print_double.
+	* oacc-async.c (lookup_goacc_asyncqueue): Pass target_id to async queue
+	construct function.
+	* oacc-host.c (host_openacc_async_construct): Add dummy device
+	parameter.
+	* oacc-init.c (name_of_acc_device_t): Add acc_device_gcn.
+	* oacc-int.h (goacc_thread): Add dummy implementation for __AMDGCN__.
+	* oacc-parallel.c (GOACC_enter_exit_data): Support acc_async_noval and
+	zero-length array sections.
+	* omp.h.in (gomp_print_string, gomp_print_integer, gomp_print_double):
+	Add prototypes.
+	* omp_lib.f90.in (gomp_print_string, gomp_print_integer,
+	gomp_print_double): Add interfaces.
+	* openacc.f90 (openacc_kinds): Add acc_device_gcn.  Bump
+	acc_device_current code.
+	* openacc.h (acc_device_t): Add acc_device_gcn, bump acc_device_current
+	code.
+	* openacc_lib.h (acc_device_hsa, acc_device_gcn): Add.
+	* plugin/Makefrag.am (PLUGIN_GCN): Support building GCN plugin.
+	* plugin/configfrag.am (PLUGIN_GCN, PLUGIN_GCN_CPPFLAGS,
+	PLUGIN_GCN_LDFLAGS, PLUGIN_GCN_LIBS): Add.  Add suport for GCN plugin.
+	* plugin/plugin-gcn.c: New.
+	* target.c (stdio.h): Include unconditionally.
+	(gomp_copy_host2dev): Add function comment.
+	(copy_host2dev_immediate): New function.
+	(gomp_map_pointer, gomp_map_vars_internal): Use
+	copy_host2dev_immediate where appropriate.
+	(offload_target_to_plugin_name): Support gcn.
+	* team.c (gomp_free_pool_helper): Support gcn.
+	* testsuite/Makefile.in: Regenerated.
+	* testsuite/lib/libgomp.exp
+	(check_effective_target_openacc_amdgcn_accel_present): New.
+	(check_effective_target_openacc_amdgcn_accel_selected): New.
+	* testsuite/libgomp.c/c.exp (generate_tests, test_lists,
+	generated_tests): New.
+	(tests): Add generated tests.
+	* testsuite/libgomp.c/for-1.h: New.
+	* testsuite/libgomp.c/for-2.h: New.
+	* testsuite/libgomp.c/for-3.h: New.
+	* testsuite/libgomp.c/for-3.list: New.
+	* testsuite/libgomp.c/for-5.c: New.
+	* testsuite/libgomp.c/for-5.list: New.
+	* testsuite/libgomp.c/for-6.c: New.
+	* testsuite/libgomp.c/for-6.list: New.
+	* testsuite/libgomp.c/target-print-1.c: New.
+	* testsuite/libgomp.fortran/target-print-1.f90: New.
+	* testsuite/libgomp.oacc-c++/c++.exp (amdgcn*): Add support for AMD GCN.
+	* testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c: Adjust for
+	portability.
+	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip unsuitable
+	test for AMD GCN.
+	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for
+	portability.
+	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/private-variables-2.c: New.
+	* testsuite/libgomp.oacc-c-c++-common/tile-1.c: Skip for AMD GCN.
+	* testsuite/libgomp.oacc-c/c.exp (amdgcn*): Add support for AMD GCN.
+	* testsuite/libgomp.oacc-c/offload-target-1.c: Add AMD GCN support.
+	* testsuite/libgomp.oacc-c/print-1.c: New.
+	* testsuite/libgomp.oacc-fortran/fortran.exp (amdgcn*): Add AMD GCN
+	support.
+	* testsuite/libgomp.oacc-fortran/atomic_capture-1.f90: Adjust for
+	portability.
+	* testsuite/libgomp.oacc-fortran/collapse-1.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/collapse-2.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/error_stop-1.f: Support AMD GCN.
+	* testsuite/libgomp.oacc-fortran/error_stop-2.f: Support AMD GCN.
+	* testsuite/libgomp.oacc-fortran/error_stop-3.f: Support AMD GCN.
+	* testsuite/libgomp.oacc-fortran/print-1.f90: New.
+
+2019-01-23  Thomas Schwinge <thomas@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Update.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* oacc-init.c (get_property_any): Add profiling code.
+
+2017-02-28  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* Makefile.am (libgomp_la_SOURCES): Add
+	oacc-profiling-acc_register_library.c.
+	* Makefile.in: Regenerate.
+	* libgomp.texi: Remove paragraph about acc_register_library.
+	* oacc-parallel.c (GOACC_parallel_keyed_internal): Set device_api for
+	profiling.
+	* oacc-profiling-acc_register_library.c: New file.
+	* oacc-profiling.c (goacc_profiling_initialize): Call
+	acc_register_library.  Avoid duplicate registration.
+	(acc_register_library): Remove.
+	* config/nvptx/oacc-profiling-acc_register_library.c:
+	New empty file.
+	* config/nvptx/oacc-profiling.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: Remove
+	call to acc_register_library.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c: Likewise.
+
+2019-05-17  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* acc_prof.h: New file.
+	* oacc-profiling.c: Likewise.
+	* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
+	Add these, respectively.
+	* Makefile.in: Regenerate.
+	* env.c (initialize_env): Call goacc_profiling_initialize.
+	* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
+	(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
+	* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
+	(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
+	* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
+	acc_prof_register, acc_prof_unregister, and acc_register_library.
+	(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
+	GOMP_PLUGIN_goacc_thread.
+	* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
+	prof_callbacks_enabled members.
+	(goacc_prof_enabled, goacc_profiling_initialize)
+	(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
+	(goacc_profiling_dispatch): Declare.
+	(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
+	(GOACC_PROFILING_SETUP_P): Define.
+	* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
+	(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
+	OpenACC Profiling Interface.
+	* oacc-cuda.c (acc_get_current_cuda_device)
+	(acc_get_current_cuda_context, acc_get_cuda_stream)
+	(acc_set_cuda_stream): Likewise.
+	* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
+	(acc_init, acc_set_device_type, acc_get_device_type)
+	(acc_get_device_num, goacc_lazy_initialize): Likewise.
+	* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
+	(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
+	(acc_unmap_data, present_create_copy, delete_copyout)
+	(update_dev_host): Likewise.
+	* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
+	(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
+	Likewise.
+	* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
+	(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
+	Likewise.
+	* libgomp.texi: Update.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
+	file.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
+	Likewise.
+
+2019-07-10  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* target.c (gomp_load_image_to_device): Allow the accelerator to
+	possess more offloaded functions than the host.
+
+2019-07-10  Julian Brown  <julian@codesourcery.com>
+
+	* oacc-parallel.c (GOACC_enter_exit_data): Fix optional arguments for
+	changes to clause stripping in enter data/exit data directives.
+	* testsuite/libgomp.oacc-fortran/class-ptr-param.f95: New test.
+	* testsuite/libgomp.oacc-fortran/classtypes-1.f95: New test.
+	* testsuite/libgomp.oacc-fortran/classtypes-2.f95: New test.
+	* testsuite/libgomp.oacc-fortran/derivedtype-1.f95: New test.
+	* testsuite/libgomp.oacc-fortran/derivedtype-2.f95: New test.
+	* testsuite/libgomp.oacc-fortran/multidim-slice.f95: New test.
+
+2019-05-28  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-fortran/gangprivate-attrib-2.f90: New test.
+
+2019-01-30  Andrew Jenner  <andrew@codesourcery.com>
+
+	* testsuite/libgomp.fortan/fortran.exp (lang_link_flags): Add
+	-lquadmath.
+	* testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Add
+	-lquadmath.
+
+2019-05-30  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* libgomp_g.h: Include stdint.h instead of gstdint.h.
+
+2019-05-20  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c: Expect
+	"optimized:" not "note:" in warnings.
+	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Fix typos in
+	warnings.
+
+2019-05-16  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/kernels-for-index-reuse-1.c: New
+	test.
+
+2019-05-16  Julian Brown  <julian@codesourcery.com>
+
+	* target.c (gomp_map_vars_async): Initialise KEY and OFFSET fields in
+	not-present case.
+
+2019-01-09  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp.texi: Update mentions of OpenACC version to 2.6.  Update
+	section numbers to match version 2.6 of the spec.
+	* openacc.f90 (openacc_version): Update to 201711.
+	* openacc_lib.h (openacc_version): Update to 201711.
+	* testsuite/libgomp.oacc-fortran/openacc_version-1.f: Update expected
+	openacc_version to 201711.
+	* testsuite/libgomp.oacc-fortran/openacc_version-2.f90: Likewise.
+
+2019-01-23  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
+	Update.
+	* testsuite/libgomp.oacc-c-c++-common/avoid-offloading-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/avoid-offloading-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/avoid-offloading-3.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/avoid-offloading-1.f: Likewise.
+	* testsuite/libgomp.oacc-fortran/avoid-offloading-2.f: Likewise.
+	* testsuite/libgomp.oacc-fortran/avoid-offloading-3.f: Likewise.
+	* testsuite/libgomp.oacc-fortran/initialize_kernels_loops.f90:
+	Likewise.
+
+2019-01-24  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c: New
+	file.
+
+2019-01-30  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c:
+	Add "-fopenacc-kernels=parloops".
+	* testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-empty.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-3.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-4.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-5.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-6.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-collapse.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-g.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-mod-not-zero.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-n.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-loop.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-2.f95: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop-data.f95: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-loop.f95: Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95:
+	Likewise.
+	* testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90:
+	Likewise.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* libgomp.h (gomp_device_descr): Add `get_property_func' member.
+	* libgomp-plugin.h (gomp_device_property_value): New union.
+	(gomp_device_property_value): New prototype.
+	* openacc.h (acc_device_t): Add `acc_device_current' enumeration
+	constant.
+	(acc_device_property_t): New enum.
+	(acc_get_property, acc_get_property_string): New prototypes.
+	* oacc-init.c (acc_get_device_type): Also assert on
+	`!acc_device_current' result.
+	(get_property_any, acc_get_property, acc_get_property_string):
+	New functions.
+	* openacc.f90 (openacc_kinds): From `iso_fortran_env' also
+	import `int64'.  Add `acc_device_current' and
+	`acc_property_memory', `acc_property_free_memory',
+	`acc_property_name', `acc_property_vendor' and
+	`acc_property_driver' constants.  Add `acc_device_property' data
+	type.
+	(openacc_internal): Add `acc_get_property' and
+	`acc_get_property_string' interfaces.  Add `acc_get_property_h',
+	`acc_get_property_string_h', `acc_get_property_l' and
+	`acc_get_property_string_l'.
+	(openacc_c_string): New module.
+	* oacc-host.c (host_get_property): New function.
+	(host_dispatch): Wire it.
+	* target.c (gomp_load_plugin_for_device): Handle `get_property'.
+	* libgomp.map (OACC_2.6): Add `acc_get_property',
+	`acc_get_property_h_', `acc_get_property_string' and
+	`acc_get_property_string_h_' symbols.
+	* libgomp.texi (OpenACC Runtime Library Routines): Add
+	`acc_get_property'.
+	(acc_get_property): New node.
+
+	* plugin/plugin-hsa.c (GOMP_OFFLOAD_get_property): New function.
+	* plugin/plugin-nvptx.c (CUDA_CALLS): Add `cuDeviceGetName',
+	`cuDeviceTotalMem', `cuDriverGetVersion' and `cuMemGetInfo'
+	calls.
+	(GOMP_OFFLOAD_get_property): New function.
+
+	* testsuite/libgomp.oacc-c-c++-common/acc-get-property.c: New
+	test.
+	* testsuite/libgomp.oacc-fortran/acc-get-property.f: New test.
+
+2018-12-19  Julian Brown  <julian@codesourcery.com>
+            Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* target.c (gomp_map_vars_async): Support GOMP_MAP_NO_ALLOC.
+
+	* testsuite/libgomp.oacc-c-c++-common/nocreate-1.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/nocreate-2.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/nocreate-3.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/nocreate-4.c: New test.
+	* testsuite/libgomp.oacc-fortran/nocreate-1.f90: New test.
+	* testsuite/libgomp.oacc-fortran/nocreate-2.f90: New test.
+
+2018-12-20  Maciej W. Rozycki  <macro@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: New test.
+
+2017-12-21  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* Makefile.am: Add libffi build dependency.
+	* configure.ac: Likewise.
+	* Makefile.in: Regenerate.
+	* config.h.in: Regenerate.
+	* configure: Regenerate.
+	* libgomp-plugin.h: Define GOMP_OFFLOAD_openacc_exec_params and
+	GOMP_OFFLOAD_openacc_async_exec_params.
+	* libgomp.h (acc_dispatch_t): Use them here.
+	* libgomp.map (GOACC_parallel_keyed_v2): Declare.
+	* libgomp_g.h (GOACC_parallel_keyed_v2): Likewise.
+	* oacc-host.c (host_openacc_exec_params): New function.
+	(host_openacc_async_exec_params): Likewise.
+	* oacc-parallel.c (goacc_call_host_fn): Likewise.
+	(GOACC_parallel_keyed_internal): Likewise.
+	(GOACC_parallel_keyed): Wrapper for GOACC_parallel_keyed_internal.
+	(GOACC_parallel_keyed_v2): Likewise.
+	* plugin/plugin-nvptx.c (nvptx_exec): Replace CUDeviceptr dp parameter
+	with void **kargs.
+	(openacc_exec_internal): New function.
+	(GOMP_OFFLOAD_openacc_exec_params): New function.
+	(GOMP_OFFLOAD_openacc_exec): Update to call openacc_exec_internal.
+	(openacc_async_exec_internal): New function.
+	(GOMP_OFFLOAD_openacc_async_exec_params): New function.
+	(GOMP_OFFLOAD_openacc_async_exec): Update call to
+	openacc_async_exec_internal.
+	* target.c (gomp_load_plugin_for_device): Handle
+	openacc_exec_params and openacc_async_exec_params.
+	* testsuite/Makefile.in: Regenerate.
+	* testsuite/libgomp.oacc-c-c++-common/combined-directives-1.c:
+	Xfail on offloaded targets.
+
+	* Makefile.def: Bootstrap module libffi. Add libffi dependency
+	to all-target-libgomp.
+	* Makefile.in: Regenerate.
+	* configure.ac: Add libffi to bootstrap_target_libs when libgomp
+	is bootstrapped.
+	* configure: Regenerate.
+
+2018-12-21  Gergö Barany  <gergo@codesourcery.com>
+
+	* libgomp.h (enum gomp_map_vars_kind): Add
+	GOMP_MAP_VARS_OPENACC_IF_PRESENT.
+	* oacc-parallel.c (GOACC_data_start): Handle
+	GOACC_FLAG_HOST_DATA_IF_PRESENT flag.
+	* target.c (gomp_map_vars_async): Handle
+	GOMP_MAP_VARS_OPENACC_IF_PRESENT mapping kind.
+	* testsuite/libgomp.oacc-c-c++-common/host_data-6.c: New test.
+
+2018-12-20  Gergö Barany  <gergo@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-1.c:
+	Add missing reduction clauses.
+	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-2.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-3.c:
+	Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c:
+	Likewise.
+
+2019-01-30  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* testsuite/libgomp.oacc-fortran/optional-cache.f95
+	* testsuite/libgomp.oacc-fortran/optional-data-copyin-by-value.f90
+	* testsuite/libgomp.oacc-fortran/optional-data-copyin.f90
+	* testsuite/libgomp.oacc-fortran/optional-data-copyout.f90
+	* testsuite/libgomp.oacc-fortran/optional-data-enter-exit.f90
+	* testsuite/libgomp.oacc-fortran/optional-declare.f90
+	* testsuite/libgomp.oacc-fortran/optional-firstprivate.f90
+	* testsuite/libgomp.oacc-fortran/optional-host_data.f90
+	* testsuite/libgomp.oacc-fortran/optional-nested-calls.f90
+	* testsuite/libgomp.oacc-fortran/optional-private.f90
+	* testsuite/libgomp.oacc-fortran/optional-reduction.f90
+	* testsuite/libgomp.oacc-fortran/optional-update-device.f90
+	* testsuite/libgomp.oacc-fortran/optional-update-host.f90
+
+2019-01-30  Kwok Cheung Yeung  <kcy@codesourcery.com>
+
+	* oacc-mem.c (update_dev_host): Return early if the host address
+	is NULL.
+	* testsuite/libgomp.oacc-c-c++-common/lib-43.c: Remove.
+	* testsuite/libgomp.oacc-c-c++-common/lib-47.c: Likewise.
+
+2018-12-11  Julian Brown  <julian@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/gang-private-1.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c: New test.
+	* testsuite/libgomp.oacc-c/pr85465.c: New test.
+	* testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90: New test.kk
+
+2019-03-19  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/lib-93.c: Adjust target selector.
+
+2018-09-20  Tom de Vries  <tdevries@suse.de>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
+	New variable.
+	(acc_init_1): Set acc_init_thread to pthread_self ().  Set
+	acc_init_state to initializing at the start, and to initialized at the
+	end.
+	(self_initializing_p): New function.
+	(acc_get_device_type): Return acc_device_none if called by thread that
+	is currently executing acc_init_1.
+
+2018-09-05  Cesar Philippidis  <cesar@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c: New
+	test.
+	* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c: New
+	test.
+
+2018-09-20  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust test case
+	to conform to the new behavior of the auto clause in OpenACC 2.5.
+
+2018-10-04  Cesar Philippidis  <cesar@codesourcery.com>
+            Julian Brown  <julian@codesourcery.com>
+
+	* oacc-mem.c (gomp_acc_declare_allocate): New function.
+	* oacc-parallel.c (GOACC_enter_exit_data): Handle
+	GOMP_MAP_DECLARE_{ALLOCATE,DEALLOCATE}.
+	* testsuite/libgomp.oacc-fortran/allocatable-array.f90: New test.
+	* testsuite/libgomp.oacc-fortran/allocatable-scalar.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90: New test.
+	* testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90: New test.
+
+2018-12-22  Cesar Philippidis  <cesar@codesourcery.com>
+            Julian Brown  <julian@codesourcery.com>
+
+	* oacc-parallel.c (GOACC_parallel_keyed): Handle
+	GOMP_MAP_FIRSTPRIVATE_INT host addresses.
+	* plugin/plugin-nvptx.c (nvptx_exec): Handle
+	GOMP_MAP_FIRSTPRIVATE_INT host addresses.
+	* testsuite/libgomp.oacc-c++/firstprivate-int.C: New test.
+	* testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c: New
+	test.
+	* testsuite/libgomp.oacc-fortran/firstprivate-int.f90: New test.
+
+2018-08-28  Julian Brown  <julian@codesourcery.com>
+            Cesar Philippidis  <cesar@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/pr70828.c: New test.
+	* testsuite/libgomp.oacc-fortran/implicit_copy.f90: New test.
+	* testsuite/libgomp.oacc-fortran/pr70828.f90: New test.
+	* testsuite/libgomp.oacc-fortran/pr70828-2.f90: New test.
+	* testsuite/libgomp.oacc-fortran/pr70828-3.f90: New test.
+	* testsuite/libgomp.oacc-fortran/pr70828-4.f90: New test.
+	* testsuite/libgomp.oacc-fortran/pr70828-5.f90: New test.
+	* testsuite/libgomp.oacc-fortran/pr70828-6.f90: New test.
+
+2018-10-05  Nathan Sidwell  <nathan@acm.org>
+	    Tom de Vries  <tdevries@suse.de>
+	    Thomas Schwinge  <thomas@codesourcery.com>
+	    Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: Add -w.
+	* testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-warn-1.c: New.
+	* testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c: Update.
+	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/mode-transitions.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/private-variables.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-7.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-g-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-fortran/par-reduction-2-1.f: Likewise.
+	* testsuite/libgomp.oacc-fortran/par-reduction-2-2.f: Likewise.
+	* testsuite/libgomp.oacc-fortran/pr84028.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/private-variables.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/routine-7.f90: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c: New.
+
+2018-10-22  James Norris  <jnorris@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+	    Tom de Vries  <tom@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Update parallel
+	regions to denote variables copyied in via acc enter data as
+	present.
+	* testsuite/libgomp.oacc-fortran/data-3.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/data-4.f90: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/subr.h: Reimplement.
+	* testsuite/libgomp.oacc-c-c++-common/subr.ptx: Regenerated PTX.
+	* testsuite/libgomp.oacc-c-c++-common/timer.h: Removed.
+	* testsuite/libgomp.oacc-c-c++-common/lib-69.c: Change async checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-70.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-71.c: Rework kernel i/f.
+	* testsuite/libgomp.oacc-c-c++-common/lib-72.c: Rework kernel i/f and
+	change async checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-73.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-74.c: Rework kernel i/f and
+	timing checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-75.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-76.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-78.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-80.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-82.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-93.c: New test.
+
+2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
+            Nathan Sidwell  <nathan@acm.org>
+            Julian Brown  <julian@codesourcery.com>
+
+        * libgomp.oacc-c-c++-common/par-reduction-3.c: New test.
+        * libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c: New test.
+        * libgomp.oacc-fortran/reduction-9.f90: New test.
+
+2018-06-29  Cesar Philippidis  <cesar@codesourcery.com>
+	    James Norris  <jnorris@codesourcery.com>
+
+	* oacc-parallel.c (GOACC_parallel_keyed): Handle Fortran deviceptr
+	clause.
+	(GOACC_data_start): Likewise.
+	* testsuite/libgomp.oacc-fortran/common-block-1.f90: New test.
+	* testsuite/libgomp.oacc-fortran/common-block-2.f90: New test.
+	* testsuite/libgomp.oacc-fortran/common-block-3.f90: New test.
+	* testsuite/libgomp.oacc-fortran/deviceptr-1.f90: New test.
+
+2019-02-12  Julian Brown <julian@codesourcery.com>
+
+	* oacc-cuda.c (acc_set_cuda_stream): Return 0 on error/invalid
+	arguments.
+	* testsuite/libgomp.oacc-c-c++-common/lib-84.c: Handle unnumbered
+	async stream being an alias for a numbered async stream.
+	* testsuite/libgomp.oacc-c-c++-common/lib-85.c: Likewise.
+
+2018-10-02  Thomas Schwinge  <thomas@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/routine-3.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/routine-nohost-1.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/routine-bind-nohost-1.c:
+	Update test.
+	* testsuite/libgomp.oacc-fortran/routine-6.f90: Likewise.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/da-1.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/da-2.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/da-3.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/da-4.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/da-utils.h: New test.
+
+2018-10-16  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* target.c (struct da_dim): New struct declaration.
+	(struct da_descr_type): Likewise.
+	(struct da_info): Likewise.
+	(gomp_dynamic_array_count_rows): New function.
+	(gomp_dynamic_array_compute_info): Likewise.
+	(gomp_dynamic_array_fill_rows_1): Likewise.
+	(gomp_dynamic_array_fill_rows): Likewise.
+	(gomp_dynamic_array_create_ptrblock): Likewise.
+	(gomp_map_vars): Add code to handle dynamic array map kinds.
+
+2019-01-31  Julian Brown  <julian@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c++/deep-copy-12.C: New test.
+	* testsuite/libgomp.oacc-c++/deep-copy-13.C: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-9.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-11.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-14.c: New test.
+
+2018-11-28  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp.h (RC_CHECKING): New macro, disabled by default, guarding all
+	hunks in this patch.
+	(target_mem_desc): Add forward declaration.
+	(async_tgt_use): New struct.
+	(target_mem_desc): Add refcount_chk, mark fields.
+	(acc_dispatch_t): Add tgt_uses, au_lock fields.
+	(dump_tgt, gomp_rc_check): Add prototypes.
+	* oacc-async (goacc_async_unmap_tgt): Add refcount self-check code.
+	(goacc_async_copyout_unmap_vars): Likewise.
+	(goacc_remove_var_async): Likewise.
+	* oacc-parallel.c (GOACC_parallel_keyed_internal): Add refcount
+	self-check code.
+	(GOACC_data_start, GOACC_data_end, GOACC_enter_exit_data): Likewise.
+	* target.c (stdio.h): Include.
+	(dump_tgt, rc_check_clear, rc_check_count, rc_check_verify)
+	(gomp_rc_check): New functions to consistency-check reference counts.
+	(gomp_target_init): Initialise self-check-related device fields.
+
+2018-12-14  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp.h (struct target_var_desc): Add do_detach flag.
+	(VREFCOUNT_LINK_KEY): New macro.
+	(struct splay_tree_key_s): Put link_key and new attach_count field into
+	a new union.  Substitute dynamic_refcount field for virtual_refcount.
+	(struct acc_dispatch_t): Remove data_environ field.
+	(enum gomp_map_vars_kind): Add GOMP_MAP_VARS_OPENACC_ENTER_DATA.
+	(gomp_acc_insert_pointer): Remove prototype.
+	(gomp_acc_remove_pointer): Update prototype.
+	(struct gomp_coalesce_buf): Add forward declaration.
+	(gomp_map_val, gomp_attach_pointer, gomp_detach_pointer): Add
+	prototypes.
+	* libgomp.map (OACC_2.6): New section. Add acc_attach, acc_attach_async,
+	acc_detach, acc_detach_async, acc_detach_finalize,
+	acc_detach_finalize_async.
+	* oacc-async.c (goacc_remove_var_async): New function.
+	* oacc-host.c (host_dispatch): Don't initialise removed data_environ
+	field.
+	* oacc-init.c (acc_shutdown_1): Use gomp_remove_var instead of
+	gomp_unmap_vars to remove mappings by splay tree key instead of target
+	memory descriptor.
+	* oacc-int.h (splay_tree_key_s): Add forward declaration.
+	(goacc_remove_var_async): Add prototype.
+	* oacc-mem.c (lookup_dev_1): New function.
+	(lookup_dev): Reimplement using above.
+	(acc_free, acc_hostptr): Update calls to lookup_dev.
+	(acc_map_data): Likewise.  Don't add to data_environ list.
+	(acc_unmap_data): Remove call to gomp_unmap_vars.  Fix semantics to
+	remove mapping, but not mapped data.
+	(present_create_copy): Use virtual_refcount instead of
+	dynamic_refcount.  Don't manipulate data_environ.  Fix target pointer
+	return value.
+	(delete_copyout): Update for virtual_refcount semantics.  Use
+	goacc_remove_var_async for asynchronous delete/copyouts.
+	(gomp_acc_insert_pointer): Remove function.
+	(gomp_acc_remove_pointer): Reimplement.
+	(acc_attach_async, acc_attach, goacc_detach_internal, acc_detach)
+	(acc_detach_async, acc_detach_finalize, acc_detach_finalize_async): New
+	functions.
+	* oacc-parallel.c (find_pointer): Support attach/detach.  Make a little
+	more strict.
+	(GOACC_parallel_keyed): Use gomp_map_val to calculate device addresses.
+	(GOACC_enter_exit_data): Support attach/detach and GOMP_MAP_STRUCT.
+	Don't call gomp_acc_insert_pointer.
+	* openacc.h (acc_attach, acc_attach_async, acc_detach)
+	(acc_detach_async, acc_detach_finalize, acc_detach_finalize_async): Add
+	prototypes.
+	* target.c (gomp_map_vars_existing): Initialise do_detach field of
+	tgt_var_desc.
+	(gomp_attach_pointer, gomp_detach_pointer): New functions.
+	(gomp_map_val): Make global.
+	(gomp_map_vars_async): Handle GOMP_MAP_VARS_OPENACC_ENTER_DATA.  Update
+	for virtual_refcount semantics.  Support attach and detach.
+	(gomp_remove_var): Free attach count array if present.
+	(gomp_unmap_vars_async): Support detach and update for virtual_refcount
+	semantics.  Disambiguate link_key/attach_count using virtual_refcount
+	with magic value as a tag.
+	(gomp_load_image_to_device): Zero-initialise virtual_refcount fields.
+	(gomp_free_memmap): Remove function.
+	(gomp_exit_data): Check virtual_refcount for tag value before using
+	link_key.
+	(omp_target_associate_ptr): Zero-initialise virtual_refcount and
+	link_key splay tree key fields.
+	(gomp_target_init): Don't initialise removed data_environ field.
+	* testsuite/libgomp.oacc-c-c++-common/context-2.c: Use correct API to
+	deallocate acc_copyin'd data.
+	* testsuite/libgomp.oacc-c-c++-common/context-4.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-2.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-3.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-4.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-5.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-6.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c: New test.
+	* testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-1.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-2.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-3.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-4.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-5.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-6.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-7.c: New test.
+	* testsuite/libgomp.oacc-fortran/deep-copy-8.c: New test.
+	* testsuite/libgomp.oacc-fortran/data-2.f90: Update test.
+	* testsuite/libgomp.oacc-fortran/derived-type-1.f90: New test.
+	* testsuite/libgomp.oacc-fortran/update-2.f90: New test.
+
+2018-11-10  Julian Brown  <julian@codesourcery.com>
+
+	* libgomp.h (OFFSET_INLINED, OFFSET_POINTER, OFFSET_STRUCT): Define.
+	* target.c (FIELD_TGT_EMPTY): Define.
+	(gomp_map_val): Use OFFSET_* macros instead of magic constants.  Write
+	as switch instead of list of ifs.
+	(gomp_map_vars_internal): Use OFFSET_* and FIELD_TGT_EMPTY macros.
+
+2019-02-26  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* target.c (goacc_device_copy_async): New function.
+	(gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter,
+	add goacc_device_copy_async case.
+	(gomp_copy_dev2host): Likewise.
+	(gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code.
+	(gomp_map_pointer): Likewise.
+	(gomp_map_fields_existing): Likewise.
+	(gomp_map_vars_internal): New always_inline function, renamed from
+	gomp_map_vars.
+	(gomp_map_vars): Implement by calling gomp_map_vars_internal.
+	(gomp_map_vars_async): Implement by calling gomp_map_vars_internal,
+	passing goacc_asyncqueue argument.
+	(gomp_unmap_tgt): Remove statis, add attribute_hidden.
+	(gomp_unmap_vars_internal): New always_inline function, renamed from
+	gomp_unmap_vars.
+	(gomp_unmap_vars): Implement by calling gomp_unmap_vars_internal.
+	(gomp_unmap_vars_async): Implement by calling gomp_unmap_vars_internal,
+	passing goacc_asyncqueue argument.
+	(gomp_fini_device): New function.
+	(gomp_exit_data): Adjust gomp_copy_dev2host call.
+	(gomp_load_plugin_for_device): Remove old interface, adjust to load
+	new async interface.
+	(gomp_target_fini): Adjust code to call gomp_fini_device.
+
+	* oacc-async.c (get_goacc_thread): New function.
+	(get_goacc_thread_device): New function.
+	(lookup_goacc_asyncqueue): New function.
+	(validate_async_val): New function.
+	(lookup_goacc_asyncqueue): New function.
+	(get_goacc_asyncqueue): New function.
+	(acc_async_test): Adjust code to use new async design.
+	(acc_async_test_all): Likewise.
+	(acc_wait): Likewise.
+	(acc_wait_async): Likewise.
+	(acc_wait_all): Likewise.
+	(acc_wait_all_async): Likewise.
+	(acc_get_default_async): New API function.
+	(acc_set_default_async): Likewise.
+	(goacc_async_unmap_tgt): New function.
+	(goacc_async_copyout_unmap_vars): Likewise.
+	(goacc_async_free): Likewise.
+	(goacc_init_asyncqueues): Likewise.
+	(goacc_fini_asyncqueues): Likewise.
+
+	* oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async
+	design.
+	(acc_set_cuda_stream): Likewise.
+
+	* oacc-host.c (host_version): Update to return GOMP_PLUGIN_IF_VERSION.
+	(host_openacc_exec): Adjust parameters, remove 'async'.
+	(host_openacc_register_async_cleanup): Remove.
+	(host_openacc_async_exec): New function.
+	(host_openacc_async_test): Adjust parameters.
+	(host_openacc_async_test_all): Remove.
+	(host_openacc_async_wait): Remove.
+	(host_openacc_async_wait_async): Remove.
+	(host_openacc_async_wait_all): Remove.
+	(host_openacc_async_wait_all_async): Remove.
+	(host_openacc_async_set_async): Remove.
+	(host_openacc_async_synchronize): New function.
+	(host_openacc_async_serialize): New function.
+	(host_openacc_async_host2dev): New function.
+	(host_openacc_async_dev2host): New function.
+	(host_openacc_async_queue_callback): New function.
+	(host_openacc_async_construct): New function.
+	(host_openacc_async_destruct): New function.
+	(struct gomp_device_descr host_dispatch): Remove initialization of old
+	interface, add intialization of new async sub-struct.
+
+	* oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device.
+	(goacc_attach_host_thread_to_device): Remove old async code usage, add
+	initialization of per-thread default_async.
+
+	* oacc-int.h (goacc_init_asyncqueues): New declaration.
+	(goacc_fini_asyncqueues): Likewise.
+	(goacc_async_free): Likewise.
+	(get_goacc_asyncqueue): Likewise.
+	(lookup_goacc_asyncqueue): Likewise.
+
+	* oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async
+	design.
+	(present_create_copy): Likewise.
+	(delete_copyout): Likewise.
+	(update_dev_host): Likewise.
+	(gomp_acc_insert_pointer): Add async parameter, adjust code to use new
+	async design.
+	(gomp_acc_remove_pointer): Adjust code to use new async design.
+
+	* oacc-parallel.c (GOACC_parallel_keyed): Likewise.
+	(GOACC_enter_exit_data): Likewise.
+	(goacc_wait): Likewise.
+	(GOACC_update): Likewise.
+
+	* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to assert fail
+	when called, warn as obsolete in comment.
+
+	* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
+	(struct goacc_asyncqueue_list): Likewise.
+	(goacc_aq): Likewise.
+	(goacc_aq_list): Likewise.
+	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
+	(GOMP_OFFLOAD_openacc_async_test): Remove.
+	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
+	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
+	(GOMP_OFFLOAD_openacc_exec): Adjust declaration.
+	(GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise.
+	(GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise.
+	(GOMP_OFFLOAD_openacc_async_exec): Declare.
+	(GOMP_OFFLOAD_openacc_async_construct): Declare.
+	(GOMP_OFFLOAD_openacc_async_destruct): Declare.
+	(GOMP_OFFLOAD_openacc_async_test): Declare.
+	(GOMP_OFFLOAD_openacc_async_synchronize): Declare.
+	(GOMP_OFFLOAD_openacc_async_serialize): Declare.
+	(GOMP_OFFLOAD_openacc_async_queue_callback): Declare.
+	(GOMP_OFFLOAD_openacc_async_host2dev): Declare.
+	(GOMP_OFFLOAD_openacc_async_dev2host): Declare.
+
+	* openacc.h (acc_async_t): Add acc_async_default enum.
+	(acc_set_default_async): Declare API function.
+	(acc_get_default_async): Likewise.
+
+	* libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct.
+	Delete register_async_cleanup_func, async_test_func,
+	async_test_all_func, async_wait_func, async_wait_async_func,
+	async_wait_all_func, async_wait_all_async_func, async_set_async_func
+	hook fields.
+	(gomp_acc_insert_pointer): Adjust declaration.
+	(gomp_copy_host2dev): New declaration.
+	(gomp_copy_dev2host): Likewise.
+	(gomp_map_vars_async): Likewise.
+	(gomp_unmap_tgt): Likewise.
+	(gomp_unmap_vars_async): Likewise.
+	(gomp_fini_device): Likewise.
+
+	* libgomp.map (OACC_2.5): Add acc_get_default_async,
+	acc_get_default_async_h_, acc_set_default_async, and
+	acc_set_default_async_h_.
+
+	* openacc.f90 (acc_async_default): Declare.
+	(acc_set_default_async): Likewise.
+	(acc_get_default_async): Likewise.
+
+	* openacc_lib.h (acc_async_default): Declare.
+	 (acc_set_default_async): Likewise.
+	(acc_get_default_async): Likewise.
+
+	* plugin/plugin-hsa.c (GOMP_OFFLOAD_version): Update to return
+	GOMP_PLUGIN_IF_VERSION.
+
+	* plugin/plugin-nvptx.c (struct cuda_map): Remove.
+	(struct ptx_stream): Remove.
+	(struct nvptx_thread): Remove current_stream field.
+	(cuda_map_create): Remove.
+	(cuda_map_destroy): Remove.
+	(map_init): Remove.
+	(map_fini): Remove.
+	(map_pop): Remove.
+	(map_push): Remove.
+	(struct goacc_asyncqueue): Define.
+	(struct nvptx_callback): Define.
+	(struct ptx_free_block): Define.
+	(struct ptx_device): Remove null_stream, active_streams, async_streams,
+	stream_lock, and next fields.
+	(enum ptx_event_type): Remove.
+	(struct ptx_event): Remove.
+	(ptx_event_lock): Remove.
+	(ptx_events): Remove.
+	(init_streams_for_device): Remove.
+	(fini_streams_for_device): Remove.
+	(select_stream_for_async): Remove.
+	(nvptx_init): Remove ptx_events and ptx_event_lock references.
+	(nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED
+	case.
+	(nvptx_open_device): Add free_blocks initialization, remove
+	init_streams_for_device call.
+	(nvptx_close_device): Remove fini_streams_for_device call, add
+	free_blocks destruct code.
+	(event_gc): Remove.
+	(event_add): Remove.
+	(nvptx_exec): Adjust parameters and code.
+	(nvptx_free): Likewise.
+	(nvptx_host2dev): Remove.
+	(nvptx_dev2host): Remove.
+	(nvptx_set_async): Remove.
+	(nvptx_async_test): Remove.
+	(nvptx_async_test_all): Remove.
+	(nvptx_wait): Remove.
+	(nvptx_wait_async): Remove.
+	(nvptx_wait_all): Remove.
+	(nvptx_wait_all_async): Remove.
+	(nvptx_get_cuda_stream): Remove.
+	(nvptx_set_cuda_stream): Remove.
+	(GOMP_OFFLOAD_alloc): Adjust code.
+	(GOMP_OFFLOAD_free): Likewise.
+	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
+	(GOMP_OFFLOAD_openacc_exec): Adjust parameters and code.
+	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
+	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
+	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
+	(cuda_free_argmem): New function.
+	(GOMP_OFFLOAD_openacc_async_exec): New plugin hook function.
+	(GOMP_OFFLOAD_openacc_create_thread_data): Adjust code.
+	(GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code.
+	(GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code.
+	(GOMP_OFFLOAD_openacc_async_construct): New plugin hook function.
+	(GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function.
+	(GOMP_OFFLOAD_openacc_async_test): Remove and re-implement.
+	(GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function.
+	(GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function.
+	(GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function.
+	(cuda_callback_wrapper): New function.
+	(cuda_memcpy_sanity_check): New function.
+	(GOMP_OFFLOAD_host2dev): Remove and re-implement.
+	(GOMP_OFFLOAD_dev2host): Remove and re-implement.
+	(GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function.
+	(GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function.
+
+	* testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c: New testcase.
+	* testsuite/libgomp.oacc-c-c++-common/lib-81.c: Adjust testcase.
+	* testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
+
+2018-05-20  Thomas Schwinge  <thomas@codesourcery.com>
+
+	PR libgomp/81886
+	* openacc.h (enum acc_device_t): Add _acc_device_intel_mic,
+	_acc_device_hsa.
+	* oacc-init.c (get_openacc_name): Handle these.
+	(resolve_device): Debugging output.
+	* target.c (resolve_device, gomp_init_device)
+	(gomp_offload_target_available_p): Likewise.
+	(GOMP_set_offload_targets): Rewrite.
+	* testsuite/libgomp.oacc-c++/c++.exp: Provide offload target in
+	"-DACC_DEVICE_TYPE_host", and "-DACC_DEVICE_TYPE_nvidia".
+	* testsuite/libgomp.oacc-c/c.exp: Likewise.
+	* testsuite/libgomp.oacc-fortran/fortran.exp: Likewise.
+	* testsuite/libgomp.oacc-c/offload-targets-1.c: New file.
+	* testsuite/libgomp.oacc-c/offload-targets-2.c: Likewise.
+	* testsuite/libgomp.oacc-c/offload-targets-3.c: Likewise.
+	* testsuite/libgomp.oacc-c/offload-targets-4.c: Likewise.
+	* testsuite/libgomp.oacc-c/offload-targets-5.c: Likewise.
+	* testsuite/libgomp.oacc-c/offload-targets-6.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Adjust.
+	* testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: Likewise.
+	* testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f: Likewise.
+	* testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f: Likewise.
+
+2017-05-14  Thomas Schwinge  <thomas@codesourcery.com>
+
+	PR libgomp/81886
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Adapt.
+
+2015-08-20  Thomas Schwinge  <thomas@codesourcery.com>
+	    Joseph Myers  <joseph@codesourcery.com>
+
+	PR libgomp/81886
+	* plugin/configfrag.ac (tgt_name): Do not set.
+	(offload_targets): Separate with colons not commas.
+	* config.h.in, configure: Regenerate.
+	* libgomp.h (gomp_offload_target_available_p): New prototype.
+	* libgomp.map (GOACC_2.0.GOMP_4_BRANCH): Add
+	GOMP_set_offload_targets.
+	* libgomp_g.h (GOMP_set_offload_targets): New prototype.
+	* oacc-init.c (resolve_device): Use
+	gomp_offload_target_available_p.
+	* target.c (resolve_device): Use host fallback when offload data
+	not available.
+	(gomp_offload_target_available_p, offload_target_to_plugin_name)
+	(gomp_offload_targets, gomp_offload_targets_init)
+	(GOMP_set_offload_targets, gomp_plugin_prefix)
+	(gomp_plugin_suffix): New.
+	(gomp_load_plugin_for_device): Add gomp_debug call.
+	(gomp_target_init): Use gomp_offload_targets instead of
+	OFFLOAD_TARGETS.  Handle and rewrie colon-separated string.
+	* testsuite/lib/libgomp.exp: Expect offload targets to be
+	colon-separated.  Adjust matching of offload targets.
+	(libgomp_init)
+	(check_effective_target_openacc_nvidia_accel_configured)
+	(check_effective_target_openacc_host_selected): Adjust checks of
+	offload target names.
+	* testsuite/libgomp.oacc-c++/c++.exp: Adjust set of offload
+	targets.  Use -foffload instead of setenv ACC_DEVICE_TYPE.
+	* testsuite/libgomp.oacc-c/c.exp: Likewise.
+	* testsuite/libgomp.oacc-fortran/fortran.exp: Likewise.
diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am
index 062fded..ba9b56c 100644
--- a/libgomp/Makefile.am
+++ b/libgomp/Makefile.am
@@ -14,9 +14,16 @@
 fincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)$(MULTISUBDIR)/finclude
 libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
 
+LIBFFI = @LIBFFI@
+LIBFFIINCS = @LIBFFIINCS@
+
+if USE_LIBFFI
+libgomp_la_LIBADD = $(LIBFFI)
+endif
+
 vpath % $(strip $(search_path))
 
-AM_CPPFLAGS = $(addprefix -I, $(search_path))
+AM_CPPFLAGS = $(addprefix -I, $(search_path)) $(LIBFFIINCS)
 AM_CFLAGS = $(XCFLAGS)
 AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
 
@@ -65,7 +72,8 @@
 	proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
 	splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
 	oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
-	affinity-fmt.c teams.c
+	affinity-fmt.c teams.c gomp_print.c oacc-profiling.c \
+	oacc-profiling-acc_register_library.c
 
 include $(top_srcdir)/plugin/Makefrag.am
 
@@ -74,7 +82,7 @@
 endif
 
 nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
 if USE_FORTRAN
 nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
 	openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
index c7e63f1..650437f 100644
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -16,7 +16,7 @@
 
 # Plugins for offload execution, Makefile.am fragment.
 #
-# Copyright (C) 2014-2018 Free Software Foundation, Inc.
+# Copyright (C) 2014-2019 Free Software Foundation, Inc.
 #
 # Contributed by Mentor Embedded.
 #
@@ -120,7 +120,8 @@
 target_triplet = @target@
 @PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la
 @PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la
-@USE_FORTRAN_TRUE@am__append_3 = openacc.f90
+@PLUGIN_GCN_TRUE@am__append_3 = libgomp-plugin-gcn.la
+@USE_FORTRAN_TRUE@am__append_4 = openacc.f90
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -180,15 +181,26 @@
 	"$(DESTDIR)$(toolexeclibdir)"
 LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
 am__DEPENDENCIES_1 =
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \
+@PLUGIN_GCN_TRUE@	$(am__DEPENDENCIES_1)
+@PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS =  \
+@PLUGIN_GCN_TRUE@	libgomp_plugin_gcn_la-plugin-gcn.lo
+libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+libgomp_plugin_gcn_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+	$(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(libgomp_plugin_gcn_la_LDFLAGS) $(LDFLAGS) -o $@
+@PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_rpath = -rpath \
+@PLUGIN_GCN_TRUE@	$(toolexeclibdir)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_DEPENDENCIES = libgomp.la \
 @PLUGIN_HSA_TRUE@	$(am__DEPENDENCIES_1)
 @PLUGIN_HSA_TRUE@am_libgomp_plugin_hsa_la_OBJECTS =  \
 @PLUGIN_HSA_TRUE@	libgomp_plugin_hsa_la-plugin-hsa.lo
 libgomp_plugin_hsa_la_OBJECTS = $(am_libgomp_plugin_hsa_la_OBJECTS)
-AM_V_lt = $(am__v_lt_@AM_V@)
-am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
-am__v_lt_0 = --silent
-am__v_lt_1 = 
 libgomp_plugin_hsa_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
 	$(libgomp_plugin_hsa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
 	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
@@ -207,7 +219,6 @@
 	$(libgomp_plugin_nvptx_la_LDFLAGS) $(LDFLAGS) -o $@
 @PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_rpath = -rpath \
 @PLUGIN_NVPTX_TRUE@	$(toolexeclibdir)
-libgomp_la_LIBADD =
 @USE_FORTRAN_TRUE@am__objects_1 = openacc.lo
 am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
 	env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \
@@ -217,7 +228,8 @@
 	target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
 	oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
 	oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
-	teams.lo $(am__objects_1)
+	teams.lo gomp_print.lo oacc-profiling.lo \
+	oacc-profiling-acc_register_library.lo $(am__objects_1)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -265,7 +277,8 @@
 am__v_FCLD_ = $(am__v_FCLD_@AM_DEFAULT_V@)
 am__v_FCLD_0 = @echo "  FCLD    " $@;
 am__v_FCLD_1 = 
-SOURCES = $(libgomp_plugin_hsa_la_SOURCES) \
+SOURCES = $(libgomp_plugin_gcn_la_SOURCES) \
+	$(libgomp_plugin_hsa_la_SOURCES) \
 	$(libgomp_plugin_nvptx_la_SOURCES) $(libgomp_la_SOURCES)
 AM_V_DVIPS = $(am__v_DVIPS_@AM_V@)
 am__v_DVIPS_ = $(am__v_DVIPS_@AM_DEFAULT_V@)
@@ -390,6 +403,8 @@
 INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
 LD = @LD@
 LDFLAGS = @LDFLAGS@
+LIBFFI = @LIBFFI@
+LIBFFIINCS = @LIBFFIINCS@
 LIBOBJS = @LIBOBJS@
 LIBS = @LIBS@
 LIBTOOL = @LIBTOOL@
@@ -427,6 +442,10 @@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
 PERL = @PERL@
+PLUGIN_GCN = @PLUGIN_GCN@
+PLUGIN_GCN_CPPFLAGS = @PLUGIN_GCN_CPPFLAGS@
+PLUGIN_GCN_LDFLAGS = @PLUGIN_GCN_LDFLAGS@
+PLUGIN_GCN_LIBS = @PLUGIN_GCN_LIBS@
 PLUGIN_HSA = @PLUGIN_HSA@
 PLUGIN_HSA_CPPFLAGS = @PLUGIN_HSA_CPPFLAGS@
 PLUGIN_HSA_LDFLAGS = @PLUGIN_HSA_LDFLAGS@
@@ -500,6 +519,7 @@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -523,10 +543,12 @@
 
 fincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)$(MULTISUBDIR)/finclude
 libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
-AM_CPPFLAGS = $(addprefix -I, $(search_path))
+@USE_LIBFFI_TRUE@libgomp_la_LIBADD = $(LIBFFI)
+AM_CPPFLAGS = $(addprefix -I, $(search_path)) $(LIBFFIINCS)
 AM_CFLAGS = $(XCFLAGS)
 AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
-toolexeclib_LTLIBRARIES = libgomp.la $(am__append_1) $(am__append_2)
+toolexeclib_LTLIBRARIES = libgomp.la $(am__append_1) $(am__append_2) \
+	$(am__append_3)
 nodist_toolexeclib_HEADERS = libgomp.spec
 
 # -Wc is only a libtool option.
@@ -551,7 +573,8 @@
 	affinity.c target.c splay-tree.c libgomp-plugin.c \
 	oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
 	oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
-	affinity-fmt.c teams.c $(am__append_3)
+	affinity-fmt.c teams.c gomp_print.c oacc-profiling.c \
+	oacc-profiling-acc_register_library.c $(am__append_4)
 
 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -574,8 +597,20 @@
 @PLUGIN_HSA_TRUE@	$(lt_host_flags) $(PLUGIN_HSA_LDFLAGS)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
+
+# AMD GCN plugin
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_GCN_CPPFLAGS) \
+@PLUGIN_GCN_TRUE@	-D_GNU_SOURCE
+
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LDFLAGS =  \
+@PLUGIN_GCN_TRUE@	$(libgomp_plugin_gcn_version_info) \
+@PLUGIN_GCN_TRUE@	$(lt_host_flags) $(PLUGIN_GCN_LDFLAGS)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LIBADD = libgomp.la $(PLUGIN_GCN_LIBS)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LIBTOOLFLAGS = --tag=disable-static
 nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
 @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
 @USE_FORTRAN_TRUE@	openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
 
@@ -710,6 +745,9 @@
 	  rm -f $${locs}; \
 	}
 
+libgomp-plugin-gcn.la: $(libgomp_plugin_gcn_la_OBJECTS) $(libgomp_plugin_gcn_la_DEPENDENCIES) $(EXTRA_libgomp_plugin_gcn_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libgomp_plugin_gcn_la_LINK) $(am_libgomp_plugin_gcn_la_rpath) $(libgomp_plugin_gcn_la_OBJECTS) $(libgomp_plugin_gcn_la_LIBADD) $(LIBS)
+
 libgomp-plugin-hsa.la: $(libgomp_plugin_hsa_la_OBJECTS) $(libgomp_plugin_hsa_la_DEPENDENCIES) $(EXTRA_libgomp_plugin_hsa_la_DEPENDENCIES) 
 	$(AM_V_CCLD)$(libgomp_plugin_hsa_la_LINK) $(am_libgomp_plugin_hsa_la_rpath) $(libgomp_plugin_hsa_la_OBJECTS) $(libgomp_plugin_hsa_la_LIBADD) $(LIBS)
 
@@ -735,11 +773,13 @@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gomp_print.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv-device.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
@@ -753,6 +793,8 @@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling-acc_register_library.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
@@ -790,6 +832,13 @@
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
 
+libgomp_plugin_gcn_la-plugin-gcn.lo: plugin/plugin-gcn.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-plugin-gcn.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Tpo -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='plugin/plugin-gcn.c' object='libgomp_plugin_gcn_la-plugin-gcn.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
+
 libgomp_plugin_hsa_la-plugin-hsa.lo: plugin/plugin-hsa.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_hsa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_hsa_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_hsa_la-plugin-hsa.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Tpo -c -o libgomp_plugin_hsa_la-plugin-hsa.lo `test -f 'plugin/plugin-hsa.c' || echo '$(srcdir)/'`plugin/plugin-hsa.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Tpo $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Plo
diff --git a/libgomp/acc_prof.h b/libgomp/acc_prof.h
new file mode 100644
index 0000000..c7a2197
--- /dev/null
+++ b/libgomp/acc_prof.h
@@ -0,0 +1,252 @@
+/* OpenACC Profiling Interface
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   Contributed by Mentor, a Siemens Business.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ACC_PROF_H
+#define _ACC_PROF_H 1
+
+
+/* The OpenACC specification doesn't say so explicitly, but as its Profiling
+   Interface explicitly makes use of, for example, <openacc.h>'s
+   'acc_device_t', we supposedly are to '#include' that file here.  */
+
+#include <openacc.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Events.  */
+
+typedef enum acc_event_t
+{
+  acc_ev_none = 0,
+  acc_ev_device_init_start,
+  acc_ev_device_init_end,
+  acc_ev_device_shutdown_start,
+  acc_ev_device_shutdown_end,
+  acc_ev_runtime_shutdown,
+  acc_ev_create,
+  acc_ev_delete,
+  acc_ev_alloc,
+  acc_ev_free,
+  acc_ev_enter_data_start,
+  acc_ev_enter_data_end,
+  acc_ev_exit_data_start,
+  acc_ev_exit_data_end,
+  acc_ev_update_start,
+  acc_ev_update_end,
+  acc_ev_compute_construct_start,
+  acc_ev_compute_construct_end,
+  acc_ev_enqueue_launch_start,
+  acc_ev_enqueue_launch_end,
+  acc_ev_enqueue_upload_start,
+  acc_ev_enqueue_upload_end,
+  acc_ev_enqueue_download_start,
+  acc_ev_enqueue_download_end,
+  acc_ev_wait_start,
+  acc_ev_wait_end,
+  acc_ev_last
+} acc_event_t;
+
+
+/* Callbacks Signature.  */
+
+/* "The datatype 'ssize_t' means a signed 32-bit integer for a 32-bit binary
+   and a 64-bit integer for a 64-bit binary".  */
+typedef signed long int _acc_prof_ssize_t;
+/* "The datatype 'size_t' means an unsigned 32-bit integer for a 32-bit binary
+   and a 64-bit integer for a 64-bit binary".  */
+typedef unsigned long int _acc_prof_size_t;
+/* "The datatype 'int' means a 32-bit integer for both 32-bit and 64-bit
+   binaries".  */
+typedef int _acc_prof_int_t;
+
+/* Internal helpers: a struct's 'valid_bytes' may be less than its 'sizeof'.  */
+#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
+  offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
+#if 0 /* Untested.  */
+#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
+  ((_n - 1) * sizeof (_type) + (_valid_bytes_type))
+#endif
+#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
+  (sizeof (_basictype))
+
+typedef struct acc_prof_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  _acc_prof_int_t version;
+  acc_device_t device_type;
+  _acc_prof_int_t device_number;
+  _acc_prof_int_t thread_id;
+  _acc_prof_ssize_t async;
+  _acc_prof_ssize_t async_queue;
+  const char *src_file;
+  const char *func_name;
+  _acc_prof_int_t line_no, end_line_no;
+  _acc_prof_int_t func_line_no, func_end_line_no;
+#define _ACC_PROF_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
+} acc_prof_info;
+
+/* We implement the OpenACC 2.6 Profiling Interface.  */
+
+#define _ACC_PROF_INFO_VERSION 201711
+
+typedef enum acc_construct_t
+{
+  acc_construct_parallel = 0,
+  acc_construct_kernels,
+  acc_construct_loop,
+  acc_construct_data,
+  acc_construct_enter_data,
+  acc_construct_exit_data,
+  acc_construct_host_data,
+  acc_construct_atomic,
+  acc_construct_declare,
+  acc_construct_init,
+  acc_construct_shutdown,
+  acc_construct_set,
+  acc_construct_update,
+  acc_construct_routine,
+  acc_construct_wait,
+  acc_construct_runtime_api,
+  acc_construct_serial
+} acc_construct_t;
+
+typedef struct acc_data_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *var_name;
+  _acc_prof_size_t bytes;
+  const void *host_ptr;
+  const void *device_ptr;
+#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_data_event_info;
+
+typedef struct acc_launch_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *kernel_name;
+  _acc_prof_size_t num_gangs, num_workers, vector_length;
+#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
+} acc_launch_event_info;
+
+typedef struct acc_other_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_other_event_info;
+
+typedef union acc_event_info
+{
+  acc_event_t event_type;
+  acc_data_event_info data_event;
+  acc_launch_event_info launch_event;
+  acc_other_event_info other_event;
+} acc_event_info;
+
+typedef enum acc_device_api
+{
+  acc_device_api_none = 0,
+  acc_device_api_cuda,
+  acc_device_api_opencl,
+  acc_device_api_coi,
+  acc_device_api_other
+} acc_device_api;
+
+typedef struct acc_api_info
+{
+  acc_device_api device_api;
+  _acc_prof_int_t valid_bytes;
+  acc_device_t device_type;
+  _acc_prof_int_t vendor;
+  const void *device_handle;
+  const void *context_handle;
+  const void *async_handle;
+#define _ACC_API_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_api_info;
+
+/* Don't tag 'acc_prof_callback' as '__GOACC_NOTHROW': these functions are
+   provided by user code, and must be expected to do anything.  */
+typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
+				   acc_api_info *);
+
+
+/* Loading the Library.  */
+
+typedef enum acc_register_t
+{
+  acc_reg = 0,
+  acc_toggle = 1,
+  acc_toggle_per_thread = 2
+} acc_register_t;
+
+typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
+extern void acc_prof_register (acc_event_t, acc_prof_callback,
+			       acc_register_t) __GOACC_NOTHROW;
+extern void acc_prof_unregister (acc_event_t, acc_prof_callback,
+				 acc_register_t) __GOACC_NOTHROW;
+typedef void (*acc_query_fn) ();
+typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
+extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
+/* Don't tag 'acc_register_library' as '__GOACC_NOTHROW': this function can be
+   overridden by user code, and must be expected to do anything.  */
+extern void acc_register_library (acc_prof_reg, acc_prof_reg,
+				  acc_prof_lookup_func);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _ACC_PROF_H */
diff --git a/libgomp/acinclude.m4 b/libgomp/acinclude.m4
index 51f4f30..9a15c0f 100644
--- a/libgomp/acinclude.m4
+++ b/libgomp/acinclude.m4
@@ -154,7 +154,7 @@
   fi
   changequote(,)
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
   changequote([,])
   libgomp_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) [$]3=0; print ([$]1*100+[$]2)*100+[$]3 }'`
diff --git a/libgomp/affinity-fmt.c b/libgomp/affinity-fmt.c
index 61417c9..09a66f0 100644
--- a/libgomp/affinity-fmt.c
+++ b/libgomp/affinity-fmt.c
@@ -38,7 +38,7 @@
 #endif
 
 bool
-gomp_print_string (const char *str, size_t len)
+gomp_write_string (const char *str, size_t len)
 {
   return fwrite (str, 1, len, stderr) != len;
 }
@@ -462,13 +462,13 @@
   if (ret < sizeof buf)
     {
       buf[ret] = '\n';
-      gomp_print_string (buf, ret + 1);
+      gomp_write_string (buf, ret + 1);
       return;
     }
   b = gomp_malloc (ret + 1);
   ialias_call (omp_capture_affinity) (b, ret + 1, format);
   b[ret] = '\n';
-  gomp_print_string (b, ret + 1);
+  gomp_write_string (b, ret + 1);
   free (b);
 }
 
@@ -483,13 +483,13 @@
   if (ret < sizeof buf)
     {
       buf[ret] = '\n';
-      gomp_print_string (buf, ret + 1);
+      gomp_write_string (buf, ret + 1);
       return;
     }
   b = gomp_malloc (ret + 1);
   gomp_display_affinity (b, ret + 1, gomp_affinity_format_var,
   			 handle, ts, place);
   b[ret] = '\n';
-  gomp_print_string (b, ret + 1);
+  gomp_write_string (b, ret + 1);
   free (b);
 }
diff --git a/libgomp/config.h.in b/libgomp/config.h.in
index 73f1b12..778a59e 100644
--- a/libgomp/config.h.in
+++ b/libgomp/config.h.in
@@ -146,8 +146,8 @@
    */
 #undef LT_OBJDIR
 
-/* Define to offload plugins, separated by commas. */
-#undef OFFLOAD_PLUGINS
+/* Define to offload targets, separated by colons. */
+#undef OFFLOAD_TARGETS
 
 /* Name of package */
 #undef PACKAGE
@@ -170,6 +170,9 @@
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
+/* Define to 1 if the GCN plugin is built, 0 if not. */
+#undef PLUGIN_GCN
+
 /* Define to 1 if the HSA plugin is built, 0 if not. */
 #undef PLUGIN_HSA
 
@@ -210,5 +213,8 @@
 /* Define to 1 if the target use emutls for thread-local storage. */
 #undef USE_EMUTLS
 
+/* Define to 1 if the target requires libffi to call the offloaded funtions. */
+#undef USE_LIBFFI
+
 /* Version number of package */
 #undef VERSION
diff --git a/libgomp/config/nvptx/libgomp-plugin.c b/libgomp/config/accel/libgomp-plugin.c
similarity index 100%
rename from libgomp/config/nvptx/libgomp-plugin.c
rename to libgomp/config/accel/libgomp-plugin.c
diff --git a/libgomp/config/nvptx/lock.c b/libgomp/config/accel/lock.c
similarity index 100%
rename from libgomp/config/nvptx/lock.c
rename to libgomp/config/accel/lock.c
diff --git a/libgomp/config/nvptx/mutex.c b/libgomp/config/accel/mutex.c
similarity index 100%
rename from libgomp/config/nvptx/mutex.c
rename to libgomp/config/accel/mutex.c
diff --git a/libgomp/config/nvptx/mutex.h b/libgomp/config/accel/mutex.h
similarity index 100%
rename from libgomp/config/nvptx/mutex.h
rename to libgomp/config/accel/mutex.h
diff --git a/libgomp/config/nvptx/oacc-async.c b/libgomp/config/accel/oacc-async.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-async.c
rename to libgomp/config/accel/oacc-async.c
diff --git a/libgomp/config/nvptx/oacc-cuda.c b/libgomp/config/accel/oacc-cuda.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-cuda.c
rename to libgomp/config/accel/oacc-cuda.c
diff --git a/libgomp/config/nvptx/oacc-host.c b/libgomp/config/accel/oacc-host.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-host.c
rename to libgomp/config/accel/oacc-host.c
diff --git a/libgomp/config/nvptx/oacc-init.c b/libgomp/config/accel/oacc-init.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-init.c
rename to libgomp/config/accel/oacc-init.c
diff --git a/libgomp/config/nvptx/oacc-mem.c b/libgomp/config/accel/oacc-mem.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-mem.c
rename to libgomp/config/accel/oacc-mem.c
diff --git a/libgomp/config/nvptx/oacc-plugin.c b/libgomp/config/accel/oacc-plugin.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-plugin.c
rename to libgomp/config/accel/oacc-plugin.c
diff --git a/libgomp/config/nvptx/omp-lock.h b/libgomp/config/accel/omp-lock.h
similarity index 100%
rename from libgomp/config/nvptx/omp-lock.h
rename to libgomp/config/accel/omp-lock.h
diff --git a/libgomp/config/nvptx/openacc.f90 b/libgomp/config/accel/openacc.f90
similarity index 95%
rename from libgomp/config/nvptx/openacc.f90
rename to libgomp/config/accel/openacc.f90
index a7f690e..c2331d8 100644
--- a/libgomp/config/nvptx/openacc.f90
+++ b/libgomp/config/accel/openacc.f90
@@ -51,6 +51,8 @@
   ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
   integer (acc_device_kind), parameter :: acc_device_not_host = 4
   integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+  integer (acc_device_kind), parameter :: acc_device_hsa = 6
+  integer (acc_device_kind), parameter :: acc_device_gcn = 7
 
 end module
 
diff --git a/libgomp/config/nvptx/pool.h b/libgomp/config/accel/pool.h
similarity index 100%
rename from libgomp/config/nvptx/pool.h
rename to libgomp/config/accel/pool.h
diff --git a/libgomp/config/nvptx/proc.c b/libgomp/config/accel/proc.c
similarity index 97%
rename from libgomp/config/nvptx/proc.c
rename to libgomp/config/accel/proc.c
index 8ca0b0a..fb2d483 100644
--- a/libgomp/config/nvptx/proc.c
+++ b/libgomp/config/accel/proc.c
@@ -39,3 +39,4 @@
 {
   return gomp_icv (false)->nthreads_var;
 }
+ialias (omp_get_num_procs)
diff --git a/libgomp/config/nvptx/ptrlock.c b/libgomp/config/accel/ptrlock.c
similarity index 100%
rename from libgomp/config/nvptx/ptrlock.c
rename to libgomp/config/accel/ptrlock.c
diff --git a/libgomp/config/nvptx/ptrlock.h b/libgomp/config/accel/ptrlock.h
similarity index 100%
rename from libgomp/config/nvptx/ptrlock.h
rename to libgomp/config/accel/ptrlock.h
diff --git a/libgomp/config/nvptx/sem.c b/libgomp/config/accel/sem.c
similarity index 100%
rename from libgomp/config/nvptx/sem.c
rename to libgomp/config/accel/sem.c
diff --git a/libgomp/config/nvptx/sem.h b/libgomp/config/accel/sem.h
similarity index 100%
rename from libgomp/config/nvptx/sem.h
rename to libgomp/config/accel/sem.h
diff --git a/libgomp/config/nvptx/thread-stacksize.h b/libgomp/config/accel/thread-stacksize.h
similarity index 100%
rename from libgomp/config/nvptx/thread-stacksize.h
rename to libgomp/config/accel/thread-stacksize.h
diff --git a/libgomp/config/gcn/affinity-fmt.c b/libgomp/config/gcn/affinity-fmt.c
new file mode 100644
index 0000000..3585f41
--- /dev/null
+++ b/libgomp/config/gcn/affinity-fmt.c
@@ -0,0 +1,51 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRIx64.  */
+#endif
+#ifdef HAVE_UNAME
+#include <sys/utsname.h>
+#endif
+
+/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for nvptx,
+   while the nvptx newlib implementation does not support those functions.
+   Override the configure test results here.  */
+#undef HAVE_GETPID
+#undef HAVE_GETHOSTNAME
+
+/* The GCN newlib implementation does not support fwrite, but it does support
+   write.  Map fwrite to write.  */
+#undef fwrite
+#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size))
+
+#include "../../affinity-fmt.c"
+
diff --git a/libgomp/config/gcn/bar.c b/libgomp/config/gcn/bar.c
new file mode 100644
index 0000000..592dacd
--- /dev/null
+++ b/libgomp/config/gcn/bar.c
@@ -0,0 +1,230 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is an AMD GCN specific implementation of a barrier synchronization
+   mechanism for libgomp.  This type is private to the library.  This
+   implementation uses atomic instructions and s_barrier instruction.  It
+   uses MEMMODEL_RELAXED here because barriers are within workgroups and
+   therefore don't need to flush caches.  */
+
+#include <limits.h>
+#include "libgomp.h"
+
+
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+			MEMMODEL_RELAXED);
+    }
+  asm ("s_barrier" ::: "memory");
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that if the encountering thread
+   is not the last one to hit the barrier, it returns immediately.
+   The intended usage is that a thread which intends to gomp_barrier_destroy
+   this barrier calls gomp_barrier_wait, while all other threads
+   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
+   the barrier can be safely destroyed.  */
+
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+  /* Deferring to gomp_barrier_wait does not use the optimization opportunity
+     allowed by the interface contract for all-but-last participants.  The
+     original implementation in config/linux/bar.c handles this better.  */
+  gomp_barrier_wait (bar);
+}
+
+void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+  asm ("s_barrier" ::: "memory");
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  state &= ~BAR_WAS_LAST;
+	}
+      else
+	{
+	  state &= ~BAR_CANCELLED;
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
+	  asm ("s_barrier" ::: "memory");
+	  return;
+	}
+    }
+
+  generation = state;
+  state &= ~BAR_CANCELLED;
+  int retry = 100;
+  do
+    {
+      if (retry-- == 0)
+	{
+	  /* It really shouldn't happen that barriers get out of sync, but
+	     if they do then this will loop until they realign, so we need
+	     to avoid an infinite loop where the thread just isn't there.  */
+	  gomp_print_string ("Barrier sync failed (another thread died?);",
+			     " aborting.");
+	  abort();
+	}
+
+      asm ("s_barrier" ::: "memory");
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	}
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+				   gomp_barrier_state_t state)
+{
+  unsigned int generation, gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      /* BAR_CANCELLED should never be set in state here, because
+	 cancellation means that at least one of the threads has been
+	 cancelled, thus on a cancellable barrier we should never see
+	 all threads to arrive.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  state &= ~BAR_WAS_LAST;
+	}
+      else
+	{
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
+	  asm ("s_barrier" ::: "memory");
+	  return false;
+	}
+    }
+
+  if (__builtin_expect (state & BAR_CANCELLED, 0))
+    return true;
+
+  generation = state;
+  int retry = 100;
+  do
+    {
+      if (retry-- == 0)
+	{
+	  /* It really shouldn't happen that barriers get out of sync, but
+	     if they do then this will loop until they realign, so we need
+	     to avoid an infinite loop where the thread just isn't there.  */
+	  gomp_print_string ("Barrier sync failed (another thread died?);",
+			     " aborting.");
+	  abort();
+	}
+
+      asm ("s_barrier" ::: "memory");
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+      if (__builtin_expect (gen & BAR_CANCELLED, 0))
+	return true;
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+	}
+      generation |= gen & BAR_WAITING_FOR_TASK;
+    }
+  while (gen != state + BAR_INCR);
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+  gomp_team_barrier_wake (&team->barrier, INT_MAX);
+}
diff --git a/libgomp/config/gcn/bar.h b/libgomp/config/gcn/bar.h
new file mode 100644
index 0000000..ec8851b
--- /dev/null
+++ b/libgomp/config/gcn/bar.h
@@ -0,0 +1,168 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is an AMD GCN specific implementation of a barrier synchronization
+   mechanism for libgomp.  This type is private to the library.  This
+   implementation uses atomic instructions and s_barrier instruction.  It
+   uses MEMMODEL_RELAXED here because barriers are within workgroups and
+   therefore don't need to flush caches.  */
+
+#ifndef GOMP_BARRIER_H
+#define GOMP_BARRIER_H 1
+
+#include "mutex.h"
+
+typedef struct
+{
+  unsigned total;
+  unsigned generation;
+  unsigned awaited;
+  unsigned awaited_final;
+} gomp_barrier_t;
+
+typedef unsigned int gomp_barrier_state_t;
+
+/* The generation field contains a counter in the high bits, with a few
+   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
+   share space because WAS_LAST is never stored back to generation.  */
+#define BAR_TASK_PENDING	1
+#define BAR_WAS_LAST		1
+#define BAR_WAITING_FOR_TASK	2
+#define BAR_CANCELLED		4
+#define BAR_INCR		8
+
+static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+{
+  bar->total = count;
+  bar->awaited = count;
+  bar->awaited_final = count;
+  bar->generation = 0;
+}
+
+static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
+{
+  __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_RELAXED);
+  bar->total = count;
+}
+
+static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
+{
+}
+
+extern void gomp_barrier_wait (gomp_barrier_t *);
+extern void gomp_barrier_wait_last (gomp_barrier_t *);
+extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+extern void gomp_team_barrier_wait (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
+					gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+					       gomp_barrier_state_t);
+extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_start (gomp_barrier_t *bar)
+{
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+  ret &= -BAR_INCR | BAR_CANCELLED;
+  /* A memory barrier is needed before exiting from the various forms
+     of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
+     2.8.6 flush Construct, which says there is an implicit flush during
+     a barrier region.  This is a convenient place to add the barrier,
+     so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE.  */
+  if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_RELAXED) == 0)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+  return gomp_barrier_wait_start (bar);
+}
+
+/* This is like gomp_barrier_wait_start, except it decrements
+   bar->awaited_final rather than bar->awaited and should be used
+   for the gomp_team_end barrier only.  */
+static inline gomp_barrier_state_t
+gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+{
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+  ret &= -BAR_INCR | BAR_CANCELLED;
+  /* See above gomp_barrier_wait_start comment.  */
+  if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_RELAXED) == 0)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline bool
+gomp_barrier_last_thread (gomp_barrier_state_t state)
+{
+  return state & BAR_WAS_LAST;
+}
+
+/* All the inlines below must be called with team->task_lock
+   held.  */
+
+static inline void
+gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
+{
+  bar->generation |= BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
+{
+  bar->generation &= ~BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
+{
+  bar->generation |= BAR_WAITING_FOR_TASK;
+}
+
+static inline bool
+gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
+{
+  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+}
+
+static inline void
+gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  bar->generation = (state & -BAR_INCR) + BAR_INCR;
+}
+
+#endif /* GOMP_BARRIER_H */
diff --git a/libgomp/config/gcn/doacross.h b/libgomp/config/gcn/doacross.h
new file mode 100644
index 0000000..2bff18a
--- /dev/null
+++ b/libgomp/config/gcn/doacross.h
@@ -0,0 +1,58 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is the AMD GCN implementation of doacross spinning.  */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+
+static inline int
+cpu_relax (void)
+{
+  /* This can be implemented as just a memory barrier, but a sleep seems
+     like it should allow the wavefront to yield (maybe?)
+     Use the shortest possible sleep time of 1*64 cycles.  */
+  asm volatile ("s_sleep\t1" ::: "memory");
+  return 0;
+}
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+				  unsigned long cur)
+{
+  /* Prevent compiler from optimizing based on bounds of containing object.  */
+  asm ("" : "+r" (addr));
+  do
+    {
+       /* An alternative implementation might use s_setprio to lower the
+	  priority temporarily, and then restore it after.  */
+      int i = cpu_relax ();
+      cur = addr[i];
+    }
+  while (cur <= expected);
+}
+
+#endif /* GOMP_DOACROSS_H */
diff --git a/libgomp/config/gcn/gomp_print.c b/libgomp/config/gcn/gomp_print.c
new file mode 100644
index 0000000..1c75573
--- /dev/null
+++ b/libgomp/config/gcn/gomp_print.c
@@ -0,0 +1,2 @@
+/* The GCN gomp_print routines live in libgcc where they are available
+   to stand-alone toolchains configured without libgomp.  */
diff --git a/libgomp/config/gcn/icv-device.c b/libgomp/config/gcn/icv-device.c
new file mode 100644
index 0000000..cbb9dfa
--- /dev/null
+++ b/libgomp/config/gcn/icv-device.c
@@ -0,0 +1,72 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file defines OpenMP API entry points that accelerator targets are
+   expected to replace.  */
+
+#include "libgomp.h"
+
+void
+omp_set_default_device (int device_num __attribute__((unused)))
+{
+}
+
+int
+omp_get_default_device (void)
+{
+  return 0;
+}
+
+int
+omp_get_num_devices (void)
+{
+  return 0;
+}
+
+int
+omp_get_num_teams (void)
+{
+  return gomp_num_teams_var + 1;
+}
+
+int __attribute__ ((__optimize__ ("O2")))
+omp_get_team_num (void)
+{
+  return __builtin_gcn_dim_pos (0);
+}
+
+int
+omp_is_initial_device (void)
+{
+  /* AMD GCN is an accelerator-only target.  */
+  return 0;
+}
+
+ialias (omp_set_default_device)
+ialias (omp_get_default_device)
+ialias (omp_get_num_devices)
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
+ialias (omp_is_initial_device)
diff --git a/libgomp/config/gcn/simple-bar.h b/libgomp/config/gcn/simple-bar.h
new file mode 100644
index 0000000..802e0f5
--- /dev/null
+++ b/libgomp/config/gcn/simple-bar.h
@@ -0,0 +1,61 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simplified barrier that is suitable for thread pool
+   synchronizaton.  Only a subset of full barrier API (bar.h) is exposed.
+   Here in the AMD GCN-specific implementation, we expect that thread pool
+   corresponds to the wavefronts within a work group.  */
+
+#ifndef GOMP_SIMPLE_BARRIER_H
+#define GOMP_SIMPLE_BARRIER_H 1
+
+/* AMD GCN has no use for this type.  */
+typedef int gomp_simple_barrier_t;
+
+/* GCN barriers block all wavefronts, so the count is not interesting.  */
+static inline void
+gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
+{
+}
+
+static inline void
+gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar)
+{
+}
+
+static inline void
+gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
+{
+  asm volatile ("s_barrier" ::: "memory");
+}
+
+static inline void
+gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
+{
+  /* GCN has no way to signal a barrier without waiting.  */
+  asm volatile ("s_barrier" ::: "memory");
+}
+
+#endif /* GOMP_SIMPLE_BARRIER_H */
diff --git a/libgomp/config/gcn/target.c b/libgomp/config/gcn/target.c
new file mode 100644
index 0000000..db00551
--- /dev/null
+++ b/libgomp/config/gcn/target.c
@@ -0,0 +1,67 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <limits.h>
+
+void
+GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
+{
+  if (thread_limit)
+    {
+      struct gomp_task_icv *icv = gomp_icv (true);
+      icv->thread_limit_var
+	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
+    }
+  unsigned int num_workgroups, workgroup_id;
+  num_workgroups = __builtin_gcn_dim_size (0);
+  workgroup_id = __builtin_gcn_dim_pos (0);
+  if (!num_teams || num_teams >= num_workgroups)
+    num_teams = num_workgroups;
+  else if (workgroup_id >= num_teams)
+    {
+      gomp_free_thread (gcn_thrs ());
+      exit (0);
+    }
+  gomp_num_teams_var = num_teams - 1;
+}
+
+int
+omp_pause_resource (omp_pause_resource_t kind, int device_num)
+{
+  (void) kind;
+  (void) device_num;
+  return -1;
+}
+
+int
+omp_pause_resource_all (omp_pause_resource_t kind)
+{
+  (void) kind;
+  return -1;
+}
+
+ialias (omp_pause_resource)
+ialias (omp_pause_resource_all)
diff --git a/libgomp/config/nvptx/proc.c b/libgomp/config/gcn/task.c
similarity index 72%
copy from libgomp/config/nvptx/proc.c
copy to libgomp/config/gcn/task.c
index 8ca0b0a..a135650 100644
--- a/libgomp/config/nvptx/proc.c
+++ b/libgomp/config/gcn/task.c
@@ -1,5 +1,5 @@
-/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
-   Contributed by Alexander Monakov <amonakov@ispras.ru>
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
 
    This file is part of the GNU Offloading and Multi Processing Library
    (libgomp).
@@ -23,19 +23,17 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This file contains system specific routines related to counting
-   online processors and dynamic load balancing.  */
+/* This file handles the maintainence of tasks in response to task
+   creation and termination.  */
 
 #include "libgomp.h"
 
-unsigned
-gomp_dynamic_max_threads (void)
+/* AMD GCN is an accelerator-only target, so this should never be called.  */
+
+bool
+gomp_target_task_fn (void *data)
 {
-  return gomp_icv (false)->nthreads_var;
+  __builtin_unreachable ();
 }
 
-int
-omp_get_num_procs (void)
-{
-  return gomp_icv (false)->nthreads_var;
-}
+#include "../../task.c"
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
new file mode 100644
index 0000000..534cf59
--- /dev/null
+++ b/libgomp/config/gcn/team.c
@@ -0,0 +1,203 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles maintainance of threads on AMD GCN.  */
+
+#include "libgomp.h"
+#include <stdlib.h>
+#include <string.h>
+
+static void gomp_thread_start (struct gomp_thread_pool *);
+void gomp_print_string (const char *msg, const char *val);
+
+/* This externally visible function handles target region entry.  It
+   sets up a per-team thread pool and transfers control by returning to
+   the kernel in the master thread or gomp_thread_start in other threads.
+
+   The name of this function is part of the interface with the compiler: for
+   each OpenMP kernel the compiler configures the stack, then calls here.
+
+   Likewise, gomp_gcn_exit_kernel is called during the kernel epilogue.  */
+
+void
+gomp_gcn_enter_kernel (void)
+{
+  int tid, ntids;
+  tid = __builtin_gcn_dim_pos (1);
+  ntids = __builtin_gcn_dim_size (1);
+  if (tid == 0)
+    {
+      gomp_global_icv.nthreads_var = ntids;
+      /* Starting additional threads is not supported.  */
+      gomp_global_icv.dyn_var = true;
+
+      set_gcn_thrs (calloc (ntids, sizeof (struct gomp_thread)));
+      if (gcn_thrs () == NULL)
+	goto oom;
+
+      struct gomp_thread_pool *pool = malloc (sizeof (*pool));
+      if (pool == NULL)
+	goto oom;
+
+      pool->threads = malloc (ntids * sizeof (*pool->threads));
+      if (pool->threads == NULL)
+	goto oom;
+
+      for (tid = 0; tid < ntids; tid++)
+	pool->threads[tid] = gcn_thrs () + tid;
+      pool->threads_size = ntids;
+      pool->threads_used = ntids;
+      pool->threads_busy = 1;
+      pool->last_team = NULL;
+      gomp_simple_barrier_init (&pool->threads_dock, ntids);
+
+      gcn_thrs ()[0].thread_pool = pool;
+      asm ("s_barrier" ::: "memory");
+      return;  /* Return to kernel.  */
+    }
+  else
+    {
+      asm ("s_barrier" ::: "memory");
+      gomp_thread_start (gcn_thrs ()[0].thread_pool);
+      /* gomp_thread_start does not return.  */
+    }
+
+oom:
+  gomp_print_string ("GCN heap exhausted; try setting GCN_HEAP_SIZE.", "");
+  abort();
+}
+
+void
+gomp_gcn_exit_kernel (void)
+{
+  gomp_free_thread (gcn_thrs ());
+  free (gcn_thrs ());
+}
+
+/* This function contains the idle loop in which a thread waits
+   to be called up to become part of a team.  */
+
+static void
+gomp_thread_start (struct gomp_thread_pool *pool)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  gomp_sem_init (&thr->release, 0);
+  thr->thread_pool = pool;
+
+  /* The loop exits only when "fn" is assigned "gomp_free_pool_helper",
+     which contains "s_endpgm", or an infinite no-op loop is
+     suspected (this happens when the thread master crashes).  */
+  int nul_limit = 99;
+  do
+    {
+      gomp_simple_barrier_wait (&pool->threads_dock);
+      if (!thr->fn)
+	{
+	  if (nul_limit-- > 0)
+	    continue;
+	  else
+	    {
+	      gomp_print_string ("team master not responding;",
+				 " slave thread aborting");
+	      abort();
+	    }
+	}
+      thr->fn (thr->data);
+      thr->fn = NULL;
+
+      struct gomp_task *task = thr->task;
+      gomp_team_barrier_wait_final (&thr->ts.team->barrier);
+      gomp_finish_task (task);
+    }
+  while (1);
+}
+
+/* Launch a team.  */
+
+void
+gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
+		 unsigned flags, struct gomp_team *team,
+		 struct gomp_taskgroup *taskgroup)
+{
+  struct gomp_thread *thr, *nthr;
+  struct gomp_task *task;
+  struct gomp_task_icv *icv;
+  struct gomp_thread_pool *pool;
+  unsigned long nthreads_var;
+
+  thr = gomp_thread ();
+  pool = thr->thread_pool;
+  task = thr->task;
+  icv = task ? &task->icv : &gomp_global_icv;
+
+  /* Always save the previous state, even if this isn't a nested team.
+     In particular, we should save any work share state from an outer
+     orphaned work share construct.  */
+  team->prev_ts = thr->ts;
+
+  thr->ts.team = team;
+  thr->ts.team_id = 0;
+  ++thr->ts.level;
+  if (nthreads > 1)
+    ++thr->ts.active_level;
+  thr->ts.work_share = &team->work_shares[0];
+  thr->ts.last_work_share = NULL;
+  thr->ts.single_count = 0;
+  thr->ts.static_trip = 0;
+  thr->task = &team->implicit_task[0];
+  nthreads_var = icv->nthreads_var;
+  gomp_init_task (thr->task, task, icv);
+  team->implicit_task[0].icv.nthreads_var = nthreads_var;
+  team->implicit_task[0].taskgroup = taskgroup;
+
+  if (nthreads == 1)
+    return;
+
+  /* Release existing idle threads.  */
+  for (unsigned i = 1; i < nthreads; ++i)
+    {
+      nthr = pool->threads[i];
+      nthr->ts.team = team;
+      nthr->ts.work_share = &team->work_shares[0];
+      nthr->ts.last_work_share = NULL;
+      nthr->ts.team_id = i;
+      nthr->ts.level = team->prev_ts.level + 1;
+      nthr->ts.active_level = thr->ts.active_level;
+      nthr->ts.single_count = 0;
+      nthr->ts.static_trip = 0;
+      nthr->task = &team->implicit_task[i];
+      gomp_init_task (nthr->task, task, icv);
+      team->implicit_task[i].icv.nthreads_var = nthreads_var;
+      team->implicit_task[i].taskgroup = taskgroup;
+      nthr->fn = fn;
+      nthr->data = data;
+      team->ordered_release[i] = &nthr->release;
+    }
+
+  gomp_simple_barrier_wait (&pool->threads_dock);
+}
+
+#include "../../team.c"
diff --git a/libgomp/config/nvptx/proc.c b/libgomp/config/gcn/time.c
similarity index 69%
copy from libgomp/config/nvptx/proc.c
copy to libgomp/config/gcn/time.c
index 8ca0b0a..f189e55 100644
--- a/libgomp/config/nvptx/proc.c
+++ b/libgomp/config/gcn/time.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 2015-2019 Free Software Foundation, Inc.
-   Contributed by Alexander Monakov <amonakov@ispras.ru>
+   Contributed by Mentor Embedded.
 
    This file is part of the GNU Offloading and Multi Processing Library
    (libgomp).
@@ -23,19 +23,30 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This file contains system specific routines related to counting
-   online processors and dynamic load balancing.  */
+/* This file implements timer routines for AMD GCN.  */
 
 #include "libgomp.h"
 
-unsigned
-gomp_dynamic_max_threads (void)
+/* According to AMD:
+    dGPU RTC is 27MHz
+    AGPU RTC is 100MHz
+   FIXME: DTRT on an APU.  */
+#define RTC_TICKS (1.0 / 27000000.0) /* 27MHz */
+
+double
+omp_get_wtime (void)
 {
-  return gomp_icv (false)->nthreads_var;
+  uint64_t clock;
+  asm ("s_memrealtime %0\n\t"
+       "s_waitcnt 0" : "=r" (clock));
+  return clock * RTC_TICKS;
 }
 
-int
-omp_get_num_procs (void)
+double
+omp_get_wtick (void)
 {
-  return gomp_icv (false)->nthreads_var;
+  return RTC_TICKS;
 }
+
+ialias (omp_get_wtime)
+ialias (omp_get_wtick)
diff --git a/libgomp/config/linux/gomp_print.c b/libgomp/config/linux/gomp_print.c
new file mode 100644
index 0000000..8b2e383
--- /dev/null
+++ b/libgomp/config/linux/gomp_print.c
@@ -0,0 +1,29 @@
+#include <stdio.h>
+#include <stdint.h>
+
+#include "config.h"  /* For HAVE_INTTYPES_H.  */
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRId64.  */
+#endif
+
+void
+gomp_print_string (const char *msg, const char *value)
+{
+  printf ("%s%s\n", msg, value);
+}
+
+void
+gomp_print_integer (const char *msg, int64_t value)
+{
+#ifdef HAVE_INTTYPES_H
+  printf ("%s%" PRId64 "\n", msg, value);
+#else
+  printf ("%s%ld\n", msg, (long) value);
+#endif
+}
+
+void
+gomp_print_double (const char *msg, double value)
+{
+  printf ("%s%f\n", msg, value);
+}
diff --git a/libgomp/config/nvptx/gomp_print.c b/libgomp/config/nvptx/gomp_print.c
new file mode 100644
index 0000000..811bdd6
--- /dev/null
+++ b/libgomp/config/nvptx/gomp_print.c
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <stdint.h>
+
+void
+gomp_print_string (const char *msg, const char *value)
+{
+  printf ("%s%s\n", msg, value);
+}
+
+void
+gomp_print_integer (const char *msg, int64_t value)
+{
+  printf ("%s%ld\n", msg, value);
+}
+
+void
+gomp_print_double (const char *msg, double value)
+{
+  printf ("%s%f\n", msg, value);
+}
diff --git a/libgomp/config/nvptx/libgomp-plugin.c b/libgomp/config/nvptx/oacc-profiling-acc_register_library.c
similarity index 100%
copy from libgomp/config/nvptx/libgomp-plugin.c
copy to libgomp/config/nvptx/oacc-profiling-acc_register_library.c
diff --git a/libgomp/config/nvptx/libgomp-plugin.c b/libgomp/config/nvptx/oacc-profiling.c
similarity index 100%
copy from libgomp/config/nvptx/libgomp-plugin.c
copy to libgomp/config/nvptx/oacc-profiling.c
diff --git a/libgomp/configure b/libgomp/configure
index b4bc4f4..85a29c5 100755
--- a/libgomp/configure
+++ b/libgomp/configure
@@ -661,6 +661,8 @@
 LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE
 OPT_LDFLAGS
 SECTION_LDFLAGS
+PLUGIN_GCN_FALSE
+PLUGIN_GCN_TRUE
 PLUGIN_HSA_FALSE
 PLUGIN_HSA_TRUE
 PLUGIN_NVPTX_FALSE
@@ -669,6 +671,10 @@
 offload_additional_options
 offload_targets
 offload_plugins
+PLUGIN_GCN_LIBS
+PLUGIN_GCN_LDFLAGS
+PLUGIN_GCN_CPPFLAGS
+PLUGIN_GCN
 PLUGIN_HSA_LIBS
 PLUGIN_HSA_LDFLAGS
 PLUGIN_HSA_CPPFLAGS
@@ -681,6 +687,10 @@
 PLUGIN_NVPTX
 CUDA_DRIVER_LIB
 CUDA_DRIVER_INCLUDE
+USE_LIBFFI_FALSE
+USE_LIBFFI_TRUE
+LIBFFIINCS
+LIBFFI
 libtool_VERSION
 ac_ct_FC
 FCFLAGS
@@ -2704,7 +2714,6 @@
 fi
 
 
-
 # -------
 # -------
 
@@ -11393,7 +11402,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11396 "configure"
+#line 11405 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -11499,7 +11508,7 @@
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11502 "configure"
+#line 11511 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -15312,7 +15321,7 @@
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
-  nvptx*-*-*)
+  nvptx*-*-* | amdgcn*-*-*)
     # NVPTX does not support Pthreads, has its own code replacement.
     libgomp_use_pthreads=no
     # NVPTX is an accelerator-only target
@@ -15374,9 +15383,31 @@
 
 fi
 
+# Prepare libffi when necessary.
+
+LIBFFI=
+LIBFFIINCS=
+if test -d ../libffi; then
+
+$as_echo "#define USE_LIBFFI 1" >>confdefs.h
+
+   LIBFFI=../libffi/libffi_convenience.la
+   LIBFFIINCS='-I$(top_srcdir)/../libffi/include -I../libffi/include'
+fi
+
+
+ if test -d ../libffi; then
+  USE_LIBFFI_TRUE=
+  USE_LIBFFI_FALSE='#'
+else
+  USE_LIBFFI_TRUE='#'
+  USE_LIBFFI_FALSE=
+fi
+
+
 # Plugins for offload execution, configure.ac fragment.  -*- mode: autoconf -*-
 #
-# Copyright (C) 2014-2018 Free Software Foundation, Inc.
+# Copyright (C) 2014-2019 Free Software Foundation, Inc.
 #
 # Contributed by Mentor Embedded.
 #
@@ -15620,6 +15651,15 @@
 
 
 
+PLUGIN_GCN=0
+PLUGIN_GCN_CPPFLAGS=
+PLUGIN_GCN_LDFLAGS=
+PLUGIN_GCN_LIBS=
+
+
+
+
+
 # Parse '--enable-offload-targets', figure out the corresponding libgomp
 # plugins, and configure to find the corresponding offload compilers.
 # 'offload_plugins' and 'offload_targets' will be populated in the same order.
@@ -15731,6 +15771,30 @@
             ;;
         esac
         ;;
+
+      amdgcn*)
+	case "${target}" in
+	  x86_64-*-*)
+	    case " ${CC} ${CFLAGS} " in
+	      *" -m32 "*)
+		PLUGIN_GCN=0
+		;;
+	      *)
+		tgt_name=gcn
+		tgt_plugin=gcn
+		PLUGIN_GCN=$tgt
+		PLUGIN_GCN_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
+		PLUGIN_GCN_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
+		PLUGIN_GCN_LIBS="-ldl"
+		PLUGIN_GCN=1
+		;;
+	      esac
+	    ;;
+	  *-*-*)
+	    PLUGIN_GCN=0
+	     ;;
+	esac
+	;;
       *)
 	as_fn_error $? "unknown offload target specified" "$LINENO" 5
 	;;
@@ -15743,8 +15807,8 @@
       offload_plugins=$tgt_plugin
       offload_targets=$tgt
     else
-      offload_plugins=$offload_plugins,$tgt_plugin
-      offload_targets=$offload_targets,$tgt
+      offload_plugins=$offload_plugins:$tgt_plugin
+      offload_targets=$offload_targets:$tgt
     fi
     # Configure additional search paths.
     if test "$tgt_plugin" = hsa; then
@@ -15761,7 +15825,7 @@
 fi
 
 cat >>confdefs.h <<_ACEOF
-#define OFFLOAD_PLUGINS "$offload_plugins"
+#define OFFLOAD_TARGETS "$offload_targets"
 _ACEOF
 
  if test $PLUGIN_NVPTX = 1; then
@@ -15795,6 +15859,19 @@
 #define PLUGIN_HSA $PLUGIN_HSA
 _ACEOF
 
+ if test $PLUGIN_GCN = 1; then
+  PLUGIN_GCN_TRUE=
+  PLUGIN_GCN_FALSE='#'
+else
+  PLUGIN_GCN_TRUE='#'
+  PLUGIN_GCN_FALSE=
+fi
+
+
+cat >>confdefs.h <<_ACEOF
+#define PLUGIN_GCN $PLUGIN_GCN
+_ACEOF
+
 
 if test "$HSA_RUNTIME_LIB" != ""; then
   HSA_RUNTIME_LIB="$HSA_RUNTIME_LIB/"
@@ -15824,8 +15901,7 @@
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
-eval as_val=\$$as_ac_var
-   if test "x$as_val" = x""yes; then :
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
   cat >>confdefs.h <<_ACEOF
 #define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
 _ACEOF
@@ -16587,7 +16663,7 @@
   fi
 
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
   libgomp_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -17409,6 +17485,10 @@
   as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${USE_LIBFFI_TRUE}" && test -z "${USE_LIBFFI_FALSE}"; then
+  as_fn_error $? "conditional \"USE_LIBFFI\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${PLUGIN_NVPTX_TRUE}" && test -z "${PLUGIN_NVPTX_FALSE}"; then
   as_fn_error $? "conditional \"PLUGIN_NVPTX\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -17417,6 +17497,10 @@
   as_fn_error $? "conditional \"PLUGIN_HSA\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${PLUGIN_GCN_TRUE}" && test -z "${PLUGIN_GCN_FALSE}"; then
+  as_fn_error $? "conditional \"PLUGIN_GCN\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE}" && test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_FALSE}"; then
   as_fn_error $? "conditional \"LIBGOMP_BUILD_VERSIONED_SHLIB\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
diff --git a/libgomp/configure.ac b/libgomp/configure.ac
index f75c622..33e1839 100644
--- a/libgomp/configure.ac
+++ b/libgomp/configure.ac
@@ -27,7 +27,6 @@
 AC_MSG_RESULT($enable_generated_files_in_srcdir)
 AM_CONDITIONAL(GENINSRC, test "$enable_generated_files_in_srcdir" = yes)
 
-
 # -------
 # -------
 
@@ -178,7 +177,7 @@
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
-  nvptx*-*-*)
+  nvptx*-*-* | amdgcn*-*-*)
     # NVPTX does not support Pthreads, has its own code replacement.
     libgomp_use_pthreads=no
     # NVPTX is an accelerator-only target
@@ -214,6 +213,19 @@
             [Define to 1 if building libgomp for an accelerator-only target.])
 fi
 
+# Prepare libffi when necessary.
+
+LIBFFI=
+LIBFFIINCS=
+if test -d ../libffi; then
+   AC_DEFINE(USE_LIBFFI, 1, [Define if we're to use libffi.])
+   LIBFFI=../libffi/libffi_convenience.la
+   LIBFFIINCS='-I$(top_srcdir)/../libffi/include -I../libffi/include'
+fi
+AC_SUBST(LIBFFI)
+AC_SUBST(LIBFFIINCS)
+AM_CONDITIONAL([USE_LIBFFI], [test -d ../libffi])
+
 m4_include([plugin/configfrag.ac])
 
 # Check for functions needed.
diff --git a/libgomp/configure.tgt b/libgomp/configure.tgt
index b88bf72..06ee115 100644
--- a/libgomp/configure.tgt
+++ b/libgomp/configure.tgt
@@ -154,7 +154,7 @@
 	;;
 
   nvptx*-*-*)
-	config_path="nvptx"
+	config_path="nvptx accel"
 	;;
 
   *-*-rtems*)
@@ -164,6 +164,10 @@
 	fi
 	;;
 
+  amdgcn*-*-*)
+	config_path="gcn accel"
+	;;
+
   *)
 	;;
 
diff --git a/libgomp/env.c b/libgomp/env.c
index 7937fbb..29d9f58 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -1425,5 +1425,7 @@
   parse_gomp_openacc_dim ();
 
   goacc_runtime_initialize ();
+
+  goacc_profiling_initialize ();
 }
 #endif /* LIBGOMP_OFFLOADED_ONLY */
diff --git a/libgomp/fortran.c b/libgomp/fortran.c
index 4d544be..fdc324d 100644
--- a/libgomp/fortran.c
+++ b/libgomp/fortran.c
@@ -626,7 +626,7 @@
   if (ret < sizeof buf)
     {
       buf[ret] = '\n';
-      gomp_print_string (buf, ret + 1);
+      gomp_write_string (buf, ret + 1);
     }
   else
     {
@@ -635,7 +635,7 @@
 			     format_len ? fmt : gomp_affinity_format_var,
 			     gomp_thread_self (), &thr->ts, thr->place);
       b[ret] = '\n';
-      gomp_print_string (b, ret + 1);
+      gomp_write_string (b, ret + 1);
       free (b);
     }
   if (fmt && fmt != fmt_buf)
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 8960d80..fcd4727 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -50,9 +50,31 @@
   /* OFFLOAD_TARGET_TYPE_HOST_NONSHM = 3 removed.  */
   OFFLOAD_TARGET_TYPE_NVIDIA_PTX = 5,
   OFFLOAD_TARGET_TYPE_INTEL_MIC = 6,
-  OFFLOAD_TARGET_TYPE_HSA = 7
+  OFFLOAD_TARGET_TYPE_HSA = 7,
+  OFFLOAD_TARGET_TYPE_GCN = 8
 };
 
+/* Container type for passing device properties.  */
+union gomp_device_property_value
+{
+  void *ptr;
+  uintmax_t val;
+};
+
+/* Opaque type to represent plugin-dependent implementation of an
+   OpenACC asynchronous queue.  */
+struct goacc_asyncqueue;
+
+/* Used to keep a list of active asynchronous queues.  */
+struct goacc_asyncqueue_list
+{
+  struct goacc_asyncqueue *aq;
+  struct goacc_asyncqueue_list *next;
+};
+
+typedef struct goacc_asyncqueue *goacc_aq;
+typedef struct goacc_asyncqueue_list *goacc_aq_list;
+
 /* Auxiliary struct, used for transferring pairs of addresses from plugin
    to libgomp.  */
 struct addr_pair
@@ -79,6 +101,7 @@
 extern unsigned int GOMP_OFFLOAD_get_caps (void);
 extern int GOMP_OFFLOAD_get_type (void);
 extern int GOMP_OFFLOAD_get_num_devices (void);
+extern union gomp_device_property_value GOMP_OFFLOAD_get_property (int, int);
 extern bool GOMP_OFFLOAD_init_device (int);
 extern bool GOMP_OFFLOAD_fini_device (int);
 extern unsigned GOMP_OFFLOAD_version (void);
@@ -93,22 +116,39 @@
 extern bool GOMP_OFFLOAD_can_run (void *);
 extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
 extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
+
 extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
-				       void **, int, unsigned *, void *);
-extern void GOMP_OFFLOAD_openacc_register_async_cleanup (void *, int);
-extern int GOMP_OFFLOAD_openacc_async_test (int);
-extern int GOMP_OFFLOAD_openacc_async_test_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait (int);
-extern void GOMP_OFFLOAD_openacc_async_wait_async (int, int);
-extern void GOMP_OFFLOAD_openacc_async_wait_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait_all_async (int);
-extern void GOMP_OFFLOAD_openacc_async_set_async (int);
+				       void **, unsigned *, void *);
 extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
 extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
+extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (int);
+extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
+						  struct goacc_asyncqueue *);
+extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *,
+						       void (*)(void *), void *);
+extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **,
+					     void **, unsigned *, void *,
+					     struct goacc_asyncqueue *);
+extern void GOMP_OFFLOAD_openacc_exec_params (void (*) (void *), size_t,
+					      void **, void **, unsigned *,
+					      void *);
+extern void GOMP_OFFLOAD_openacc_async_exec_params (void (*) (void *), size_t,
+						    void **, void **,
+						    unsigned *, void *,
+						    struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
+						 struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *,
+						 size_t, bool,
+						 struct goacc_asyncqueue *);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
-extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (int);
-extern int GOMP_OFFLOAD_openacc_cuda_set_stream (int, void *);
+extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *,
+						 void *);
 
 #ifdef __cplusplus
 }
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index c98c145..a59bc33 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -692,6 +692,24 @@
   asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
   return nvptx_thrs + tid;
 }
+#elif defined __AMDGCN__
+static inline struct gomp_thread *gcn_thrs (void)
+{
+  /* The value is at the bottom of LDS.  */
+  struct gomp_thread * __lds *thrs = (struct gomp_thread * __lds *)4;
+  return *thrs;
+}
+static inline void set_gcn_thrs (struct gomp_thread *val)
+{
+  /* The value is at the bottom of LDS.  */
+  struct gomp_thread * __lds *thrs = (struct gomp_thread * __lds *)4;
+  *thrs = val;
+}
+static inline struct gomp_thread *gomp_thread (void)
+{
+  int tid = __builtin_gcn_dim_pos(1);
+  return gcn_thrs () + tid;
+}
 #elif defined HAVE_TLS || defined USE_EMUTLS
 extern __thread struct gomp_thread gomp_tls_data;
 static inline struct gomp_thread *gomp_thread (void)
@@ -751,7 +769,7 @@
 
 /* affinity-fmt.c */
 
-extern bool gomp_print_string (const char *str, size_t len);
+extern bool gomp_write_string (const char *str, size_t len);
 extern void gomp_set_affinity_format (const char *, size_t);
 extern void gomp_display_string (char *, size_t, size_t *, const char *,
 				 size_t);
@@ -867,15 +885,25 @@
   bool copy_from;
   /* True if data always should be copied from device to host at the end.  */
   bool always_copy_from;
+  /* True if variable should be detached at end of region.  */
+  bool do_detach;
   /* Relative offset against key host_start.  */
   uintptr_t offset;
   /* Actual length.  */
   uintptr_t length;
 };
 
+/* Uncomment to enable reference-count consistency checking (for development
+   use only).  */
+//#define RC_CHECKING 1
+
 struct target_mem_desc {
   /* Reference count.  */
   uintptr_t refcount;
+#ifdef RC_CHECKING
+  uintptr_t refcount_chk;
+  bool mark;
+#endif
   /* All the splay nodes allocated together.  */
   splay_tree_node array;
   /* Start of the target region.  */
@@ -903,6 +931,15 @@
    artificial pointer to "omp declare target link" object.  */
 #define REFCOUNT_LINK (~(uintptr_t) 1)
 
+/* Special offset values.  */
+#define OFFSET_INLINED (~(uintptr_t) 0)
+#define OFFSET_POINTER (~(uintptr_t) 1)
+#define OFFSET_STRUCT (~(uintptr_t) 2)
+
+/* A special tag value for "virtual_refcount" in the splay_tree_key_s structure
+   below.  */
+#define VREFCOUNT_LINK_KEY (~(uintptr_t) 0)
+
 struct splay_tree_key_s {
   /* Address of the host object.  */
   uintptr_t host_start;
@@ -914,10 +951,25 @@
   uintptr_t tgt_offset;
   /* Reference count.  */
   uintptr_t refcount;
-  /* Dynamic reference count.  */
-  uintptr_t dynamic_refcount;
-  /* Pointer to the original mapping of "omp declare target link" object.  */
-  splay_tree_key link_key;
+  /* Reference counts beyond those that represent genuine references in the
+     linked splay tree key/target memory structures, e.g. for multiple OpenACC
+     "present increment" operations (via "acc enter data") referring to the same
+     host-memory block.
+     If set to VREFCOUNT_LINK_KEY (for OpenMP, where this field is not otherwise
+     needed), the union below represents a link key.  */
+  uintptr_t virtual_refcount;
+#ifdef RC_CHECKING
+  /* The recalculated reference count, for verification.  */
+  uintptr_t refcount_chk;
+#endif
+  union {
+    /* For a block with attached pointers, the attachment counters for each.
+       Only used for OpenACC.  */
+    uintptr_t *attach_count;
+    /* Pointer to the original mapping of "omp declare target link" object.
+       Only used for OpenMP.  */
+    splay_tree_key link_key;
+  } u;
 };
 
 /* The comparison function.  */
@@ -939,34 +991,36 @@
 
 typedef struct acc_dispatch_t
 {
-  /* This is a linked list of data mapped using the
-     acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas.
-     Unlike mapped_data in the goacc_thread struct, unmapping can
-     happen out-of-order with respect to mapping.  */
-  /* This is guarded by the lock in the "outer" struct gomp_device_descr.  */
-  struct target_mem_desc *data_environ;
-
   /* Execute.  */
   __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
-
-  /* Async cleanup callback registration.  */
-  __typeof (GOMP_OFFLOAD_openacc_register_async_cleanup)
-    *register_async_cleanup_func;
-
-  /* Asynchronous routines.  */
-  __typeof (GOMP_OFFLOAD_openacc_async_test) *async_test_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_test_all) *async_test_all_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait) *async_wait_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait_async) *async_wait_async_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait_all) *async_wait_all_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_wait_all_async)
-    *async_wait_all_async_func;
-  __typeof (GOMP_OFFLOAD_openacc_async_set_async) *async_set_async_func;
+  __typeof (GOMP_OFFLOAD_openacc_exec_params) *exec_params_func;
 
   /* Create/destroy TLS data.  */
   __typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func;
   __typeof (GOMP_OFFLOAD_openacc_destroy_thread_data)
     *destroy_thread_data_func;
+  
+  struct {
+    /* Once created and put into the "active" list, asyncqueues are then never
+       destructed and removed from the "active" list, other than if the TODO
+       device is shut down.  */
+    gomp_mutex_t lock;
+    int nasyncqueue;
+    struct goacc_asyncqueue **asyncqueue;
+    struct goacc_asyncqueue_list *active;
+
+    __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_test) *test_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_synchronize) *synchronize_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_serialize) *serialize_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_queue_callback) *queue_callback_func;
+
+    __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_exec_params) *exec_params_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
+  } async;
 
   /* NVIDIA target specific routines.  */
   struct {
@@ -1013,6 +1067,7 @@
   __typeof (GOMP_OFFLOAD_get_caps) *get_caps_func;
   __typeof (GOMP_OFFLOAD_get_type) *get_type_func;
   __typeof (GOMP_OFFLOAD_get_num_devices) *get_num_devices_func;
+  __typeof (GOMP_OFFLOAD_get_property) *get_property_func;
   __typeof (GOMP_OFFLOAD_init_device) *init_device_func;
   __typeof (GOMP_OFFLOAD_fini_device) *fini_device_func;
   __typeof (GOMP_OFFLOAD_version) *version_func;
@@ -1048,25 +1103,62 @@
 enum gomp_map_vars_kind
 {
   GOMP_MAP_VARS_OPENACC,
+  /* Like "GOMP_MAP_VARS_OPENACC", but with "GOACC_FLAG_HOST_DATA_IF_PRESENT"
+     semantics.  */
+  GOMP_MAP_VARS_OPENACC_IF_PRESENT,
+  GOMP_MAP_VARS_OPENACC_ENTER_DATA,
   GOMP_MAP_VARS_TARGET,
   GOMP_MAP_VARS_DATA,
   GOMP_MAP_VARS_ENTER_DATA
 };
 
-extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
-extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
+extern void gomp_acc_remove_pointer (struct gomp_device_descr *, void **,
+				     size_t *, unsigned short *, int, bool,
+				     int);
 extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
 				       unsigned short *);
+struct gomp_coalesce_buf;
+extern void gomp_copy_host2dev (struct gomp_device_descr *,
+				struct goacc_asyncqueue *, void *, const void *,
+				size_t, bool, struct gomp_coalesce_buf *);
+extern void gomp_copy_dev2host (struct gomp_device_descr *,
+				struct goacc_asyncqueue *, void *, const void *,
+				size_t);
+extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t);
+extern void gomp_attach_pointer (struct gomp_device_descr *,
+				 struct goacc_asyncqueue *, splay_tree,
+				 splay_tree_key, uintptr_t, size_t,
+				 struct gomp_coalesce_buf *);
+extern void gomp_detach_pointer (struct gomp_device_descr *,
+				 struct goacc_asyncqueue *, splay_tree_key,
+				 uintptr_t, bool, struct gomp_coalesce_buf *);
+
+#ifdef RC_CHECKING
+extern void dump_tgt (const char *, struct target_mem_desc *);
+extern void gomp_rc_check (struct gomp_device_descr *,
+			   struct target_mem_desc *);
+#endif
 
 extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
 					      size_t, void **, void **,
 					      size_t *, void *, bool,
 					      enum gomp_map_vars_kind);
+extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *,
+						    struct goacc_asyncqueue *,
+						    size_t, void **, void **,
+						    size_t *, void *, bool,
+						    enum gomp_map_vars_kind);
+extern void gomp_unmap_tgt (struct target_mem_desc *);
 extern void gomp_unmap_vars (struct target_mem_desc *, bool);
+extern void gomp_unmap_vars_async (struct target_mem_desc *, bool,
+				   struct goacc_asyncqueue *);
 extern void gomp_init_device (struct gomp_device_descr *);
-extern void gomp_free_memmap (struct splay_tree_s *);
+extern bool gomp_fini_device (struct gomp_device_descr *);
 extern void gomp_unload_device (struct gomp_device_descr *);
 extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
+extern bool gomp_offload_target_available_p (int);
+extern void gomp_remove_var_async (struct gomp_device_descr *, splay_tree_key,
+				   struct goacc_asyncqueue *);
 
 /* work.c */
 
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index d8e2fd1..0118761 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -312,6 +312,9 @@
 	GOMP_loop_ull_nonmonotonic_guided_start;
 	GOMP_parallel_loop_nonmonotonic_dynamic;
 	GOMP_parallel_loop_nonmonotonic_guided;
+	gomp_print_string;
+	gomp_print_integer;
+	gomp_print_double;
 } GOMP_4.0.1;
 
 GOMP_5.0 {
@@ -476,6 +479,28 @@
 	acc_update_self_async_array_h_;
 } OACC_2.0.1;
 
+OACC_2.5.1 {
+  global:
+	acc_prof_lookup;
+	acc_prof_register;
+	acc_prof_unregister;
+	acc_register_library;
+} OACC_2.5;
+
+OACC_2.6 {
+  global:
+	acc_get_property;
+	acc_get_property_h_;
+	acc_get_property_string;
+	acc_get_property_string_h_;
+	acc_attach;
+	acc_attach_async;
+	acc_detach;
+	acc_detach_async;
+	acc_detach_finalize;
+	acc_detach_finalize_async;
+} OACC_2.5.1;
+
 GOACC_2.0 {
   global:
 	GOACC_data_end;
@@ -494,6 +519,12 @@
 	GOACC_parallel_keyed;
 } GOACC_2.0;
 
+GOACC_2.0.GOMP_4_BRANCH {
+  global:
+	GOMP_set_offload_targets;
+} GOACC_2.0.1;
+
+
 GOMP_PLUGIN_1.0 {
   global:
 	GOMP_PLUGIN_malloc;
@@ -515,3 +546,9 @@
   global:
 	GOMP_PLUGIN_acc_default_dim;
 } GOMP_PLUGIN_1.1;
+
+GOMP_PLUGIN_1.3 {
+  global:
+	GOMP_PLUGIN_goacc_profiling_dispatch;
+	GOMP_PLUGIN_goacc_thread;
+} GOMP_PLUGIN_1.2;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index e2e384a..5c9cf9e 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -111,6 +111,7 @@
                                asynchronous operations.
 * OpenACC Library Interoperability:: OpenACC library interoperability with the
                                NVIDIA CUBLAS library.
+* OpenACC Profiling Interface::
 * The libgomp ABI::            Notes on the external ABI presented by libgomp.
 * Reporting Bugs::             How to report bugs in the GNU Offloading and
                                Multi Processing Runtime Library.
@@ -1819,7 +1820,7 @@
 
 A complete description of all OpenACC directives accepted may be found in 
 the @uref{https://www.openacc.org, OpenACC} Application Programming
-Interface manual, version 2.0.
+Interface manual, version 2.6.
 
 Note that this is an experimental feature and subject to
 change in future versions of GCC.  See
@@ -1835,12 +1836,15 @@
 @chapter OpenACC Runtime Library Routines
 
 The runtime routines described here are defined by section 3 of the OpenACC
-specifications in version 2.0.
+specifications in version 2.6.
 They have C linkage, and do not throw exceptions.
 Generally, they are available only for the host, with the exception of
 @code{acc_on_device}, which is available for both the host and the
 acceleration device.
 
+This list has not yet been updated for the OpenACC specification in
+version 2.6.
+
 @menu
 * acc_get_num_devices::         Get number of devices for the given device
                                 type.
@@ -1848,6 +1852,7 @@
 * acc_get_device_type::         Get type of device accelerator to be used.
 * acc_set_device_num::          Set device number to use.
 * acc_get_device_num::          Get device number to be used.
+* acc_get_property::            Get device property.
 * acc_async_test::              Tests for completion of a specific asynchronous
                                 operation.
 * acc_async_test_all::          Tests for completion of all asychronous
@@ -1897,6 +1902,13 @@
 * acc_get_current_cuda_context::Get CUDA context handle.
 * acc_get_cuda_stream::         Get CUDA stream handle.
 * acc_set_cuda_stream::         Set CUDA stream handle.
+
+API routines for the OpenACC Profiling Interface.
+
+* acc_prof_register::           Register callbacks.
+* acc_prof_unregister::         Unregister callbacks.
+* acc_prof_lookup::             Obtain inquiry functions.
+* acc_register_library::        Library registration.
 @end menu
 
 
@@ -1920,7 +1932,7 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 3.2.1.
 @end table
 
@@ -1945,7 +1957,7 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 3.2.2.
 @end table
 
@@ -1970,7 +1982,7 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 3.2.3.
 @end table
 
@@ -1997,7 +2009,7 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 3.2.4.
 @end table
 
@@ -2024,12 +2036,50 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 3.2.5.
 @end table
 
 
 
+@node acc_get_property
+@section @code{acc_get_property} -- Get device property.
+@cindex acc_get_property
+@cindex acc_get_property_string
+@table @asis
+@item @emph{Description}
+These routines return the value of the specified @var{property} for the
+device being queried according to @var{devicenum} and @var{devicetype}.
+Integer-valued and string-valued properties are returned by
+@code{acc_get_property} and @code{acc_get_property_string} respectively.
+The Fortran @code{acc_get_property_string} subroutine returns the string
+retrieved in its fourth argument while the remaining entry points are
+functions, which pass the return value as their result.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{size_t acc_get_property(int devicenum, acc_device_t devicetype, acc_device_property_t property);}
+@item @emph{Prototype}: @tab @code{const char *acc_get_property_string(int devicenum, acc_device_t devicetype, acc_device_property_t property);}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{function acc_get_property(devicenum, devicetype, property)}
+@item @emph{Interface}: @tab @code{subroutine acc_get_property_string(devicenum, devicetype, property, string)}
+@item                   @tab @code{integer devicenum}
+@item                   @tab @code{integer(kind=acc_device_kind) devicetype}
+@item                   @tab @code{integer(kind=acc_device_property) property}
+@item                   @tab @code{integer(kind=acc_device_property) acc_get_property}
+@item                   @tab @code{character(*) string}
+@end multitable
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.6.
+@end table
+
+
+
 @node acc_async_test
 @section @code{acc_async_test} -- Test for completion of a specific asynchronous operation.
 @table @asis
@@ -2053,8 +2103,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.6.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.9.
 @end table
 
 
@@ -2081,8 +2131,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.7.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.10.
 @end table
 
 
@@ -2109,8 +2159,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.8.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.11.
 @end table
 
 
@@ -2134,8 +2184,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.10.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.13.
 @end table
 
 
@@ -2160,8 +2210,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.11.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.14.
 @end table
 
 
@@ -2185,8 +2235,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.9.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.12.
 @end table
 
 
@@ -2210,8 +2260,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.12.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.7.
 @end table
 
 
@@ -2235,8 +2285,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.13.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.8.
 @end table
 
 
@@ -2266,8 +2316,8 @@
 
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.14.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.17.
 @end table
 
 
@@ -2285,8 +2335,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.15.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.18.
 @end table
 
 
@@ -2303,8 +2353,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.16.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.19.
 @end table
 
 
@@ -2336,8 +2386,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.17.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.20.
 @end table
 
 
@@ -2376,8 +2426,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.18.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.20.
 @end table
 
 
@@ -2409,8 +2459,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.19.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.21.
 @end table
 
 
@@ -2450,8 +2500,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.20.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.21.
 @end table
 
 
@@ -2482,8 +2532,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.21.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.22.
 @end table
 
 
@@ -2514,8 +2564,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.22.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.23.
 @end table
 
 
@@ -2547,8 +2597,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.23.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.24.
 @end table
 
 
@@ -2580,8 +2630,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.24.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.25.
 @end table
 
 
@@ -2600,8 +2650,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.25.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.26.
 @end table
 
 
@@ -2619,8 +2669,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.26.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.27.
 @end table
 
 
@@ -2638,8 +2688,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.27.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.28.
 @end table
 
 
@@ -2657,8 +2707,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.28.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.29.
 @end table
 
 
@@ -2696,8 +2746,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.29.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.30.
 @end table
 
 
@@ -2716,8 +2766,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.30.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.31.
 @end table
 
 
@@ -2736,8 +2786,8 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
-3.2.31.
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.32.
 @end table
 
 
@@ -2755,7 +2805,7 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 A.2.1.1.
 @end table
 
@@ -2774,7 +2824,7 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 A.2.1.2.
 @end table
 
@@ -2793,7 +2843,7 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 A.2.1.3.
 @end table
 
@@ -2817,12 +2867,96 @@
 @end multitable
 
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 A.2.1.4.
 @end table
 
 
 
+@node acc_prof_register
+@section @code{acc_prof_register} -- Register callbacks.
+@table @asis
+@item @emph{Description}:
+This function registers callbacks.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
+@node acc_prof_unregister
+@section @code{acc_prof_unregister} -- Unregister callbacks.
+@table @asis
+@item @emph{Description}:
+This function unregisters callbacks.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_prof_unregister (acc_event_t, acc_prof_callback, acc_register_t);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
+@node acc_prof_lookup
+@section @code{acc_prof_lookup} -- Obtain inquiry functions.
+@table @asis
+@item @emph{Description}:
+Function to obtain inquiry functions.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{acc_query_fn acc_prof_lookup (const char *);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
+@node acc_register_library
+@section @code{acc_register_library} -- Library registration.
+@table @asis
+@item @emph{Description}:
+Function for library registration.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_register_library (acc_prof_reg, acc_prof_reg, acc_prof_lookup_func);}
+@end multitable
+
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}, @ref{ACC_PROFLIB}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+5.3.
+@end table
+
+
+
 @c ---------------------------------------------------------------------
 @c OpenACC Environment Variables
 @c ---------------------------------------------------------------------
@@ -2831,12 +2965,15 @@
 @chapter OpenACC Environment Variables
 
 The variables @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM}
-are defined by section 4 of the OpenACC specification in version 2.0.
+are defined by section 4 of the OpenACC specification in version 2.6.
+The variable @env{ACC_PROFLIB}
+is defined by section 4 of the OpenACC specification in version 2.6.
 The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
 
 @menu
 * ACC_DEVICE_TYPE::
 * ACC_DEVICE_NUM::
+* ACC_PROFLIB::
 * GCC_ACC_NOTIFY::
 @end menu
 
@@ -2846,7 +2983,7 @@
 @section @code{ACC_DEVICE_TYPE}
 @table @asis
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 4.1.
 @end table
 
@@ -2856,12 +2993,25 @@
 @section @code{ACC_DEVICE_NUM}
 @table @asis
 @item @emph{Reference}:
-@uref{https://www.openacc.org, OpenACC specification v2.0}, section
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
 4.2.
 @end table
 
 
 
+@node ACC_PROFLIB
+@section @code{ACC_PROFLIB}
+@table @asis
+@item @emph{See also}:
+@ref{acc_register_library}, @ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+4.3.
+@end table
+
+
+
 @node GCC_ACC_NOTIFY
 @section @code{GCC_ACC_NOTIFY}
 @table @asis
@@ -3073,7 +3223,303 @@
 @code{acc_set_device_num()}@footnote{More complete information
 about @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM} can be found in
 sections 4.1 and 4.2 of the @uref{https://www.openacc.org, OpenACC}
-Application Programming Interface”, Version 2.0.}
+Application Programming Interface”, Version 2.6.}
+
+
+
+@c ---------------------------------------------------------------------
+@c OpenACC Profiling Interface
+@c ---------------------------------------------------------------------
+
+@node OpenACC Profiling Interface
+@chapter OpenACC Profiling Interface
+
+@section Implementation Status and Implementation-Defined Behavior
+
+We're implementing the OpenACC Profiling Interface as defined by the
+OpenACC 2.6 specification.  We're clarifying some aspects here as
+@emph{implementation-defined behavior}, while they're still under
+discussion within the OpenACC Technical Committee.
+
+This implementation is tuned to keep the performance impact as low as
+possible for the (very common) case that the Profiling Interface is
+not enabled.  This is relevant, as the Profiling Interface affects all
+the @emph{hot} code paths (in the target code, not in the offloaded
+code).  Users of the OpenACC Profiling Interface can be expected to
+understand that performance will be impacted to some degree once the
+Profiling Interface has gotten enabled: for example, because of the
+@emph{runtime} (libgomp) calling into a third-party @emph{library} for
+every event that has been registered.
+
+We're not yet accounting for the fact that @cite{OpenACC events may
+occur during event processing}.
+
+As currently there are no inquiry functions defined, calls to
+@code{acc_prof_lookup} will always return @code{NULL}.
+
+There aren't separate @emph{start}, @emph{stop} events defined for the
+event types @code{acc_ev_create}, @code{acc_ev_delete},
+@code{acc_ev_alloc}, @code{acc_ev_free}.  It's not clear if these
+should be triggered before or after the actual device-specific call is
+made.  We trigger them after.
+
+Remarks about data provided to callbacks:
+
+@table @asis
+
+@item @code{acc_prof_info.event_type}
+It's not clear if for @emph{nested} event callbacks (for example,
+@code{acc_ev_enqueue_launch_start} as part of a parent compute
+construct), this should be set for the nested event
+(@code{acc_ev_enqueue_launch_start}), or if the value of the parent
+construct should remain (@code{acc_ev_compute_construct_start}).  In
+this implementation, the value will generally correspond to the
+innermost nested event type.
+
+@item @code{acc_prof_info.device_type}
+@itemize
+
+@item
+For @code{acc_ev_compute_construct_start}, and in presence of an
+@code{if} clause with @emph{false} argument, this will still refer to
+the offloading device type.
+It's not clear if that's the expected behavior.
+
+@item
+Complementary to the item before, for
+@code{acc_ev_compute_construct_end}, this is set to
+@code{acc_device_host} in presence of an @code{if} clause with
+@emph{false} argument.
+It's not clear if that's the expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.thread_id}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.async}
+@itemize
+
+@item
+Not yet implemented correctly for
+@code{acc_ev_compute_construct_start}.
+
+@item
+In a compute construct, for host-fallback
+execution/@code{acc_device_host} it will always be
+@code{acc_async_sync}.
+It's not clear if that's the expected behavior.
+
+@item
+For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end},
+it will always be @code{acc_async_sync}.
+It's not clear if that's the expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.async_queue}
+There is no @cite{limited number of asynchronous queues} in libgomp.
+This will always have the same value as @code{acc_prof_info.async}.
+
+@item @code{acc_prof_info.src_file}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_prof_info.func_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_prof_info.line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type}
+Relating to @code{acc_prof_info.event_type} discussed above, in this
+implementation, this will always be the same value as
+@code{acc_prof_info.event_type}.
+
+@item @code{acc_event_info.*.parent_construct}
+@itemize
+
+@item
+Will be @code{acc_construct_parallel} for all OpenACC compute
+constructs as well as many OpenACC Runtime API calls; should be the
+one matching the actual construct, or
+@code{acc_construct_runtime_api}, respectively.
+
+@item
+Will be @code{acc_construct_enter_data} or
+@code{acc_construct_exit_data} when processing variable mappings
+specified in OpenACC @emph{declare} directives; should be
+@code{acc_construct_declare}.
+
+@item
+For implicit @code{acc_ev_device_init_start},
+@code{acc_ev_device_init_end}, and explicit as well as implicit
+@code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start}, and
+@code{acc_ev_enqueue_download_end}, will be
+@code{acc_construct_parallel}; should reflect the real parent
+construct.
+
+@end itemize
+
+@item @code{acc_event_info.*.implicit}
+For @code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start}, and
+@code{acc_ev_enqueue_download_end}, this currently will be @code{1}
+also for explicit usage.
+
+@item @code{acc_event_info.data_event.var_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_event_info.data_event.host_ptr}
+For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always
+@code{NULL}.
+
+@item @code{typedef union acc_api_info}
+@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific
+Information}.  This should obviously be @code{typedef @emph{struct}
+acc_api_info}.
+
+@item @code{acc_api_info.device_api}
+Possibly not yet implemented correctly for
+@code{acc_ev_compute_construct_start},
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}:
+will always be @code{acc_device_api_none} for these event types.
+For @code{acc_ev_enter_data_start}, it will be
+@code{acc_device_api_none} in some cases.
+
+@item @code{acc_api_info.device_type}
+Always the same as @code{acc_prof_info.device_type}.
+
+@item @code{acc_api_info.vendor}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_api_info.device_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.context_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.async_handle}
+Always @code{NULL}; not yet implemented.
+
+@end table
+
+Remarks about certain event types:
+
+@table @asis
+
+@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+@itemize
+
+@item
+@c See 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' in
+@c 'libgomp.oacc-c-c++-common/acc_prof-kernels-1.c',
+@c 'libgomp.oacc-c-c++-common/acc_prof-parallel-1.c'.
+Whan a compute construct triggers implicit
+@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}
+events, they currently aren't @emph{nested within} the corresponding
+@code{acc_ev_compute_construct_start} and
+@code{acc_ev_compute_construct_end}, but they're currently observed
+@emph{before} @code{acc_ev_compute_construct_start}.
+It's not clear what to do: the standard asks us provide a lot of
+details to the @code{acc_ev_compute_construct_start} callback, without
+(implicitly) initializing a device before?
+
+@item
+Callbacks for these event types will not be invoked for calls to the
+@code{acc_set_device_type} and @code{acc_set_device_num} functions.
+It's not clear if they should be.
+
+@end itemize
+
+@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}, @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
+@itemize
+
+@item
+Callbacks for these event types will also be invoked for OpenACC
+@emph{host_data} constructs.
+It's not clear if they should be.
+
+@item
+Callbacks for these event types will also be invoked when processing
+variable mappings specified in OpenACC @emph{declare} directives.
+It's not clear if they should be.
+
+@end itemize
+
+@end table
+
+Callbacks for the following event types will be invoked, but dispatch
+and information provided therein has not yet been thoroughly reviewed:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
+@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
+@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
+@end itemize
+
+During device initialization, and finalization, respectively,
+callbacks for the following event types will not yet be invoked:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@end itemize
+
+Callbacks for the following event types have not yet been implemented,
+so currently won't be invoked:
+
+@itemize
+@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
+@item @code{acc_ev_runtime_shutdown}
+@item @code{acc_ev_create}, @code{acc_ev_delete}
+@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
+@end itemize
+
+For the following runtime library functions, not all expected
+callbacks will be invoked (mostly concerning implicit device
+initialization):
+
+@itemize
+@item @code{acc_get_num_devices}
+@item @code{acc_set_device_type}
+@item @code{acc_get_device_type}
+@item @code{acc_set_device_num}
+@item @code{acc_get_device_num}
+@item @code{acc_init}
+@item @code{acc_shutdown}
+@end itemize
+
+Aside from implicit device initialization, for the following runtime
+library functions, no callbacks will be invoked for shared-memory
+offloading devices (it's not clear if they should be):
+
+@itemize
+@item @code{acc_malloc}
+@item @code{acc_free}
+@item @code{acc_copyin}, @code{acc_present_or_copyin}, @code{acc_copyin_async}
+@item @code{acc_create}, @code{acc_present_or_create}, @code{acc_create_async}
+@item @code{acc_copyout}, @code{acc_copyout_async}, @code{acc_copyout_finalize}, @code{acc_copyout_finalize_async}
+@item @code{acc_delete}, @code{acc_delete_async}, @code{acc_delete_finalize}, @code{acc_delete_finalize_async}
+@item @code{acc_update_device}, @code{acc_update_device_async}
+@item @code{acc_update_self}, @code{acc_update_self_async}
+@item @code{acc_map_data}, @code{acc_unmap_data}
+@item @code{acc_memcpy_to_device}, @code{acc_memcpy_to_device_async}
+@item @code{acc_memcpy_from_device}, @code{acc_memcpy_from_device_async}
+@end itemize
 
 
 
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index 32a9d8aa..4bf61d5 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -31,7 +31,7 @@
 
 #include <stdbool.h>
 #include <stddef.h>
-#include "gstdint.h"
+#include <stdint.h>
 
 /* barrier.c */
 
@@ -334,6 +334,7 @@
 
 /* target.c */
 
+extern void GOMP_set_offload_targets (const char *);
 extern void GOMP_target (int, void (*) (void *), const void *,
 			 size_t, void **, size_t *, unsigned char *);
 extern void GOMP_target_ext (int, void (*) (void *), size_t, void **, size_t *,
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 915284d..2b24ae7 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -27,47 +27,199 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <assert.h>
+#include <string.h>
 #include "openacc.h"
 #include "libgomp.h"
 #include "oacc-int.h"
 
-int
-acc_async_test (int async)
+static struct goacc_thread *
+get_goacc_thread (void)
 {
-  if (!async_valid_p (async))
-    gomp_fatal ("invalid async argument: %d", async);
-
   struct goacc_thread *thr = goacc_thread ();
 
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
-  return thr->dev->openacc.async_test_func (async);
+  return thr;
+}
+
+static int
+validate_async_val (int async)
+{
+  if (!async_valid_p (async))
+    gomp_fatal ("invalid async-argument: %d", async);
+
+  if (async == acc_async_sync)
+    return -1;
+
+  if (async == acc_async_noval)
+    return 0;
+
+  if (async >= 0)
+    /* TODO: we reserve 0 for acc_async_noval before we can clarify the
+       semantics of "default_async".  */
+    return 1 + async;
+  else
+    __builtin_unreachable ();
+}
+
+/* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
+   might return NULL if no asyncqueue is to be used.  Otherwise, if CREATE,
+   create the asyncqueue if it doesn't exist yet.
+
+   Unless CREATE, this will not generate any OpenACC Profiling Interface
+   events.  */
+
+attribute_hidden struct goacc_asyncqueue *
+lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
+{
+  async = validate_async_val (async);
+  if (async < 0)
+    return NULL;
+
+  struct goacc_asyncqueue *ret_aq = NULL;
+  struct gomp_device_descr *dev = thr->dev;
+
+  gomp_mutex_lock (&dev->openacc.async.lock);
+
+  if (!create
+      && (async >= dev->openacc.async.nasyncqueue
+	  || !dev->openacc.async.asyncqueue[async]))
+    goto end;
+
+  if (async >= dev->openacc.async.nasyncqueue)
+    {
+      int diff = async + 1 - dev->openacc.async.nasyncqueue;
+      dev->openacc.async.asyncqueue
+	= gomp_realloc (dev->openacc.async.asyncqueue,
+			sizeof (goacc_aq) * (async + 1));
+      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
+	      0, sizeof (goacc_aq) * diff);
+      dev->openacc.async.nasyncqueue = async + 1;
+    }
+
+  if (!dev->openacc.async.asyncqueue[async])
+    {
+      dev->openacc.async.asyncqueue[async]
+	= dev->openacc.async.construct_func (dev->target_id);
+
+      if (!dev->openacc.async.asyncqueue[async])
+	{
+	  gomp_mutex_unlock (&dev->openacc.async.lock);
+	  gomp_fatal ("async %d creation failed", async);
+	}
+      
+      /* Link new async queue into active list.  */
+      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+      n->aq = dev->openacc.async.asyncqueue[async];
+      n->next = dev->openacc.async.active;
+      dev->openacc.async.active = n;
+    }
+
+  ret_aq = dev->openacc.async.asyncqueue[async];
+
+ end:
+  gomp_mutex_unlock (&dev->openacc.async.lock);
+  return ret_aq;
+}
+
+/* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
+   might return NULL if no asyncqueue is to be used.  Otherwise, create the
+   asyncqueue if it doesn't exist yet.  */
+
+attribute_hidden struct goacc_asyncqueue *
+get_goacc_asyncqueue (int async)
+{
+  struct goacc_thread *thr = get_goacc_thread ();
+  return lookup_goacc_asyncqueue (thr, true, async);
+}
+
+int
+acc_async_test (int async)
+{
+  struct goacc_thread *thr = goacc_thread ();
+
+  if (!thr || !thr->dev)
+    gomp_fatal ("no device active");
+
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+  if (!aq)
+    return 1;
+
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
+  int res = thr->dev->openacc.async.test_func (aq);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return res;
 }
 
 int
 acc_async_test_all (void)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
 
-  return thr->dev->openacc.async_test_all_func ();
+  int ret = 1;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    if (!thr->dev->openacc.async.test_func (l->aq))
+      {
+	ret = 0;
+	break;
+      }
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return ret;
 }
 
 void
 acc_wait (int async)
 {
-  if (!async_valid_p (async))
-    gomp_fatal ("invalid async argument: %d", async);
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  struct goacc_thread *thr = goacc_thread ();
+  goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+  if (!aq)
+    return;
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
 
-  thr->dev->openacc.async_wait_func (async);
+  if (!thr->dev->openacc.async.synchronize_func (aq))
+    gomp_fatal ("wait on %d failed", async);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.  */
@@ -84,12 +236,47 @@
 void
 acc_wait_async (int async1, int async2)
 {
-  struct goacc_thread *thr = goacc_thread ();
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1);
+  /* TODO: Is this also correct for acc_async_sync, assuming that in this case,
+     we'll always be synchronous anyways?  */
+  if (!aq1)
+    return;
 
-  thr->dev->openacc.async_wait_async_func (async1, async2);
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async2;
+      prof_info.async_queue = prof_info.async;
+    }
+
+  goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
+  /* An async queue is always synchronized with itself.  */
+  if (aq1 == aq2)
+    goto out_prof;
+
+  if (aq2)
+    {
+      if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
+	gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);
+    }
+  else
+    {
+      /* TODO: Local thread synchronization.
+	 Necessary for the "async2 == acc_async_sync" case, or can just skip?  */
+      if (!thr->dev->openacc.async.synchronize_func (aq1))
+	gomp_fatal ("wait on %d failed", async1);
+    }
+
+ out_prof:
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -97,10 +284,24 @@
 {
   struct goacc_thread *thr = goacc_thread ();
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
 
-  thr->dev->openacc.async_wait_all_func ();
+  bool ret = true;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    ret &= thr->dev->openacc.async.synchronize_func (l->aq);
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  if (!ret)
+    gomp_fatal ("wait all failed");
 }
 
 /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.  */
@@ -117,13 +318,88 @@
 void
 acc_wait_all_async (int async)
 {
-  if (!async_valid_p (async))
-    gomp_fatal ("invalid async argument: %d", async);
+  struct goacc_thread *thr = get_goacc_thread ();
 
-  struct goacc_thread *thr = goacc_thread ();
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
 
-  if (!thr || !thr->dev)
-    gomp_fatal ("no device active");
+  goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
 
-  thr->dev->openacc.async_wait_all_async_func (async);
+  bool ret = true;
+  gomp_mutex_lock (&thr->dev->openacc.async.lock);
+  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+    {
+      if (waiting_queue)
+	ret &= thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
+      else
+	/* TODO: Local thread synchronization.
+	   Necessary for the "async2 == acc_async_sync" case, or can just skip?  */
+	ret &= thr->dev->openacc.async.synchronize_func (l->aq);
+    }
+  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  if (!ret)
+    gomp_fatal ("wait all async(%d) failed", async);
+}
+
+attribute_hidden void
+goacc_async_free (struct gomp_device_descr *devicep,
+		  struct goacc_asyncqueue *aq, void *ptr)
+{
+  if (!aq)
+    free (ptr);
+  else
+    devicep->openacc.async.queue_callback_func (aq, free, ptr);
+}
+
+/* This function initializes the asyncqueues for the device specified by
+   DEVICEP.  TODO DEVICEP must be locked on entry, and remains locked on
+   return.  */
+
+attribute_hidden void
+goacc_init_asyncqueues (struct gomp_device_descr *devicep)
+{
+  devicep->openacc.async.nasyncqueue = 0;
+  devicep->openacc.async.asyncqueue = NULL;
+  devicep->openacc.async.active = NULL;
+  gomp_mutex_init (&devicep->openacc.async.lock);
+}
+
+/* This function finalizes the asyncqueues for the device specified by DEVICEP.
+   TODO DEVICEP must be locked on entry, and remains locked on return.  */
+
+attribute_hidden bool
+goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
+{
+  bool ret = true;
+  gomp_mutex_lock (&devicep->openacc.async.lock);
+  if (devicep->openacc.async.nasyncqueue > 0)
+    {
+      goacc_aq_list next;
+      for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
+	{
+	  ret &= devicep->openacc.async.destruct_func (l->aq);
+	  next = l->next;
+	  free (l);
+	}
+      free (devicep->openacc.async.asyncqueue);
+      devicep->openacc.async.nasyncqueue = 0;
+      devicep->openacc.async.asyncqueue = NULL;
+      devicep->openacc.async.active = NULL;
+    }
+  gomp_mutex_unlock (&devicep->openacc.async.lock);
+  gomp_mutex_destroy (&devicep->openacc.async.lock);
+  return ret;
 }
diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c
index 16eb6c3..2eda381 100644
--- a/libgomp/oacc-cuda.c
+++ b/libgomp/oacc-cuda.c
@@ -30,16 +30,30 @@
 #include "config.h"
 #include "libgomp.h"
 #include "oacc-int.h"
+#include <assert.h>
 
 void *
 acc_get_current_cuda_device (void)
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
-    return thr->dev->openacc.cuda.get_current_device_func ();
+    {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
 
-  return NULL;
+      ret = thr->dev->openacc.cuda.get_current_device_func ();
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
+    }
+
+  return ret;
 }
 
 void *
@@ -47,10 +61,23 @@
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
-    return thr->dev->openacc.cuda.get_current_context_func ();
- 
-  return NULL;
+    {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
+      ret = thr->dev->openacc.cuda.get_current_context_func ();
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
+    }
+
+  return ret;
 }
 
 void *
@@ -61,12 +88,37 @@
   if (!async_valid_p (async))
     return NULL;
 
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
-    return thr->dev->openacc.cuda.get_stream_func (async);
- 
-  return NULL;
+    {
+      goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+      if (!aq)
+	return ret;
+
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+      if (profiling_p)
+	{
+	  prof_info.async = async;
+	  prof_info.async_queue = prof_info.async;
+	}
+
+      ret = thr->dev->openacc.cuda.get_stream_func (aq);
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
+    }
+
+  return ret;
 }
 
+/* As of OpenACC 2.6, the return code of this function appears to be
+   unspecified.  We choose to return 1 for success, or 0 for failure.  */
+
 int
 acc_set_cuda_stream (int async, void *stream)
 {
@@ -79,8 +131,40 @@
 
   thr = goacc_thread ();
 
+  int ret = 0;
   if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
-    return thr->dev->openacc.cuda.set_stream_func (async, stream);
+    {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+      if (profiling_p)
+	{
+	  prof_info.async = async;
+	  prof_info.async_queue = prof_info.async;
+	}
 
-  return -1;
+      goacc_aq aq = get_goacc_asyncqueue (async);
+      /* Due to not using an asyncqueue for "acc_async_sync", this cannot be
+	 used to change the CUDA stream associated with "acc_async_sync".  */
+      if (!aq)
+	{
+	  assert (async == acc_async_sync);
+	  gomp_debug (0, "Refusing request to set CUDA stream associated"
+		      " with \"acc_async_sync\"\n");
+	  ret = 0;
+	  goto out_prof;
+	}
+      gomp_mutex_lock (&thr->dev->openacc.async.lock);
+      ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
+      gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+    out_prof:
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
+    }
+
+  return ret;
 }
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 222bfb7..4bc2eeb 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -60,6 +60,27 @@
   return 1;
 }
 
+static union gomp_device_property_value
+host_get_property (int n, int prop)
+{
+  union gomp_device_property_value nullval = { .val = 0 };
+
+  if (n >= host_get_num_devices ())
+    return nullval;
+
+  switch (prop)
+    {
+    case GOMP_DEVICE_PROPERTY_NAME:
+      return (union gomp_device_property_value) { .ptr = "GOMP" };
+    case GOMP_DEVICE_PROPERTY_VENDOR:
+      return (union gomp_device_property_value) { .ptr = "GNU" };
+    case GOMP_DEVICE_PROPERTY_DRIVER:
+      return (union gomp_device_property_value) { .ptr = VERSION };
+    default:
+      return nullval;
+    }
+}
+
 static bool
 host_init_device (int n __attribute__ ((unused)))
 {
@@ -140,55 +161,113 @@
 		   size_t mapnum __attribute__ ((unused)),
 		   void **hostaddrs,
 		   void **devaddrs __attribute__ ((unused)),
-		   int async __attribute__ ((unused)),
-		   unsigned *dims __attribute ((unused)),
+		   unsigned *dims __attribute__ ((unused)),
 		   void *targ_mem_desc __attribute__ ((unused)))
 {
   fn (hostaddrs);
 }
 
 static void
-host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
-				     int async __attribute__ ((unused)))
+host_openacc_async_exec (void (*fn) (void *),
+			 size_t mapnum __attribute__ ((unused)),
+			 void **hostaddrs,
+			 void **devaddrs __attribute__ ((unused)),
+			 unsigned *dims __attribute__ ((unused)),
+			 void *targ_mem_desc __attribute__ ((unused)),
+			 struct goacc_asyncqueue *aq __attribute__ ((unused)))
 {
+  fn (hostaddrs);
 }
 
+static void
+host_openacc_exec_params (void (*fn) (void *),
+			  size_t mapnum __attribute__ ((unused)),
+			  void **hostaddrs,
+			  void **devaddrs __attribute__ ((unused)),
+			  unsigned *dims __attribute__ ((unused)),
+			  void *targ_mem_desc __attribute__ ((unused)))
+{
+  fn (hostaddrs);
+}
+
+static void
+host_openacc_async_exec_params (void (*fn) (void *),
+				size_t mapnum __attribute__ ((unused)),
+				void **hostaddrs,
+				void **devaddrs __attribute__ ((unused)),
+				unsigned *dims __attribute__ ((unused)),
+				void *targ_mem_desc __attribute__ ((unused)),
+				struct goacc_asyncqueue *aq __attribute__ ((unused)))
+{
+  fn (hostaddrs);
+}
+
+
 static int
-host_openacc_async_test (int async __attribute__ ((unused)))
+host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused)))
 {
   return 1;
 }
 
-static int
-host_openacc_async_test_all (void)
+static bool
+host_openacc_async_synchronize (struct goacc_asyncqueue *aq
+				__attribute__ ((unused)))
 {
-  return 1;
+  return true;
+}
+
+static bool
+host_openacc_async_serialize (struct goacc_asyncqueue *aq1
+			      __attribute__ ((unused)),
+			      struct goacc_asyncqueue *aq2
+			      __attribute__ ((unused)))
+{
+  return true;
+}
+
+static bool
+host_openacc_async_host2dev (int ord __attribute__ ((unused)),
+			     void *dst __attribute__ ((unused)),
+			     const void *src __attribute__ ((unused)),
+			     size_t n __attribute__ ((unused)),
+			     bool eph __attribute__ ((unused)),
+			     struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
+{
+  return true;
+}
+
+static bool
+host_openacc_async_dev2host (int ord __attribute__ ((unused)),
+			     void *dst __attribute__ ((unused)),
+			     const void *src __attribute__ ((unused)),
+			     size_t n __attribute__ ((unused)),
+			     struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
+{
+  return true;
 }
 
 static void
-host_openacc_async_wait (int async __attribute__ ((unused)))
+host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
+				   __attribute__ ((unused)),
+				   void (*callback_fn)(void *), void *userptr)
 {
+  callback_fn (userptr);
 }
 
-static void
-host_openacc_async_wait_async (int async1 __attribute__ ((unused)),
-			       int async2 __attribute__ ((unused)))
+static struct goacc_asyncqueue *
+host_openacc_async_construct (int device __attribute__((unused)))
 {
+  /* Non-NULL 0xffff... value as opaque dummy.  */
+  return (struct goacc_asyncqueue *) -1;
 }
 
-static void
-host_openacc_async_wait_all (void)
+static bool
+host_openacc_async_destruct (struct goacc_asyncqueue *aq
+			     __attribute__ ((unused)))
 {
-}
-
-static void
-host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
-{
-}
-
-static void
-host_openacc_async_set_async (int async __attribute__ ((unused)))
-{
+  return true;
 }
 
 static void *
@@ -215,6 +294,7 @@
     .get_caps_func = host_get_caps,
     .get_type_func = host_get_type,
     .get_num_devices_func = host_get_num_devices,
+    .get_property_func = host_get_property,
     .init_device_func = host_init_device,
     .fini_device_func = host_fini_device,
     .version_func = host_version,
@@ -231,23 +311,25 @@
     .state = GOMP_DEVICE_UNINITIALIZED,
 
     .openacc = {
-      .data_environ = NULL,
-
       .exec_func = host_openacc_exec,
-
-      .register_async_cleanup_func = host_openacc_register_async_cleanup,
-
-      .async_test_func = host_openacc_async_test,
-      .async_test_all_func = host_openacc_async_test_all,
-      .async_wait_func = host_openacc_async_wait,
-      .async_wait_async_func = host_openacc_async_wait_async,
-      .async_wait_all_func = host_openacc_async_wait_all,
-      .async_wait_all_async_func = host_openacc_async_wait_all_async,
-      .async_set_async_func = host_openacc_async_set_async,
+      .exec_params_func = host_openacc_exec_params,
 
       .create_thread_data_func = host_openacc_create_thread_data,
       .destroy_thread_data_func = host_openacc_destroy_thread_data,
 
+      .async = {
+	.construct_func = host_openacc_async_construct,
+	.destruct_func = host_openacc_async_destruct,
+	.test_func = host_openacc_async_test,
+	.synchronize_func = host_openacc_async_synchronize,
+	.serialize_func = host_openacc_async_serialize,
+	.queue_callback_func = host_openacc_async_queue_callback,
+	.exec_func = host_openacc_async_exec,
+	.exec_params_func = host_openacc_async_exec_params,
+	.dev2host_func = host_openacc_async_dev2host,
+	.host2dev_func = host_openacc_async_host2dev,
+      },
+
       .cuda = {
 	.get_current_device_func = NULL,
 	.get_current_context_func = NULL,
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 033fac4..beeeb48 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -40,6 +40,11 @@
 
 static gomp_mutex_t acc_device_lock;
 
+static gomp_mutex_t acc_init_state_lock;
+static enum { uninitialized, initializing, initialized } acc_init_state
+  = uninitialized;
+static pthread_t acc_init_thread;
+
 /* A cached version of the dispatcher for the global "current" accelerator type,
    e.g. used as the default when creating new host threads.  This is the
    device-type equivalent of goacc_device_num (which specifies which device to
@@ -87,6 +92,8 @@
 static const char *
 get_openacc_name (const char *name)
 {
+  /* not supported: _acc_device_intel_mic */
+  /* not supported: _acc_device_hsa */
   if (strcmp (name, "nvptx") == 0)
     return "nvidia";
   else
@@ -103,6 +110,9 @@
     case acc_device_host: return "host";
     case acc_device_not_host: return "not_host";
     case acc_device_nvidia: return "nvidia";
+    case acc_device_gcn: return "gcn";
+    case /* not supported */ _acc_device_hsa:
+    case /* not supported */ _acc_device_intel_mic:
     default: gomp_fatal ("unknown device type %u", (unsigned) type);
     }
 }
@@ -114,6 +124,8 @@
 static struct gomp_device_descr *
 resolve_device (acc_device_t d, bool fail_is_error)
 {
+  gomp_debug (0, "%s (%d)\n", __FUNCTION__, (int) d);
+
   acc_device_t d_arg = d;
 
   switch (d)
@@ -122,7 +134,9 @@
       {
 	if (goacc_device_type)
 	  {
-	    /* Lookup the named device.  */
+	    /* Lookup the device that has been explicitly named, so do not pay
+	       attention to gomp_offload_target_available_p.  (That is, hard
+	       error if not actually available.)  */
 	    while (++d != _ACC_device_hwm)
 	      if (dispatchers[d]
 		  && !strcasecmp (goacc_device_type,
@@ -148,8 +162,14 @@
     case acc_device_not_host:
       /* Find the first available device after acc_device_not_host.  */
       while (++d != _ACC_device_hwm)
-	if (dispatchers[d] && dispatchers[d]->get_num_devices_func () > 0)
+	if (dispatchers[d]
+	    && dispatchers[d]->get_num_devices_func () > 0
+	    /* No device has been explicitly named, so pay attention to
+	       gomp_offload_target_available_p, to not decide on an offload
+	       target that we don't have offload data available for.  */
+	    && gomp_offload_target_available_p (dispatchers[d]->type))
 	  goto found;
+      /* No non-host device found.  */
       if (d_arg == acc_device_default)
 	{
 	  d = acc_device_host;
@@ -164,9 +184,6 @@
         return NULL;
       break;
 
-    case acc_device_host:
-      break;
-
     default:
       if (d > _ACC_device_hwm)
 	{
@@ -181,7 +198,8 @@
 
   assert (d != acc_device_none
 	  && d != acc_device_default
-	  && d != acc_device_not_host);
+	  && d != acc_device_not_host
+	  && d < _ACC_device_hwm);
 
   if (dispatchers[d] == NULL && fail_is_error)
     {
@@ -190,6 +208,7 @@
       gomp_fatal ("device type %s not supported", name_of_acc_device_t (d));
     }
 
+  gomp_debug (0, "  %s: %d: %p\n", __FUNCTION__, (int) d, dispatchers[d]);
   return dispatchers[d];
 }
 
@@ -210,11 +229,75 @@
    held before calling this function.  */
 
 static struct gomp_device_descr *
-acc_init_1 (acc_device_t d)
+acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
 {
+  bool check_not_nested_p;
+  if (implicit)
+    {
+      /* In the implicit case, there should (TODO: must?) already be something
+	 have been set up for an outer construct.  */
+      check_not_nested_p = false;
+    }
+  else
+    {
+      check_not_nested_p = true;
+      /* TODO: should we set 'thr->prof_info' etc. in this case ('acc_init')?
+	 The problem is, that we don't have 'thr' yet?  (So,
+	 'check_not_nested_p = true' also is pointless actually.)  */
+    }
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (check_not_nested_p);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_device_init_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = d;
+      prof_info.device_number = goacc_device_num;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info device_init_event_info;
+  if (profiling_p)
+    {
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      device_init_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      device_init_event_info.other_event.parent_construct = parent_construct;
+      device_init_event_info.other_event.implicit = implicit;
+      device_init_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
+
   struct gomp_device_descr *base_dev, *acc_dev;
   int ndevs;
 
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initializing;
+  acc_init_thread = pthread_self ();
+  gomp_mutex_unlock (&acc_init_state_lock);
+
   base_dev = resolve_device (d, true);
 
   ndevs = base_dev->get_num_devices_func ();
@@ -234,6 +317,18 @@
   gomp_init_device (acc_dev);
   gomp_mutex_unlock (&acc_dev->lock);
 
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initialized;
+  gomp_mutex_unlock (&acc_init_state_lock);
+
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_device_init_end;
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &device_init_event_info,
+				&api_info);
+    }
+
   return base_dev;
 }
 
@@ -289,9 +384,13 @@
 
       if (walk->dev)
 	{
-	  gomp_mutex_lock (&walk->dev->lock);
-	  gomp_free_memmap (&walk->dev->mem_map);
-	  gomp_mutex_unlock (&walk->dev->lock);
+	  while (walk->dev->mem_map.root)
+	    {
+	      splay_tree_key k = &walk->dev->mem_map.root->key;
+	      if (k->virtual_refcount == VREFCOUNT_LINK_KEY)
+		k->u.link_key = NULL;
+	      gomp_remove_var (walk->dev, k);
+	    }
 
 	  walk->dev = NULL;
 	  walk->base_dev = NULL;
@@ -309,7 +408,7 @@
       if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
         {
 	  devices_active = true;
-	  ret &= acc_dev->fini_device_func (acc_dev->target_id);
+	  ret &= gomp_fini_device (acc_dev);
 	  acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
 	}
       gomp_mutex_unlock (&acc_dev->lock);
@@ -423,11 +522,13 @@
   thr->dev = acc_dev = &base_dev[ord];
   thr->saved_bound_dev = NULL;
   thr->mapped_data = NULL;
-  
+  thr->prof_info = NULL;
+  thr->api_info = NULL;
+  /* Initially, all callbacks for all events are enabled.  */
+  thr->prof_callbacks_enabled = true;
+
   thr->target_tls
     = acc_dev->openacc.create_thread_data_func (ord);
-  
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 /* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
@@ -439,9 +540,7 @@
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
-
-  cached_base_dev = acc_init_1 (d);
-
+  cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
   gomp_mutex_unlock (&acc_device_lock);
   
   goacc_attach_host_thread_to_device (-1);
@@ -500,6 +599,12 @@
   struct gomp_device_descr *base_dev, *acc_dev;
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    prof_info.device_type = d;
+
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
@@ -524,10 +629,27 @@
     }
 
   goacc_attach_host_thread_to_device (-1);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 ialias (acc_set_device_type)
 
+static bool
+self_initializing_p (void)
+{
+  bool res;
+  gomp_mutex_lock (&acc_init_state_lock);
+  res = (acc_init_state == initializing
+	 && pthread_equal (acc_init_thread, pthread_self ()));
+  gomp_mutex_unlock (&acc_init_state_lock);
+  return res;
+}
+
 acc_device_t
 acc_get_device_type (void)
 {
@@ -537,18 +659,38 @@
 
   if (thr && thr->base_dev)
     res = acc_device_type (thr->base_dev->type);
+  else if (self_initializing_p ())
+    /* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the
+       acc_ev_device_init_start event callback, which is dispatched during
+       acc_init_1.  Trying to lock acc_device_lock during such a call (as we do
+       in the else clause below), will result in deadlock, since the lock has
+       already been taken by the acc_init_1 caller.  We work around this problem
+       by using the acc_get_device_type property "If the device type has not yet
+       been selected, the value acc_device_none may be returned".  */
+    ;
   else
     {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
       gomp_init_targets_once ();
 
       gomp_mutex_lock (&acc_device_lock);
       dev = resolve_device (acc_device_default, true);
       gomp_mutex_unlock (&acc_device_lock);
       res = acc_device_type (dev->type);
+
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
     }
 
   assert (res != acc_device_default
-	  && res != acc_device_not_host);
+	  && res != acc_device_not_host
+	  && res != acc_device_current);
 
   return res;
 }
@@ -564,12 +706,24 @@
   if (d >= _ACC_device_hwm)
     gomp_fatal ("unknown device type %u", (unsigned) d);
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    prof_info.device_type = d;
+
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
   dev = resolve_device (d, true);
   gomp_mutex_unlock (&acc_device_lock);
 
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   if (thr && thr->base_dev == dev && thr->dev)
     return thr->dev->target_id;
 
@@ -622,6 +776,100 @@
 
 ialias (acc_set_device_num)
 
+static union gomp_device_property_value
+get_property_any (int ord, acc_device_t d, acc_device_property_t prop)
+{
+  union gomp_device_property_value propval;
+  struct gomp_device_descr *dev;
+  struct goacc_thread *thr;
+
+  if (d == acc_device_none)
+    return (union gomp_device_property_value) { .val = 0 };
+
+  goacc_lazy_initialize ();
+  thr = goacc_thread ();
+
+  if (d == acc_device_current && (!thr || !thr->dev))
+    return (union gomp_device_property_value) { .val = 0 };
+
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
+  if (d == acc_device_current)
+    {
+      if (profiling_p)
+	{
+	  prof_info.device_type = acc_device_type (thr->dev->type);
+	  prof_info.device_number = thr->dev->target_id;
+	}
+
+      dev = thr->dev;
+    }
+  else
+    {
+      int num_devices;
+
+      if (profiling_p)
+	{
+	  prof_info.device_type = d;
+	  prof_info.device_number = ord;
+	}
+
+      gomp_mutex_lock (&acc_device_lock);
+
+      dev = resolve_device (d, false);
+
+      num_devices = dev->get_num_devices_func ();
+
+      if (num_devices <= 0 || ord >= num_devices)
+        acc_dev_num_out_of_range (d, ord, num_devices);
+
+      dev += ord;
+
+      gomp_mutex_lock (&dev->lock);
+      if (dev->state == GOMP_DEVICE_UNINITIALIZED)
+        gomp_init_device (dev);
+      gomp_mutex_unlock (&dev->lock);
+
+      gomp_mutex_unlock (&acc_device_lock);
+    }
+
+  assert (dev);
+
+  propval = dev->get_property_func (dev->target_id, prop);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return propval;
+}
+
+size_t
+acc_get_property (int ord, acc_device_t d, acc_device_property_t prop)
+{
+  if (prop & GOMP_DEVICE_PROPERTY_STRING_MASK)
+    return 0;
+  else
+    return get_property_any (ord, d, prop).val;
+}
+
+ialias (acc_get_property)
+
+const char *
+acc_get_property_string (int ord, acc_device_t d, acc_device_property_t prop)
+{
+  if (prop & GOMP_DEVICE_PROPERTY_STRING_MASK)
+    return get_property_any (ord, d, prop).ptr;
+  else
+    return NULL;
+}
+
+ialias (acc_get_property_string)
+
 /* For -O and higher, the compiler always attempts to expand acc_on_device, but
    if the user disables the builtin, or calls it via a pointer, we'll need this
    version.
@@ -700,8 +948,13 @@
   if (thr && thr->dev)
     return;
 
+  gomp_init_targets_once ();
+
+  gomp_mutex_lock (&acc_device_lock);
   if (!cached_base_dev)
-    acc_init (acc_device_default);
-  else
-    goacc_attach_host_thread_to_device (-1);
+    cached_base_dev = acc_init_1 (acc_device_default,
+				  acc_construct_parallel, 1);
+  gomp_mutex_unlock (&acc_device_lock);
+
+  goacc_attach_host_thread_to_device (-1);
 }
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 940052b..9dc6c8a 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -40,6 +40,7 @@
 
 #include "openacc.h"
 #include "config.h"
+#include "acc_prof.h"
 #include <stddef.h>
 #include <stdbool.h>
 #include <stdarg.h>
@@ -68,6 +69,12 @@
      strictly push/pop semantics according to lexical scope.  */
   struct target_mem_desc *mapped_data;
 
+  /* Data of the OpenACC Profiling Interface.  */
+  acc_prof_info *prof_info;
+  acc_api_info *api_info;
+  /* Per-thread toggle of OpenACC Profiling Interface callbacks.  */
+  bool prof_callbacks_enabled;
+
   /* These structures form a list: this is the next thread in that list.  */
   struct goacc_thread *next;
 
@@ -75,7 +82,14 @@
   void *target_tls;
 };
 
-#if defined HAVE_TLS || defined USE_EMUTLS
+#ifdef __AMDGCN__
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+  /* Unused in the offload libgomp for OpenACC: return a dummy value.  */
+  return 0;
+}
+#elif defined HAVE_TLS || defined USE_EMUTLS
 extern __thread struct goacc_thread *goacc_tls_data;
 static inline struct goacc_thread *
 goacc_thread (void)
@@ -99,6 +113,13 @@
 void goacc_lazy_initialize (void);
 void goacc_host_init (void);
 
+void goacc_init_asyncqueues (struct gomp_device_descr *);
+bool goacc_fini_asyncqueues (struct gomp_device_descr *);
+void goacc_async_free (struct gomp_device_descr *, struct goacc_asyncqueue *,
+		       void *);
+struct goacc_asyncqueue *get_goacc_asyncqueue (int);
+struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool,
+						  int);
 static inline bool
 async_valid_stream_id_p (int async)
 {
@@ -121,6 +142,28 @@
   return async == acc_async_sync;
 }
 
+
+extern bool goacc_prof_enabled;
+/* Tune for the (very common) case that profiling is not enabled.  */
+#define GOACC_PROF_ENABLED \
+  (__builtin_expect (__atomic_load_n (&goacc_prof_enabled, \
+				      MEMMODEL_ACQUIRE) == true, false))
+
+void goacc_profiling_initialize (void);
+bool _goacc_profiling_dispatch_p (bool);
+/* Tune for the (very common) case that profiling is not enabled.  */
+#define GOACC_PROFILING_DISPATCH_P(...) \
+  (GOACC_PROF_ENABLED \
+   && _goacc_profiling_dispatch_p (__VA_ARGS__))
+bool _goacc_profiling_setup_p (struct goacc_thread *,
+			       acc_prof_info *, acc_api_info *);
+/* Tune for the (very common) case that profiling is not enabled.  */
+#define GOACC_PROFILING_SETUP_P(...) \
+  (GOACC_PROFILING_DISPATCH_P (false) \
+   && _goacc_profiling_setup_p (__VA_ARGS__))
+void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
+			       acc_api_info *);
+
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility pop
 #endif
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 26e1a75..f8c71bf 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -52,6 +52,25 @@
   return key;
 }
 
+/* Helper for lookup_dev.  Iterate over splay tree.  */
+
+static splay_tree_key
+lookup_dev_1 (splay_tree_node node, uintptr_t d, size_t s)
+{
+  splay_tree_key k = &node->key;
+  struct target_mem_desc *t = k->tgt;
+
+  if (d >= t->tgt_start && d + s <= t->tgt_end)
+    return k;
+
+  if (node->left)
+    return lookup_dev_1 (node->left, d, s);
+  if (node->right)
+    return lookup_dev_1 (node->right, d, s);
+
+  return NULL;
+}
+
 /* Return block containing [D->S), or NULL if not contained.
    The list isn't ordered by device address, so we have to iterate
    over the whole array.  This is not expected to be a common
@@ -59,35 +78,12 @@
    remains locked on exit.  */
 
 static splay_tree_key
-lookup_dev (struct target_mem_desc *tgt, void *d, size_t s)
+lookup_dev (splay_tree mem_map, void *d, size_t s)
 {
-  int i;
-  struct target_mem_desc *t;
-
-  if (!tgt)
+  if (!mem_map || !mem_map->root)
     return NULL;
 
-  for (t = tgt; t != NULL; t = t->prev)
-    {
-      if (t->tgt_start <= (uintptr_t) d && t->tgt_end >= (uintptr_t) d + s)
-        break;
-    }
-
-  if (!t)
-    return NULL;
-
-  for (i = 0; i < t->list_count; i++)
-    {
-      void * offset;
-
-      splay_tree_key k = &t->array[i].key;
-      offset = d - t->tgt_start + k->tgt_offset;
-
-      if (k->host_start + offset <= (void *) k->host_end)
-        return k;
-    }
-
-  return NULL;
+  return lookup_dev_1 (mem_map->root, (uintptr_t) d, s);
 }
 
 /* OpenACC is silent on how memory exhaustion is indicated.  We return
@@ -108,7 +104,19 @@
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return malloc (s);
 
-  return thr->dev->alloc_func (thr->dev->target_id, s);
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
+  void *res = thr->dev->alloc_func (thr->dev->target_id, s);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return res;
 }
 
 /* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -131,12 +139,16 @@
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return free (d);
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
   gomp_mutex_lock (&acc_dev->lock);
 
   /* We don't have to call lazy open here, as the ptr value must have
      been returned by acc_malloc.  It's not permitted to pass NULL in
      (unless you got that null from acc_malloc).  */
-  if ((k = lookup_dev (acc_dev->openacc.data_environ, d, 1)))
+  if ((k = lookup_dev (&acc_dev->mem_map, d, 1)))
     {
       void *offset;
 
@@ -151,6 +163,12 @@
 
   if (!acc_dev->free_func (acc_dev->target_id, d))
     gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 static void
@@ -172,18 +190,26 @@
       return;
     }
 
-  if (async > acc_async_sync)
-    thr->dev->openacc.async_set_async_func (async);
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
 
-  bool ret = (from
-	      ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
-	      : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
+  goacc_aq aq = get_goacc_asyncqueue (async);
+  if (from)
+    gomp_copy_dev2host (thr->dev, aq, h, d, s);
+  else
+    gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
 
-  if (async > acc_async_sync)
-    thr->dev->openacc.async_set_async_func (acc_async_sync);
-
-  if (!ret)
-    gomp_fatal ("error in %s", libfnname);
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -228,6 +254,9 @@
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&dev->lock);
 
   n = lookup_host (dev, h, 1);
@@ -265,9 +294,12 @@
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return d;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&acc_dev->lock);
 
-  n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
+  n = lookup_dev (&acc_dev->mem_map, d, 1);
 
   if (!n)
     {
@@ -302,6 +334,9 @@
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h != NULL;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -346,6 +381,10 @@
 	gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
                     (void *)h, (int)s, (void *)d, (int)s);
 
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+
       gomp_mutex_lock (&acc_dev->lock);
 
       if (lookup_host (acc_dev, h, s))
@@ -355,7 +394,7 @@
 		      (int)s);
 	}
 
-      if (lookup_dev (thr->dev->openacc.data_environ, d, s))
+      if (lookup_dev (&thr->dev->mem_map, d, s))
         {
 	  gomp_mutex_unlock (&acc_dev->lock);
 	  gomp_fatal ("device address [%p, +%d] is already mapped", (void *)d,
@@ -367,12 +406,13 @@
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
 			   &kinds, true, GOMP_MAP_VARS_OPENACC);
       tgt->list[0].key->refcount = REFCOUNT_INFINITY;
-    }
 
-  gomp_mutex_lock (&acc_dev->lock);
-  tgt->prev = acc_dev->openacc.data_environ;
-  acc_dev->openacc.data_environ = tgt;
-  gomp_mutex_unlock (&acc_dev->lock);
+      if (profiling_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
+    }
 }
 
 void
@@ -380,6 +420,7 @@
 {
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
+  struct splay_tree_key_s cur_node;
 
   /* No need to call lazy open, as the address must have been mapped.  */
 
@@ -387,12 +428,15 @@
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
-  size_t host_size;
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
 
   gomp_mutex_lock (&acc_dev->lock);
 
-  splay_tree_key n = lookup_host (acc_dev, h, 1);
-  struct target_mem_desc *t;
+  cur_node.host_start = (uintptr_t) h;
+  cur_node.host_end = cur_node.host_start + 1;
+  splay_tree_key n = splay_tree_lookup (&acc_dev->mem_map, &cur_node);
 
   if (!n)
     {
@@ -400,46 +444,33 @@
       gomp_fatal ("%p is not a mapped block", (void *)h);
     }
 
-  host_size = n->host_end - n->host_start;
-
   if (n->host_start != (uintptr_t) h)
     {
+      size_t host_size = n->host_end - n->host_start;
       gomp_mutex_unlock (&acc_dev->lock);
       gomp_fatal ("[%p,%d] surrounds %p",
 		  (void *) n->host_start, (int) host_size, (void *) h);
     }
 
-  /* Mark for removal.  */
-  n->refcount = 1;
+  splay_tree_remove (&acc_dev->mem_map, n);
 
-  t = n->tgt;
+  struct target_mem_desc *tgt = n->tgt;
 
-  if (t->refcount == 2)
+  if (tgt->refcount > 0)
+    tgt->refcount--;
+  else
     {
-      struct target_mem_desc *tp;
-
-      /* This is the last reference, so pull the descriptor off the
-         chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from
-         freeing the device memory. */
-      t->tgt_end = 0;
-      t->to_free = 0;
-
-      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
-	   tp = t, t = t->prev)
-	if (n->tgt == t)
-	  {
-	    if (tp)
-	      tp->prev = t->prev;
-	    else
-	      acc_dev->openacc.data_environ = t->prev;
-
-	    break;
-	  }
+      free (tgt->array);
+      free (tgt);
     }
 
   gomp_mutex_unlock (&acc_dev->lock);
 
-  gomp_unmap_vars (t, true);
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 #define FLAG_PRESENT (1 << 0)
@@ -463,6 +494,15 @@
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -483,11 +523,14 @@
 	  gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
 	}
 
+      assert (n->virtual_refcount != VREFCOUNT_LINK_KEY);
+
       if (n->refcount != REFCOUNT_INFINITY)
 	{
 	  n->refcount++;
-	  n->dynamic_refcount++;
+	  n->virtual_refcount++;
 	}
+
       gomp_mutex_unlock (&acc_dev->lock);
     }
   else if (!(f & FLAG_CREATE))
@@ -497,7 +540,6 @@
     }
   else
     {
-      struct target_mem_desc *tgt;
       size_t mapnum = 1;
       unsigned short kinds;
       void *hostaddrs = h;
@@ -509,26 +551,25 @@
 
       gomp_mutex_unlock (&acc_dev->lock);
 
-      if (async > acc_async_sync)
-	acc_dev->openacc.async_set_async_func (async);
+      goacc_aq aq = get_goacc_asyncqueue (async);
 
-      tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
-			   GOMP_MAP_VARS_OPENACC);
-      /* Initialize dynamic refcount.  */
-      tgt->list[0].key->dynamic_refcount = 1;
-
-      if (async > acc_async_sync)
-	acc_dev->openacc.async_set_async_func (acc_async_sync);
+      gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s, &kinds,
+			   true, GOMP_MAP_VARS_OPENACC_ENTER_DATA);
 
       gomp_mutex_lock (&acc_dev->lock);
-
-      d = tgt->to_free;
-      tgt->prev = acc_dev->openacc.data_environ;
-      acc_dev->openacc.data_environ = tgt;
-
+      n = lookup_host (acc_dev, h, s);
+      assert (n != NULL);
+      d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h
+		    - n->host_start);
       gomp_mutex_unlock (&acc_dev->lock);
     }
 
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   return d;
 }
 
@@ -603,13 +644,21 @@
 {
   size_t host_size;
   splay_tree_key n;
-  void *d;
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -623,8 +672,7 @@
       gomp_fatal ("[%p,%d] is not mapped", (void *)h, (int)s);
     }
 
-  d = (void *) (n->tgt->tgt_start + n->tgt_offset
-		+ (uintptr_t) h - n->host_start);
+  assert (n->virtual_refcount != VREFCOUNT_LINK_KEY);
 
   host_size = n->host_end - n->host_start;
 
@@ -638,55 +686,43 @@
   if (n->refcount == REFCOUNT_INFINITY)
     {
       n->refcount = 0;
-      n->dynamic_refcount = 0;
-    }
-  if (n->refcount < n->dynamic_refcount)
-    {
-      gomp_mutex_unlock (&acc_dev->lock);
-      gomp_fatal ("Dynamic reference counting assert fail\n");
+      n->virtual_refcount = 0;
     }
 
   if (f & FLAG_FINALIZE)
     {
-      n->refcount -= n->dynamic_refcount;
-      n->dynamic_refcount = 0;
+      n->refcount -= n->virtual_refcount;
+      n->virtual_refcount = 0;
     }
-  else if (n->dynamic_refcount)
+
+  if (n->virtual_refcount > 0)
     {
-      n->dynamic_refcount--;
       n->refcount--;
+      n->virtual_refcount--;
     }
+  else if (n->refcount > 0)
+    n->refcount--;
 
   if (n->refcount == 0)
     {
-      if (n->tgt->refcount == 2)
-	{
-	  struct target_mem_desc *tp, *t;
-	  for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
-	       tp = t, t = t->prev)
-	    if (n->tgt == t)
-	      {
-		if (tp)
-		  tp->prev = t->prev;
-		else
-		  acc_dev->openacc.data_environ = t->prev;
-		break;
-	      }
-	}
+      goacc_aq aq = get_goacc_asyncqueue (async);
 
       if (f & FLAG_COPYOUT)
-	{
-	  if (async > acc_async_sync)
-	    acc_dev->openacc.async_set_async_func (async);
-	  acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
-	  if (async > acc_async_sync)
-	    acc_dev->openacc.async_set_async_func (acc_async_sync);
+        {
+	  void *d = (void *) (n->tgt->tgt_start + n->tgt_offset
+			      + (uintptr_t) h - n->host_start);
+	  gomp_copy_dev2host (acc_dev, aq, h, d, s);
 	}
-
-      gomp_remove_var (acc_dev, n);
+      gomp_remove_var_async (acc_dev, n, aq);
     }
 
   gomp_mutex_unlock (&acc_dev->lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -752,6 +788,21 @@
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
+  /* Fortran optional arguments that are non-present result in a
+     null host address here.  This can safely be ignored as it is
+     not possible to 'update' a non-present optional argument.  */
+  if (h == NULL)
+    return;
+
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -765,18 +816,20 @@
   d = (void *) (n->tgt->tgt_start + n->tgt_offset
 		+ (uintptr_t) h - n->host_start);
 
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (async);
+  goacc_aq aq = get_goacc_asyncqueue (async);
 
   if (is_dev)
-    acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
+    gomp_copy_host2dev (acc_dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
   else
-    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
-
-  if (async > acc_async_sync)
-    acc_dev->openacc.async_set_async_func (acc_async_sync);
+    gomp_copy_dev2host (acc_dev, aq, h, d, s);
 
   gomp_mutex_unlock (&acc_dev->lock);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -804,136 +857,188 @@
 }
 
 void
-gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
-			 void *kinds)
+gomp_acc_declare_allocate (bool allocate, size_t mapnum, void **hostaddrs,
+                          size_t *sizes, unsigned short *kinds)
 {
-  struct target_mem_desc *tgt;
-  struct goacc_thread *thr = goacc_thread ();
-  struct gomp_device_descr *acc_dev = thr->dev;
+  gomp_debug (0, "  %s: processing\n", __FUNCTION__);
 
-  if (acc_is_present (*hostaddrs, *sizes))
+  if (allocate)
     {
-      splay_tree_key n;
-      gomp_mutex_lock (&acc_dev->lock);
-      n = lookup_host (acc_dev, *hostaddrs, *sizes);
-      gomp_mutex_unlock (&acc_dev->lock);
+      assert (mapnum == 3);
 
-      tgt = n->tgt;
-      for (size_t i = 0; i < tgt->list_count; i++)
-	if (tgt->list[i].key == n)
-	  {
-	    for (size_t j = 0; j < mapnum; j++)
-	      if (i + j < tgt->list_count && tgt->list[i + j].key)
-		{
-		  tgt->list[i + j].key->refcount++;
-		  tgt->list[i + j].key->dynamic_refcount++;
-		}
-	    return;
-	  }
-      /* Should not reach here.  */
-      gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset");
+      /* Allocate memory for the array data.  */
+      uintptr_t data = (uintptr_t) acc_create (hostaddrs[0], sizes[0]);
+
+      /* Update the PSET.  */
+      acc_update_device (hostaddrs[1], sizes[1]);
+      void *pset = acc_deviceptr (hostaddrs[1]);
+      acc_memcpy_to_device (pset, &data, sizeof (uintptr_t));
+    }
+  else
+    {
+      /* Deallocate memory for the array data.  */
+      void *data = acc_deviceptr (hostaddrs[0]);
+      acc_free (data);
     }
 
-  gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
-		       NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
-  gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
+  gomp_debug (0, "  %s: end\n", __FUNCTION__);
+}
 
-  /* Initialize dynamic refcount.  */
-  tgt->list[0].key->dynamic_refcount = 1;
+void
+gomp_acc_remove_pointer (struct gomp_device_descr *acc_dev, void **hostaddrs,
+			 size_t *sizes, unsigned short *kinds, int async,
+			 bool finalize, int mapnum)
+{
+  struct splay_tree_key_s cur_node;
+  splay_tree_key n;
 
   gomp_mutex_lock (&acc_dev->lock);
-  tgt->prev = acc_dev->openacc.data_environ;
-  acc_dev->openacc.data_environ = tgt;
+
+  for (int i = 0; i < mapnum; i++)
+    {
+      int kind = kinds[i] & 0xff;
+      bool copyfrom = false;
+
+      switch (kind)
+        {
+	case GOMP_MAP_FROM:
+	case GOMP_MAP_FORCE_FROM:
+	case GOMP_MAP_ALWAYS_FROM:
+	  copyfrom = true;
+	  /* Fallthrough.  */
+
+	case GOMP_MAP_TO_PSET:
+	case GOMP_MAP_POINTER:
+	case GOMP_MAP_DELETE:
+	case GOMP_MAP_RELEASE:
+	case GOMP_MAP_DETACH:
+	case GOMP_MAP_FORCE_DETACH:
+	  cur_node.host_start = (uintptr_t) hostaddrs[i];
+	  cur_node.host_end = cur_node.host_start
+			      + ((kind == GOMP_MAP_DETACH
+				  || kind == GOMP_MAP_FORCE_DETACH
+				  || kind == GOMP_MAP_POINTER)
+				 ? sizeof (void *) : sizes[i]);
+	  n = splay_tree_lookup (&acc_dev->mem_map, &cur_node);
+
+	  if (n == NULL)
+	    continue;
+
+	  assert (n->virtual_refcount != VREFCOUNT_LINK_KEY);
+
+	  if (n->refcount == REFCOUNT_INFINITY)
+	    {
+	      n->refcount = 1;
+	      n->virtual_refcount = 0;
+	    }
+
+	  if (finalize)
+	    {
+	      n->refcount -= n->virtual_refcount;
+	      n->virtual_refcount = 0;
+	    }
+
+	  if (n->virtual_refcount > 0)
+	    {
+	      n->refcount--;
+	      n->virtual_refcount--;
+	    }
+	  else if (n->refcount > 0)
+	    n->refcount--;
+
+	  if (copyfrom)
+	    gomp_copy_dev2host (acc_dev, NULL, (void *) cur_node.host_start,
+				(void *) (n->tgt->tgt_start + n->tgt_offset
+					  + cur_node.host_start
+					  - n->host_start),
+				cur_node.host_end - cur_node.host_start);
+
+	  if (n->refcount == 0)
+	    gomp_remove_var (acc_dev, n);
+	  break;
+
+	default:
+	  gomp_mutex_unlock (&acc_dev->lock);
+	  gomp_fatal ("gomp_acc_remove_pointer unhandled kind 0x%.2x",
+		      kind);
+	}
+    }
+
   gomp_mutex_unlock (&acc_dev->lock);
 }
 
 void
-gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async,
-			 int finalize, int mapnum)
+acc_attach_async (void **hostaddr, int async)
 {
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
-  splay_tree_key n;
-  struct target_mem_desc *t;
-  int minrefs = (mapnum == 1) ? 2 : 3;
+  goacc_aq aq = get_goacc_asyncqueue (async);
 
-  if (!acc_is_present (h, s))
+  struct splay_tree_key_s cur_node;
+  splay_tree_key n;
+
+  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
-  gomp_mutex_lock (&acc_dev->lock);
+  cur_node.host_start = (uintptr_t) hostaddr;
+  cur_node.host_end = cur_node.host_start + sizeof (void *);
+  n = splay_tree_lookup (&acc_dev->mem_map, &cur_node);
 
-  n = lookup_host (acc_dev, h, 1);
+  if (n == NULL)
+    gomp_fatal ("struct not mapped for acc_attach");
 
-  if (!n)
-    {
-      gomp_mutex_unlock (&acc_dev->lock);
-      gomp_fatal ("%p is not a mapped block", (void *)h);
-    }
+  gomp_attach_pointer (acc_dev, aq, &acc_dev->mem_map, n, (uintptr_t) hostaddr,
+		       0, NULL);
+}
 
-  gomp_debug (0, "  %s: restore mappings\n", __FUNCTION__);
+void
+acc_attach (void **hostaddr)
+{
+  acc_attach_async (hostaddr, acc_async_sync);
+}
 
-  t = n->tgt;
+static void
+goacc_detach_internal (void **hostaddr, int async, bool finalize)
+{
+  struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
+  struct splay_tree_key_s cur_node;
+  splay_tree_key n;
+  struct goacc_asyncqueue *aq = get_goacc_asyncqueue (async);
 
-  if (n->refcount < n->dynamic_refcount)
-    {
-      gomp_mutex_unlock (&acc_dev->lock);
-      gomp_fatal ("Dynamic reference counting assert fail\n");
-    }
+  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+    return;
 
-  if (finalize)
-    {
-      n->refcount -= n->dynamic_refcount;
-      n->dynamic_refcount = 0;
-    }
-  else if (n->dynamic_refcount)
-    {
-      n->dynamic_refcount--;
-      n->refcount--;
-    }
+  cur_node.host_start = (uintptr_t) hostaddr;
+  cur_node.host_end = cur_node.host_start + sizeof (void *);
+  n = splay_tree_lookup (&acc_dev->mem_map, &cur_node);
 
-  gomp_mutex_unlock (&acc_dev->lock);
+  if (n == NULL)
+    gomp_fatal ("struct not mapped for acc_detach");
 
-  if (n->refcount == 0)
-    {
-      if (t->refcount == minrefs)
-	{
-	  /* This is the last reference, so pull the descriptor off the
-	     chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from
-	     freeing the device memory. */
-	  struct target_mem_desc *tp;
-	  for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
-	       tp = t, t = t->prev)
-	    {
-	      if (n->tgt == t)
-		{
-		  if (tp)
-		    tp->prev = t->prev;
-		  else
-		    acc_dev->openacc.data_environ = t->prev;
-		  break;
-		}
-	    }
-	}
+  gomp_detach_pointer (acc_dev, aq, n, (uintptr_t) hostaddr, finalize, NULL);
+}
 
-      /* Set refcount to 1 to allow gomp_unmap_vars to unmap it.  */
-      n->refcount = 1;
-      t->refcount = minrefs;
-      for (size_t i = 0; i < t->list_count; i++)
-	if (t->list[i].key == n)
-	  {
-	    t->list[i].copy_from = force_copyfrom ? 1 : 0;
-	    break;
-	  }
+void
+acc_detach (void **hostaddr)
+{
+  goacc_detach_internal (hostaddr, acc_async_sync, false);
+}
 
-      /* If running synchronously, unmap immediately.  */
-      if (async < acc_async_noval)
-	gomp_unmap_vars (t, true);
-      else
-	t->device_descr->openacc.register_async_cleanup_func (t, async);
-    }
+void
+acc_detach_async (void **hostaddr, int async)
+{
+  goacc_detach_internal (hostaddr, async, false);
+}
 
-  gomp_mutex_unlock (&acc_dev->lock);
+void
+acc_detach_finalize (void **hostaddr)
+{
+  goacc_detach_internal (hostaddr, acc_async_sync, true);
+}
 
-  gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
+void
+acc_detach_finalize_async (void **hostaddr, int async)
+{
+  goacc_detach_internal (hostaddr, async, true);
 }
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index b77c5e8..a3ec0ed 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -31,6 +31,9 @@
 #include "libgomp_g.h"
 #include "gomp-constants.h"
 #include "oacc-int.h"
+#if USE_LIBFFI
+# include "ffi.h"
+#endif
 #ifdef HAVE_INTTYPES_H
 # include <inttypes.h>  /* For PRIu64.  */
 #endif
@@ -57,12 +60,34 @@
   if (pos + 1 >= mapnum)
     return 0;
 
-  unsigned char kind = kinds[pos+1] & 0xff;
+  unsigned char kind0 = kinds[pos] & 0xff;
 
-  if (kind == GOMP_MAP_TO_PSET)
-    return 3;
-  else if (kind == GOMP_MAP_POINTER)
-    return 2;
+  switch (kind0)
+    {
+    case GOMP_MAP_TO:
+    case GOMP_MAP_FORCE_TO:
+    case GOMP_MAP_FROM:
+    case GOMP_MAP_FORCE_FROM:
+    case GOMP_MAP_TOFROM:
+    case GOMP_MAP_FORCE_TOFROM:
+    case GOMP_MAP_ALLOC:
+    case GOMP_MAP_RELEASE:
+    case GOMP_MAP_DECLARE_ALLOCATE:
+    case GOMP_MAP_DECLARE_DEALLOCATE:
+      {
+	unsigned char kind1 = kinds[pos + 1] & 0xff;
+	if (kind1 == GOMP_MAP_POINTER
+	    || kind1 == GOMP_MAP_ALWAYS_POINTER
+	    || kind1 == GOMP_MAP_ATTACH
+	    || kind1 == GOMP_MAP_DETACH
+	    || kind1 == GOMP_MAP_FORCE_DETACH)
+	  return 2;
+	else if (kind1 == GOMP_MAP_TO_PSET)
+	  return 3;
+      }
+    default:
+      /* empty.  */;
+    }
 
   return 0;
 }
@@ -114,6 +139,91 @@
 
 static void goacc_wait (int async, int num_waits, va_list *ap);
 
+static void
+goacc_call_host_fn (void (*fn) (void *), size_t mapnum, void **hostaddrs,
+		    int params)
+{
+#ifdef USE_LIBFFI
+  ffi_cif cif;
+  ffi_type *arg_types[mapnum];
+  void *arg_values[mapnum];
+  ffi_arg result;
+  int i;
+
+  if (params)
+    {
+      for (i = 0; i < mapnum; i++)
+	{
+	  arg_types[i] = &ffi_type_pointer;
+	  arg_values[i] = &hostaddrs[i];
+	}
+
+      if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, mapnum,
+			&ffi_type_void, arg_types) == FFI_OK)
+	ffi_call (&cif, FFI_FN (fn), &result, arg_values);
+      else
+	abort ();
+    }
+  else
+#endif
+  fn (hostaddrs);
+}
+
+struct async_prof_callback_info {
+  acc_prof_info prof_info;
+  acc_event_info event_info;
+  acc_api_info api_info;
+  struct async_prof_callback_info *start_info;
+};
+
+static void
+async_prof_dispatch (void *ptr)
+{
+  struct async_prof_callback_info *info
+    = (struct async_prof_callback_info *) ptr;
+
+  if (info->start_info)
+    {
+      /* The TOOL_INFO must be preserved from a start event to the
+	 corresponding end event.  Copy that here.  */
+      void *tool_info = info->start_info->event_info.other_event.tool_info;
+      info->event_info.other_event.tool_info = tool_info;
+    }
+
+  goacc_profiling_dispatch (&info->prof_info, &info->event_info,
+			    &info->api_info);
+
+  /* The async_prof_dispatch function is (so far) always used for start/end
+     profiling event pairs: the start and end parts are queued, then each is
+     dispatched (or the dispatches might be interleaved before the end part is
+     queued).
+     In any case, it's not safe to delete either info structure before the
+     whole bracketed event is complete.  */
+
+  if (info->start_info)
+    {
+      free (info->start_info);
+      free (info);
+    }
+}
+
+static struct async_prof_callback_info *
+queue_async_prof_dispatch (struct gomp_device_descr *devicep, goacc_aq aq,
+			   acc_prof_info *prof_info, acc_event_info *event_info,
+			   acc_api_info *api_info,
+			   struct async_prof_callback_info *prev_info)
+{
+  struct async_prof_callback_info *info = malloc (sizeof (*info));
+
+  info->prof_info = *prof_info;
+  info->event_info = *event_info;
+  info->api_info = *api_info;
+  info->start_info = prev_info;
+
+  devicep->openacc.async.queue_callback_func (aq, async_prof_dispatch,
+					      (void *) info);
+  return info;
+}
 
 /* Launch a possibly offloaded function with FLAGS.  FN is the host fn
    address.  MAPNUM, HOSTADDRS, SIZES & KINDS  describe the memory
@@ -121,9 +231,9 @@
    keyed optional parameters terminated with a zero.  */
 
 void
-GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
-		      size_t mapnum, void **hostaddrs, size_t *sizes,
-		      unsigned short *kinds, ...)
+GOACC_parallel_keyed (int flags_m, void (*fn) (void *), size_t mapnum,
+		      void **hostaddrs, size_t *sizes, unsigned short *kinds,
+		      ...)
 {
   int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
 
@@ -139,6 +249,9 @@
   int async = GOMP_ASYNC_SYNC;
   unsigned dims[GOMP_DIM_MAX];
   unsigned tag;
+  bool args_exploded = false;
+  struct async_prof_callback_info *comp_start_info = NULL,
+				  *data_start_info = NULL;
 
 #ifdef HAVE_INTTYPES_H
   gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n",
@@ -152,28 +265,62 @@
   thr = goacc_thread ();
   acc_dev = thr->dev;
 
-  handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
 
-  /* Host fallback if "if" clause is false or if the current device is set to
-     the host.  */
-  if (flags & GOACC_FLAG_HOST_FALLBACK)
+  acc_prof_info prof_info;
+  if (profiling_p)
     {
-      goacc_save_and_set_bind (acc_device_host);
-      fn (hostaddrs);
-      goacc_restore_bind ();
-      return;
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_compute_construct_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
     }
-  else if (acc_device_type (acc_dev->type) == acc_device_host)
+  acc_event_info compute_construct_event_info;
+  if (profiling_p)
     {
-      fn (hostaddrs);
-      return;
+      compute_construct_event_info.other_event.event_type
+	= prof_info.event_type;
+      compute_construct_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      compute_construct_event_info.other_event.parent_construct
+	= acc_construct_parallel;
+      compute_construct_event_info.other_event.implicit = 0;
+      compute_construct_event_info.other_event.tool_info = NULL;
     }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
 
   /* Default: let the runtime choose.  */
   for (i = 0; i != GOMP_DIM_MAX; i++)
     dims[i] = 0;
 
   va_start (ap, kinds);
+
   /* TODO: This will need amending when device_type is implemented.  */
   while ((tag = va_arg (ap, unsigned)) != 0)
     {
@@ -200,6 +347,14 @@
 
 	    if (async == GOMP_LAUNCH_OP_MAX)
 	      async = va_arg (ap, unsigned);
+
+	    /* Set async number in profiling data, unless the device is the
+	       host or we're doing host fallback.  */
+	    if (profiling_p
+	        && !(flags & GOACC_FLAG_HOST_FALLBACK)
+		&& acc_device_type (acc_dev->type) != acc_device_host)
+	      prof_info.async = prof_info.async_queue = async;
+
 	    break;
 	  }
 
@@ -210,14 +365,49 @@
 	    break;
 	  }
 
+	case GOMP_LAUNCH_ARGS_EXPLODED:
+	  args_exploded = true;
+	  break;
+
 	default:
 	  gomp_fatal ("unrecognized offload code '%d',"
 		      " libgomp is too old", GOMP_LAUNCH_CODE (tag));
 	}
     }
   va_end (ap);
-  
-  acc_dev->openacc.async_set_async_func (async);
+
+  goacc_aq aq = get_goacc_asyncqueue (async);
+
+  if (profiling_p)
+    {
+      if (aq)
+	comp_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &compute_construct_event_info,
+				       &api_info, NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				  &api_info);
+    }
+
+  /* Host fallback if "if" clause is false or if the current device is set to
+     the host.  */
+  if (flags & GOACC_FLAG_HOST_FALLBACK)
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+      goacc_save_and_set_bind (acc_device_host);
+      goacc_call_host_fn (fn, mapnum, hostaddrs, args_exploded);
+      goacc_restore_bind ();
+      goto out_prof;
+    }
+  else if (acc_device_type (acc_dev->type) == acc_device_host)
+    {
+      goacc_call_host_fn (fn, mapnum, hostaddrs, args_exploded);
+      goto out_prof;
+    }
+  else if (profiling_p)
+    api_info.device_api = acc_device_api_cuda;
 
   if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
     {
@@ -235,44 +425,134 @@
   else
     tgt_fn = (void (*)) fn;
 
-  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		       GOMP_MAP_VARS_OPENACC);
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_start;
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+	= compute_construct_event_info.other_event.parent_construct;
+      enter_exit_data_event_info.other_event.implicit = 1;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+    }
+
+  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
+			     true, GOMP_MAP_VARS_OPENACC);
+
+#ifdef RC_CHECKING
+  gomp_mutex_lock (&acc_dev->lock);
+  assert (tgt);
+  dump_tgt (__FUNCTION__, tgt);
+  tgt->prev = thr->mapped_data;
+  gomp_rc_check (acc_dev, tgt);
+  gomp_mutex_unlock (&acc_dev->lock);
+#endif
+
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_end;
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+    }
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
-    devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
-			    + tgt->list[i].key->tgt_offset
-			    + tgt->list[i].offset);
+    devaddrs[i] = (void *) gomp_map_val (tgt, hostaddrs, i);
 
-  acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
-			      async, dims, tgt);
-
-  /* If running synchronously, unmap immediately.  */
-  bool copyfrom = true;
-  if (async_synchronous_p (async))
-    gomp_unmap_vars (tgt, true);
+  if (aq == NULL)
+    {
+      if (args_exploded)
+	acc_dev->openacc.exec_params_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+					   dims, tgt);
+      else
+	acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+				    dims, tgt);
+    }
   else
     {
-      bool async_unmap = false;
-      for (size_t i = 0; i < tgt->list_count; i++)
-	{
-	  splay_tree_key k = tgt->list[i].key;
-	  if (k && k->refcount == 1)
-	    {
-	      async_unmap = true;
-	      break;
-	    }
-	}
-      if (async_unmap)
-	tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
+      if (args_exploded)
+	acc_dev->openacc.async.exec_params_func (tgt_fn, mapnum, hostaddrs,
+						 devaddrs, dims, tgt, aq);
       else
-	{
-	  copyfrom = false;
-	  gomp_unmap_vars (tgt, copyfrom);
-	}
+	acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs,
+					  devaddrs, dims, tgt, aq);
     }
 
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_exit_data_start;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+    }
+
+  /* If running synchronously, unmap immediately.  */
+  if (aq == NULL)
+    gomp_unmap_vars (tgt, true);
+  else
+    gomp_unmap_vars_async (tgt, true, aq);
+
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_exit_data_end;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+    }
+
+#ifdef RC_CHECKING
+  gomp_mutex_lock (&acc_dev->lock);
+  gomp_rc_check (acc_dev, thr->mapped_data);
+  gomp_mutex_unlock (&acc_dev->lock);
+#endif
+
+  out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_compute_construct_end;
+      compute_construct_event_info.other_event.event_type
+	= prof_info.event_type;
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &compute_construct_event_info, &api_info,
+				   comp_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				  &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 /* Legacy entry point, only provide host execution.  */
@@ -310,36 +590,194 @@
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_enter_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync; /* Always synchronous.  */
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info enter_data_event_info;
+  if (profiling_p)
+    {
+      enter_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_data_event_info.other_event.parent_construct = acc_construct_data;
+      for (int i = 0; i < mapnum; ++i)
+	if ((kinds[i] & 0xff) == GOMP_MAP_USE_DEVICE_PTR)
+	  {
+	    /* If there is one such data mapping kind, then this is actually an
+	       OpenACC 'host_data' construct.  (GCC maps the OpenACC
+	       'host_data' construct to the OpenACC 'data' construct.)  Apart
+	       from artificial test cases (such as an OpenACC 'host_data'
+	       construct's (implicit) device initialization when there hasn't
+	       been any device data be set up before...), there can't really
+	       any meaningful events be generated from OpenACC 'host_data'
+	       constructs, though.  */
+	    enter_data_event_info.other_event.parent_construct
+	      = acc_construct_host_data;
+	    break;
+	  }
+      enter_data_event_info.other_event.implicit = 0;
+      enter_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
+  handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
+
+  enum gomp_map_vars_kind pragma_kind;
+  if (flags & GOACC_FLAG_HOST_DATA_IF_PRESENT)
+    pragma_kind = GOMP_MAP_VARS_OPENACC_IF_PRESENT;
+  else
+    pragma_kind = GOMP_MAP_VARS_OPENACC;
+
   /* Host fallback or 'do nothing'.  */
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
     {
-      tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
-			   GOMP_MAP_VARS_OPENACC);
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+      tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, pragma_kind);
       tgt->prev = thr->mapped_data;
       thr->mapped_data = tgt;
 
-      return;
+      goto out_prof;
     }
 
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		       GOMP_MAP_VARS_OPENACC);
+		       pragma_kind);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
   tgt->prev = thr->mapped_data;
   thr->mapped_data = tgt;
+
+#ifdef RC_CHECKING
+  gomp_mutex_lock (&acc_dev->lock);
+  gomp_rc_check (acc_dev, thr->mapped_data);
+  gomp_mutex_unlock (&acc_dev->lock);
+#endif
+
+out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_end;
+      enter_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 GOACC_data_end (void)
 {
   struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
   struct target_mem_desc *tgt = thr->mapped_data;
 
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_exit_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync; /* Always synchronous.  */
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info exit_data_event_info;
+  if (profiling_p)
+    {
+      exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      exit_data_event_info.other_event.parent_construct = acc_construct_data;
+      exit_data_event_info.other_event.implicit = 0;
+      exit_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
   gomp_debug (0, "  %s: restore mappings\n", __FUNCTION__);
   thr->mapped_data = tgt->prev;
   gomp_unmap_vars (tgt, true);
   gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
+
+#ifdef RC_CHECKING
+  gomp_mutex_lock (&thr->dev->lock);
+  gomp_rc_check (thr->dev, thr->mapped_data);
+  gomp_mutex_unlock (&thr->dev->lock);
+#endif
+
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_exit_data_end;
+      exit_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -353,15 +791,138 @@
   struct gomp_device_descr *acc_dev;
   bool data_enter = false;
   size_t i;
+  struct async_prof_callback_info *data_start_info = NULL;
 
   goacc_lazy_initialize ();
 
   thr = goacc_thread ();
   acc_dev = thr->dev;
 
+  /* Determine whether "finalize" semantics apply to all mappings of this
+     OpenACC directive.  */
+  bool finalize = false;
+  if (mapnum > 0)
+    {
+      unsigned char kind = kinds[0] & 0xff;
+
+      if (kind == GOMP_MAP_STRUCT || kind == GOMP_MAP_FORCE_PRESENT)
+        kind = kinds[1] & 0xff;
+
+      if (kind == GOMP_MAP_DELETE
+	  || kind == GOMP_MAP_FORCE_FROM)
+	finalize = true;
+    }
+  else if (num_waits == acc_async_noval)
+    acc_wait_all_async (async);
+
+  /* Determine if this is an "acc enter data".  */
+  for (i = 0; i < mapnum; ++i)
+    {
+      unsigned char kind = kinds[i] & 0xff;
+
+      if (kind == GOMP_MAP_POINTER
+	  || kind == GOMP_MAP_TO_PSET
+	  || kind == GOMP_MAP_STRUCT
+	  || kind == GOMP_MAP_FORCE_PRESENT)
+	continue;
+
+      if (kind == GOMP_MAP_FORCE_ALLOC
+	  || kind == GOMP_MAP_ATTACH
+	  || kind == GOMP_MAP_FORCE_TO
+	  || kind == GOMP_MAP_TO
+	  || kind == GOMP_MAP_ALLOC
+	  || kind == GOMP_MAP_DECLARE_ALLOCATE
+	  || kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION)
+	{
+	  data_enter = true;
+	  break;
+	}
+
+      if (kind == GOMP_MAP_RELEASE
+	  || kind == GOMP_MAP_DELETE
+	  || kind == GOMP_MAP_DETACH
+	  || kind == GOMP_MAP_FORCE_DETACH
+	  || kind == GOMP_MAP_FROM
+	  || kind == GOMP_MAP_FORCE_FROM
+	  || kind == GOMP_MAP_DECLARE_DEALLOCATE
+	  || kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
+	break;
+
+      gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
+		      kind);
+    }
+
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type
+	= data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_p)
+    {
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+	= data_enter ? acc_construct_enter_data : acc_construct_exit_data;
+      enter_exit_data_event_info.other_event.implicit = 0;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  goacc_aq aq = get_goacc_asyncqueue (async);
+
+  if (profiling_p)
+    {
+      if (aq)
+	data_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &enter_exit_data_event_info, &api_info,
+				       NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+    }
+
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
-    return;
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+
+      goto out_prof;
+    }
 
   if (num_waits)
     {
@@ -372,47 +933,6 @@
       va_end (ap);
     }
 
-  /* Determine whether "finalize" semantics apply to all mappings of this
-     OpenACC directive.  */
-  bool finalize = false;
-  if (mapnum > 0)
-    {
-      unsigned char kind = kinds[0] & 0xff;
-      if (kind == GOMP_MAP_DELETE
-	  || kind == GOMP_MAP_FORCE_FROM)
-	finalize = true;
-    }
-
-  acc_dev->openacc.async_set_async_func (async);
-
-  /* Determine if this is an "acc enter data".  */
-  for (i = 0; i < mapnum; ++i)
-    {
-      unsigned char kind = kinds[i] & 0xff;
-
-      if (kind == GOMP_MAP_POINTER || kind == GOMP_MAP_TO_PSET)
-	continue;
-
-      if (kind == GOMP_MAP_FORCE_ALLOC
-	  || kind == GOMP_MAP_FORCE_PRESENT
-	  || kind == GOMP_MAP_FORCE_TO
-	  || kind == GOMP_MAP_TO
-	  || kind == GOMP_MAP_ALLOC)
-	{
-	  data_enter = true;
-	  break;
-	}
-
-      if (kind == GOMP_MAP_RELEASE
-	  || kind == GOMP_MAP_DELETE
-	  || kind == GOMP_MAP_FROM
-	  || kind == GOMP_MAP_FORCE_FROM)
-	break;
-
-      gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
-		      kind);
-    }
-
   /* In c, non-pointers and arrays are represented by a single data clause.
      Dynamically allocated arrays and subarrays are represented by a data
      clause followed by an internal GOMP_MAP_POINTER.
@@ -435,13 +955,31 @@
 	    {
 	      switch (kind)
 		{
+		case GOMP_MAP_DECLARE_ALLOCATE:
 		case GOMP_MAP_ALLOC:
 		case GOMP_MAP_FORCE_ALLOC:
-		  acc_create (hostaddrs[i], sizes[i]);
+		  acc_create_async (hostaddrs[i], sizes[i], async);
+		  break;
+		case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
+		  if (hostaddrs[i] != NULL)
+		    acc_create_async (hostaddrs[i], 1, async);
 		  break;
 		case GOMP_MAP_TO:
 		case GOMP_MAP_FORCE_TO:
-		  acc_copyin (hostaddrs[i], sizes[i]);
+		  if (hostaddrs[i])
+		    acc_copyin_async (hostaddrs[i], sizes[i], async);
+		  break;
+		case GOMP_MAP_STRUCT:
+		  {
+		    int elems = sizes[i];
+		    gomp_map_vars_async (acc_dev, aq, elems + 1, &hostaddrs[i],
+					 NULL, &sizes[i], &kinds[i], true,
+					 GOMP_MAP_VARS_OPENACC_ENTER_DATA);
+		    i += elems;
+		  }
+		  break;
+		case GOMP_MAP_ATTACH:
+		case GOMP_MAP_FORCE_PRESENT:
 		  break;
 		default:
 		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -451,8 +989,19 @@
 	    }
 	  else
 	    {
-	      gomp_acc_insert_pointer (pointer, &hostaddrs[i],
-				       &sizes[i], &kinds[i]);
+	      if (kind == GOMP_MAP_DECLARE_ALLOCATE)
+	        gomp_acc_declare_allocate (true, pointer, &hostaddrs[i],
+					   &sizes[i], &kinds[i]);
+	      else
+	        {
+		  for (int j = 0; j < 2; j++)
+		    gomp_map_vars_async (acc_dev, aq,
+					 (j == 0 || pointer == 2) ? 1 : 2,
+					 &hostaddrs[i + j], NULL,
+					 &sizes[i + j], &kinds[i + j], true,
+					 GOMP_MAP_VARS_OPENACC_ENTER_DATA);
+		}
+
 	      /* Increment 'i' by two because OpenACC requires fortran
 		 arrays to be contiguous, so each PSET is associated with
 		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -460,53 +1009,177 @@
 	      i += pointer - 1;
 	    }
 	}
+
+      /* This loop only handles explicit "attach" clauses that are not an
+	 implicit part of a copy{,in,out}, etc. mapping.  */
+      for (i = 0; i < mapnum; i++)
+        {
+	  unsigned char kind = kinds[i] & 0xff;
+
+	  /* Scan for pointers and PSETs.  */
+	  int pointer = find_pointer (i, mapnum, kinds);
+
+	  if (!pointer)
+	    {
+	      if (kind == GOMP_MAP_ATTACH)
+		acc_attach_async (hostaddrs[i], async);
+	      else if (kind == GOMP_MAP_STRUCT)
+	        i += sizes[i];
+	    }
+	  else
+	    i += pointer - 1;
+	}
     }
   else
-    for (i = 0; i < mapnum; ++i)
-      {
-	unsigned char kind = kinds[i] & 0xff;
+    {
+      /* Handle "detach" before copyback/deletion of mapped data.  */
+      for (i = 0; i < mapnum; i++)
+        {
+	  unsigned char kind = kinds[i] & 0xff;
 
-	int pointer = find_pointer (i, mapnum, kinds);
+	  int pointer = find_pointer (i, mapnum, kinds);
 
-	if (!pointer)
-	  {
-	    switch (kind)
-	      {
-	      case GOMP_MAP_RELEASE:
-	      case GOMP_MAP_DELETE:
-		if (acc_is_present (hostaddrs[i], sizes[i]))
+	  if (!pointer)
+	    {
+	      if (kind == GOMP_MAP_DETACH)
+		acc_detach_async (hostaddrs[i], async);
+	      else if (kind == GOMP_MAP_FORCE_DETACH)
+		acc_detach_finalize_async (hostaddrs[i], async);
+	      else if (kind == GOMP_MAP_STRUCT)
+	        i += sizes[i];
+	    }
+	  else
+	    {
+	      unsigned char kind2 = kinds[i + pointer - 1] & 0xff;
+
+	      if (kind2 == GOMP_MAP_DETACH)
+		acc_detach_async (hostaddrs[i + pointer - 1], async);
+	      else if (kind2 == GOMP_MAP_FORCE_DETACH)
+	        acc_detach_finalize_async (hostaddrs[i + pointer - 1], async);
+
+	      i += pointer - 1;
+	    }
+	}
+
+      for (i = 0; i < mapnum; ++i)
+	{
+	  unsigned char kind = kinds[i] & 0xff;
+
+	  int pointer = find_pointer (i, mapnum, kinds);
+
+	  if (!pointer)
+	    {
+	      switch (kind)
+		{
+		case GOMP_MAP_RELEASE:
+		case GOMP_MAP_DELETE:
+		  if (acc_is_present (hostaddrs[i], sizes[i]))
+		    {
+		      if (finalize)
+			acc_delete_finalize_async (hostaddrs[i], sizes[i],
+						   async);
+		      else
+			acc_delete_async (hostaddrs[i], sizes[i], async);
+		    }
+		  break;
+		case GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION:
+		  if (acc_is_present (hostaddrs[i], 1))
+		    {
+		      if (finalize)
+			acc_delete_finalize_async (hostaddrs[i], 1, async);
+		      else
+			acc_delete_async (hostaddrs[i], 1, async);
+		    }
+		  break;
+		case GOMP_MAP_DETACH:
+		case GOMP_MAP_FORCE_DETACH:
+		case GOMP_MAP_FORCE_PRESENT:
+		  break;
+		case GOMP_MAP_DECLARE_DEALLOCATE:
+		case GOMP_MAP_FROM:
+		case GOMP_MAP_FORCE_FROM:
+		  if (finalize)
+		    acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
+		  else
+		    acc_copyout_async (hostaddrs[i], sizes[i], async);
+		  break;
+		case GOMP_MAP_STRUCT:
 		  {
-		    if (finalize)
-		      acc_delete_finalize (hostaddrs[i], sizes[i]);
-		    else
-		      acc_delete (hostaddrs[i], sizes[i]);
+		    int elems = sizes[i];
+		    for (int j = 1; j <= elems; j++)
+		      {
+			struct splay_tree_key_s k;
+			k.host_start = (uintptr_t) hostaddrs[i + j];
+			k.host_end = k.host_start + sizes[i + j];
+			splay_tree_key str;
+			gomp_mutex_lock (&acc_dev->lock);
+			str = splay_tree_lookup (&acc_dev->mem_map, &k);
+			gomp_mutex_unlock (&acc_dev->lock);
+			if (str)
+		          {
+			    assert (str->virtual_refcount
+				    != VREFCOUNT_LINK_KEY);
+			    if (finalize)
+			      {
+				str->refcount -= str->virtual_refcount;
+				str->virtual_refcount = 0;
+			      }
+			    if (str->virtual_refcount > 0)
+			      {
+				str->refcount--;
+				str->virtual_refcount--;
+			      }
+			    else if (str->refcount > 0)
+			      str->refcount--;
+			    if (str->refcount == 0)
+			      gomp_remove_var_async (acc_dev, str, aq);
+			  }
+		      }
+		    i += elems;
 		  }
-		break;
-	      case GOMP_MAP_FROM:
-	      case GOMP_MAP_FORCE_FROM:
-		if (finalize)
-		  acc_copyout_finalize (hostaddrs[i], sizes[i]);
-		else
-		  acc_copyout (hostaddrs[i], sizes[i]);
-		break;
-	      default:
-		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
-			    kind);
-		break;
-	      }
-	  }
-	else
-	  {
-	    bool copyfrom = (kind == GOMP_MAP_FORCE_FROM
-			     || kind == GOMP_MAP_FROM);
-	    gomp_acc_remove_pointer (hostaddrs[i], sizes[i], copyfrom, async,
-				     finalize, pointer);
-	    /* See the above comment.  */
-	    i += pointer - 1;
-	  }
-      }
+		  break;
+		default:
+		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
+			      kind);
+		  break;
+		}
+	    }
+	  else
+	    {
+	      if (kind == GOMP_MAP_DECLARE_DEALLOCATE)
+	        gomp_acc_declare_allocate (false, pointer, &hostaddrs[i],
+					   &sizes[i], &kinds[i]);
+	      else
+		gomp_acc_remove_pointer (acc_dev, &hostaddrs[i], &sizes[i],
+					 &kinds[i], async, finalize, pointer);
+	      i += pointer - 1;
+	    }
+	}
+    }
 
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
+#ifdef RC_CHECKING
+  gomp_mutex_lock (&acc_dev->lock);
+  gomp_rc_check (acc_dev, thr->mapped_data);
+  gomp_mutex_unlock (&acc_dev->lock);
+#endif
+
+ out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type
+	= data_enter ? acc_ev_enter_data_end : acc_ev_exit_data_end;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				   &enter_exit_data_event_info, &api_info,
+				   data_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				  &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 static void
@@ -532,9 +1205,10 @@
       if (async == acc_async_sync)
 	acc_wait (qid);
       else if (qid == async)
-	;/* If we're waiting on the same asynchronous queue as we're
-	    launching on, the queue itself will order work as
-	    required, so there's no need to wait explicitly.  */
+	/* If we're waiting on the same asynchronous queue as we're
+	   launching on, the queue itself will order work as
+	   required, so there's no need to wait explicitly.  */
+	;
       else
 	acc_wait_async (qid, async);
     }
@@ -553,10 +1227,75 @@
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
+  goacc_aq aq = NULL;
+  struct async_prof_callback_info *update_start_info = NULL;
+
+  bool profiling_p = GOACC_PROFILING_DISPATCH_P (true);
+
+  acc_prof_info prof_info;
+  if (profiling_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_update_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL;
+      prof_info.func_name = NULL;
+      prof_info.line_no = -1;
+      prof_info.end_line_no = -1;
+      prof_info.func_line_no = -1;
+      prof_info.func_end_line_no = -1;
+    }
+  acc_event_info update_event_info;
+  if (profiling_p)
+    {
+      update_event_info.other_event.event_type
+	= prof_info.event_type;
+      update_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      update_event_info.other_event.parent_construct = acc_construct_update;
+      update_event_info.other_event.implicit = 0;
+      update_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_p)
+    {
+      aq = get_goacc_asyncqueue (async);
+      if (aq)
+	update_start_info
+	  = queue_async_prof_dispatch (acc_dev, aq, &prof_info,
+				       &update_event_info, &api_info, NULL);
+      else
+	goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+    }
 
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || (flags & GOACC_FLAG_HOST_FALLBACK))
-    return;
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+
+      goto out_prof;
+    }
 
   if (num_waits)
     {
@@ -567,8 +1306,6 @@
       va_end (ap);
     }
 
-  acc_dev->openacc.async_set_async_func (async);
-
   bool update_device = false;
   for (i = 0; i < mapnum; ++i)
     {
@@ -578,6 +1315,7 @@
 	{
 	case GOMP_MAP_POINTER:
 	case GOMP_MAP_TO_PSET:
+	case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
 	  break;
 
 	case GOMP_MAP_ALWAYS_POINTER:
@@ -591,6 +1329,8 @@
 		 the value of the allocated device memory in the
 		 previous pointer.  */
 	      *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
+	      /* TODO: verify that we really cannot use acc_update_device_async
+		 here.  */
 	      acc_update_device (hostaddrs[i], sizeof (uintptr_t));
 
 	      /* Restore the host pointer.  */
@@ -608,7 +1348,7 @@
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_TO:
 	  update_device = true;
-	  acc_update_device (hostaddrs[i], sizes[i]);
+	  acc_update_device_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	case GOMP_MAP_FROM:
@@ -620,7 +1360,7 @@
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_FROM:
 	  update_device = false;
-	  acc_update_self (hostaddrs[i], sizes[i]);
+	  acc_update_self_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	default:
@@ -629,12 +1369,41 @@
 	}
     }
 
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
+ out_prof:
+  if (profiling_p)
+    {
+      prof_info.event_type = acc_ev_update_end;
+      update_event_info.other_event.event_type = prof_info.event_type;
+      if (aq)
+	queue_async_prof_dispatch (acc_dev, aq, &prof_info, &update_event_info,
+				   &api_info, update_start_info);
+      else
+	goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 GOACC_wait (int async, int num_waits, ...)
 {
+  goacc_lazy_initialize ();
+
+  struct goacc_thread *thr = goacc_thread ();
+
+  /* No nesting.  */
+  assert (thr->prof_info == NULL);
+  assert (thr->api_info == NULL);
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
+  if (profiling_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   if (num_waits)
     {
       va_list ap;
@@ -647,6 +1416,12 @@
     acc_wait_all ();
   else
     acc_wait_all_async (async);
+
+  if (profiling_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 int
diff --git a/libgomp/oacc-plugin.c b/libgomp/oacc-plugin.c
index 958cda9..b4f71a1 100644
--- a/libgomp/oacc-plugin.c
+++ b/libgomp/oacc-plugin.c
@@ -29,16 +29,22 @@
 #include "libgomp.h"
 #include "oacc-plugin.h"
 #include "oacc-int.h"
+#include "acc_prof.h"
 
+/* This plugin function is now obsolete.  */
 void
-GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
+GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
+			      int async __attribute__((unused)))
 {
-  struct target_mem_desc *tgt = ptr;
-  struct gomp_device_descr *devicep = tgt->device_descr;
+  gomp_fatal ("invalid plugin function");
+}
 
-  devicep->openacc.async_set_async_func (async);
-  gomp_unmap_vars (tgt, true);
-  devicep->openacc.async_set_async_func (acc_async_sync);
+/* Return the TLS data for the current thread.  */
+
+struct goacc_thread *
+GOMP_PLUGIN_goacc_thread (void)
+{
+  return goacc_thread ();
 }
 
 /* Return the target-specific part of the TLS data for the current thread.  */
@@ -60,3 +66,11 @@
     }
   return goacc_default_dims[i];
 }
+
+void
+GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
+				      acc_event_info *event_info,
+				      acc_api_info *api_info)
+{
+  goacc_profiling_dispatch (prof_info, event_info, api_info);
+}
diff --git a/libgomp/oacc-plugin.h b/libgomp/oacc-plugin.h
index 112d4a59..887c6f6 100644
--- a/libgomp/oacc-plugin.h
+++ b/libgomp/oacc-plugin.h
@@ -27,8 +27,15 @@
 #ifndef OACC_PLUGIN_H
 #define OACC_PLUGIN_H 1
 
+#include "oacc-int.h"
+#include "acc_prof.h"
+
 extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
+extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
 extern void *GOMP_PLUGIN_acc_thread (void);
 extern int GOMP_PLUGIN_acc_default_dim (unsigned int);
+extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
+						  acc_event_info *,
+						  acc_api_info *);
 
 #endif
diff --git a/libgomp/config/nvptx/proc.c b/libgomp/oacc-profiling-acc_register_library.c
similarity index 70%
copy from libgomp/config/nvptx/proc.c
copy to libgomp/oacc-profiling-acc_register_library.c
index 8ca0b0a..f6b482b 100644
--- a/libgomp/config/nvptx/proc.c
+++ b/libgomp/oacc-profiling-acc_register_library.c
@@ -1,5 +1,6 @@
-/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
-   Contributed by Alexander Monakov <amonakov@ispras.ru>
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
 
    This file is part of the GNU Offloading and Multi Processing Library
    (libgomp).
@@ -23,19 +24,16 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This file contains system specific routines related to counting
-   online processors and dynamic load balancing.  */
+/* This file provides an stub acc_register_library function.  It's in a
+   separate file so that this function can easily be overridden when linking
+   statically.  */
 
 #include "libgomp.h"
+#include "acc_prof.h"
 
-unsigned
-gomp_dynamic_max_threads (void)
+void
+acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
+		      acc_prof_lookup_func lookup)
 {
-  return gomp_icv (false)->nthreads_var;
-}
-
-int
-omp_get_num_procs (void)
-{
-  return gomp_icv (false)->nthreads_var;
+  gomp_debug (0, "dummy %s\n", __FUNCTION__);
 }
diff --git a/libgomp/oacc-profiling.c b/libgomp/oacc-profiling.c
new file mode 100644
index 0000000..d54f4b8
--- /dev/null
+++ b/libgomp/oacc-profiling.c
@@ -0,0 +1,670 @@
+/* OpenACC Profiling Interface
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   Contributed by Mentor, a Siemens Business.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define _GNU_SOURCE
+#include "libgomp.h"
+#include "oacc-int.h"
+#include "secure_getenv.h"
+#include "acc_prof.h"
+#include <assert.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#endif
+#ifdef PLUGIN_SUPPORT
+# include <dlfcn.h>
+#endif
+
+#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")")
+
+/* Statically assert that the layout of the common fields in the
+   'acc_event_info' variants matches.  */
+/* 'event_type' */
+STATIC_ASSERT (offsetof (acc_event_info, event_type)
+	       == offsetof (acc_event_info, data_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+	       == offsetof (acc_event_info, launch_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+	       == offsetof (acc_event_info, other_event.event_type));
+/* 'valid_bytes' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+	       == offsetof (acc_event_info, launch_event.valid_bytes));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+	       == offsetof (acc_event_info, other_event.valid_bytes));
+/* 'parent_construct' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+	       == offsetof (acc_event_info, launch_event.parent_construct));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+	       == offsetof (acc_event_info, other_event.parent_construct));
+/* 'implicit' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+	       == offsetof (acc_event_info, launch_event.implicit));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+	       == offsetof (acc_event_info, other_event.implicit));
+/* 'tool_info' */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+	       == offsetof (acc_event_info, launch_event.tool_info));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+	       == offsetof (acc_event_info, other_event.tool_info));
+
+struct goacc_prof_callback_entry
+{
+  acc_prof_callback cb;
+  int ref;
+  bool enabled;
+  struct goacc_prof_callback_entry *next;
+};
+
+/* Use a separate flag to minimize run-time performance impact for the (very
+   common) case that profiling is not enabled.
+
+   Once enabled, we're not going to disable this anymore, anywhere.  We
+   probably could, by adding appropriate logic to 'acc_prof_register',
+   'acc_prof_unregister'.  */
+bool goacc_prof_enabled = false;
+
+/* Global state for registered callbacks.
+   'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle.  */
+static bool goacc_prof_callbacks_enabled[acc_ev_last];
+static struct goacc_prof_callback_entry *goacc_prof_callback_entries[acc_ev_last];
+/* Lock used to protect access to 'goacc_prof_callbacks_enabled', and
+   'goacc_prof_callback_entries'.  */
+static gomp_mutex_t goacc_prof_lock;
+
+void
+goacc_profiling_initialize (void)
+{
+  gomp_mutex_init (&goacc_prof_lock);
+
+  /* Initially, all callbacks for all events are enabled.  */
+  for (int i = 0; i < acc_ev_last; ++i)
+    goacc_prof_callbacks_enabled[i] = true;
+
+  /* We are to invoke an external acc_register_library routine, defaulting to
+     our stub oacc-profiling-acc_register_library.c:acc_register_library
+     implementation.  */
+  gomp_debug (0, "%s: calling acc_register_library\n", __FUNCTION__);
+  //TODO.
+  acc_register_library (acc_prof_register, acc_prof_unregister, NULL);
+#ifdef PLUGIN_SUPPORT
+  char *acc_proflibs = secure_getenv ("ACC_PROFLIB");
+  while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
+    {
+      char *acc_proflibs_sep = strchr (acc_proflibs, ';');
+      char *acc_proflib;
+      if (acc_proflibs_sep == acc_proflibs)
+	{
+	  /* Stray ';' separator: make sure we don't 'dlopen' the main
+	     program.  */
+	  acc_proflib = NULL;
+	}
+      else
+	{
+	  if (acc_proflibs_sep != NULL)
+	    {
+	      /* Single out the first library.  */
+	      acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1);
+	      memcpy (acc_proflib, acc_proflibs,
+		      acc_proflibs_sep - acc_proflibs);
+	      acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0';
+	    }
+	  else
+	    {
+	      /* No ';' separator, so only one library.  */
+	      acc_proflib = acc_proflibs;
+	    }
+
+	  gomp_debug (0, "%s: dlopen (\"%s\")\n", __FUNCTION__, acc_proflib);
+	  void *dl_handle = dlopen (acc_proflib, RTLD_LAZY);
+	  if (dl_handle != NULL)
+	    {
+	      typeof (&acc_register_library) a_r_l
+		= dlsym (dl_handle, "acc_register_library");
+	      if (a_r_l == NULL)
+		goto dl_fail;
+	      /* Avoid duplicate registration, for example if the same shared
+		 library is specified in LD_PRELOAD and ACC_PROFLIB -- which
+		 TAU 2.26 does when using "tau_exec -openacc".  */
+	      if (a_r_l != acc_register_library)
+		{
+		  gomp_debug (0, "  %s: calling %s:acc_register_library\n",
+			      __FUNCTION__, acc_proflib);
+		  //TODO.
+		  a_r_l (acc_prof_register, acc_prof_unregister, NULL);
+		}
+	      else
+		gomp_debug (0, "  %s: skipping duplicate"
+			    " %s:acc_register_library\n",
+			    __FUNCTION__, acc_proflib);
+	    }
+	  else
+	    {
+	    dl_fail:
+	      gomp_error ("while loading ACC_PROFLIB \"%s\": %s",
+			  acc_proflib, dlerror ());
+	      if (dl_handle != NULL)
+		{
+		  int err = dlclose (dl_handle);
+		  dl_handle = NULL;
+		  if (err != 0)
+		    goto dl_fail;
+		}
+	    }
+	}
+
+      if (acc_proflib != acc_proflibs)
+	{
+	  free (acc_proflib);
+
+	  acc_proflibs = acc_proflibs_sep + 1;
+	}
+      else
+	acc_proflibs = NULL;
+    }
+#endif /* PLUGIN_SUPPORT */
+}
+
+void
+acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+	      __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+
+  /* For any events to be dispatched, the user first has to register a
+     callback, which makes this here a good place for enabling the whole
+     machinery.  */
+  if (!GOACC_PROF_ENABLED)
+    __atomic_store_n (&goacc_prof_enabled, true, MEMMODEL_RELEASE);
+
+
+  enum
+  {
+    EVENT_KIND_BOGUS,
+    EVENT_KIND_NORMAL,
+    /* As end events invoke callbacks in the reverse order, we register these
+       in the reverse order here.  */
+    EVENT_KIND_END,
+  } event_kind = EVENT_KIND_BOGUS;
+  switch (ev)
+    {
+    case acc_ev_none:
+    case acc_ev_device_init_start:
+    case acc_ev_device_shutdown_start:
+    case acc_ev_runtime_shutdown:
+    case acc_ev_create:
+    case acc_ev_delete:
+    case acc_ev_alloc:
+    case acc_ev_free:
+    case acc_ev_enter_data_start:
+    case acc_ev_exit_data_start:
+    case acc_ev_update_start:
+    case acc_ev_compute_construct_start:
+    case acc_ev_enqueue_launch_start:
+    case acc_ev_enqueue_upload_start:
+    case acc_ev_enqueue_download_start:
+    case acc_ev_wait_start:
+      event_kind = EVENT_KIND_NORMAL;
+      break;
+    case acc_ev_device_init_end:
+    case acc_ev_device_shutdown_end:
+    case acc_ev_enter_data_end:
+    case acc_ev_exit_data_end:
+    case acc_ev_update_end:
+    case acc_ev_compute_construct_end:
+    case acc_ev_enqueue_launch_end:
+    case acc_ev_enqueue_upload_end:
+    case acc_ev_enqueue_download_end:
+    case acc_ev_wait_end:
+      event_kind = EVENT_KIND_END;
+      break;
+    case acc_ev_last:
+      break;
+    }
+  if (event_kind == EVENT_KIND_BOGUS)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request for bogus 'acc_event_t'\n");
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request with bogus 'acc_register_t'\n");
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+	{
+	  gomp_debug (0, "  globally enabling callbacks\n");
+	  gomp_mutex_lock (&goacc_prof_lock);
+	  /* For 'acc_ev_none', this acts as a global toggle.  */
+	  goacc_prof_callbacks_enabled[ev] = true;
+	  gomp_mutex_unlock (&goacc_prof_lock);
+	  return;
+	}
+      else if (ev == acc_ev_none && cb != NULL)
+	{
+	  gomp_debug (0, "  ignoring request\n");
+	  return;
+	}
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+	{
+	  gomp_debug (0, "  thread: enabling callbacks\n");
+	  goacc_lazy_initialize ();
+	  struct goacc_thread *thr = goacc_thread ();
+	  thr->prof_callbacks_enabled = true;
+	  return;
+	}
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring bogus request\n");
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+	break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      /* If we already have this callback registered, just increment its
+	 reference count.  */
+      if (it != NULL)
+	{
+	  it->ref++;
+	  gomp_debug (0, "  already registered;"
+		      " incrementing reference count to: %d\n", it->ref);
+	}
+      else
+	{
+	  struct goacc_prof_callback_entry *e
+	    = gomp_malloc (sizeof (struct goacc_prof_callback_entry));
+	  e->cb = cb;
+	  e->ref = 1;
+	  e->enabled = true;
+	  bool prepend = (event_kind == EVENT_KIND_END);
+	  /* If we don't have any callback registered yet, also use the
+	     'prepend' code path.  */
+	  if (it_p == NULL)
+	    prepend = true;
+	  if (prepend)
+	    {
+	      gomp_debug (0, "  prepending\n");
+	      e->next = goacc_prof_callback_entries[ev];
+	      goacc_prof_callback_entries[ev] = e;
+	    }
+	  else
+	    {
+	      gomp_debug (0, "  appending\n");
+	      e->next = NULL;
+	      it_p->next = e;
+	    }
+	}
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+	{
+	  gomp_debug (0, "  ignoring request: is not registered\n");
+	  break;
+	}
+      else
+	{
+	  gomp_debug (0, "  enabling\n");
+	  it->enabled = true;
+	}
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+void
+acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+	      __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+  /* If profiling is not enabled, there cannot be anything to unregister.  */
+  if (!GOACC_PROF_ENABLED)
+    return;
+
+  if (ev < acc_ev_none
+      || ev >= acc_ev_last)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request for bogus 'acc_event_t'\n");
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring request with bogus 'acc_register_t'\n");
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+	{
+	  gomp_debug (0, "  globally disabling callbacks\n");
+	  gomp_mutex_lock (&goacc_prof_lock);
+	  /* For 'acc_ev_none', this acts as a global toggle.  */
+	  goacc_prof_callbacks_enabled[ev] = false;
+	  gomp_mutex_unlock (&goacc_prof_lock);
+	  return;
+	}
+      else if (ev == acc_ev_none && cb != NULL)
+	{
+	  gomp_debug (0, "  ignoring request\n");
+	  return;
+	}
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+	{
+	  gomp_debug (0, "  thread: disabling callbacks\n");
+	  goacc_lazy_initialize ();
+	  struct goacc_thread *thr = goacc_thread ();
+	  thr->prof_callbacks_enabled = false;
+	  return;
+	}
+      /* Silently ignore.  */
+      gomp_debug (0, "  ignoring bogus request\n");
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+	break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      if (it == NULL)
+	{
+	  /* Silently ignore.  */
+	  gomp_debug (0, "  ignoring bogus request: is not registered\n");
+	  break;
+	}
+      it->ref--;
+      gomp_debug (0, "  decrementing reference count to: %d\n", it->ref);
+      if (it->ref == 0)
+	{
+	  if (it_p == NULL)
+	    goacc_prof_callback_entries[ev] = it->next;
+	  else
+	    it_p->next = it->next;
+	  free (it);
+	}
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+	{
+	  gomp_debug (0, "  ignoring request: is not registered\n");
+	  break;
+	}
+      else
+	{
+	  gomp_debug (0, "  disabling\n");
+	  it->enabled = false;
+	}
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+acc_query_fn
+acc_prof_lookup (const char *name)
+{
+  gomp_debug (0, "%s (%s)\n",
+	      __FUNCTION__, name ?: "NULL");
+
+  return NULL;
+}
+
+/* Prepare to dispatch events?  */
+
+bool
+_goacc_profiling_dispatch_p (bool check_not_nested_p)
+{
+  gomp_debug (0, "%s\n", __FUNCTION__);
+
+  bool ret;
+
+  struct goacc_thread *thr = goacc_thread ();
+  if (__builtin_expect (thr == NULL, false))
+    {
+      /* If we don't have any per-thread state yet, that means that per-thread
+	 callback dispatch has not been explicitly disabled (which only a call
+	 to 'acc_prof_unregister' with 'acc_toggle_per_thread' would do, and
+	 that would have allocated per-thread state via
+	 'goacc_lazy_initialize'); initially, all callbacks for all events are
+	 enabled.  */
+      gomp_debug (0, "  %s: don't have any per-thread state yet\n", __FUNCTION__);
+    }
+  else
+    {
+      if (check_not_nested_p)
+	{
+	  /* No nesting.  */
+	  assert (thr->prof_info == NULL);
+	  assert (thr->api_info == NULL);
+	}
+
+      if (__builtin_expect (!thr->prof_callbacks_enabled, true))
+	{
+	  gomp_debug (0, "  %s: disabled for this thread\n", __FUNCTION__);
+	  ret = false;
+	  goto out;
+	}
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  /* 'goacc_prof_callbacks_enabled[acc_ev_none]' acts as a global toggle.  */
+  if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true))
+    {
+      gomp_debug (0, "  %s: disabled globally\n", __FUNCTION__);
+      ret = false;
+      goto out_unlock;
+    }
+  else
+    ret = true;
+
+ out_unlock:
+  gomp_mutex_unlock (&goacc_prof_lock);
+
+ out:
+  return ret;
+}
+
+/* Set up to dispatch events?  */
+
+bool
+_goacc_profiling_setup_p (struct goacc_thread *thr,
+			  acc_prof_info *prof_info, acc_api_info *api_info)
+{
+  gomp_debug (0, "%s (%p)\n", __FUNCTION__, thr);
+
+  /* If we don't have any per-thread state yet, we can't register 'prof_info'
+     and 'api_info'.  */
+  if (__builtin_expect (thr == NULL, false))
+    {
+      gomp_debug (0, "Can't dispatch OpenACC Profiling Interface events for"
+		  " the current call, construct, or directive\n");
+      return false;
+    }
+
+  if (thr->prof_info != NULL)
+    {
+      /* Profiling has already been set up for an outer construct.  In this
+	 case, we continue to use the existing information, and thus return
+	 'false' here.
+
+	 This can happen, for example, for an 'enter data' directive, which
+	 sets up profiling, then calls into 'acc_copyin', which should not
+	 again set up profiling, should not overwrite the existing
+	 information.  */
+      return false;
+    }
+
+  thr->prof_info = prof_info;
+  thr->api_info = api_info;
+
+  /* Fill in some defaults.  */
+
+  prof_info->event_type = -1; /* Must be set later.  */
+  prof_info->valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+  prof_info->version = _ACC_PROF_INFO_VERSION;
+  if (thr->dev)
+    {
+      prof_info->device_type = acc_device_type (thr->dev->type);
+      prof_info->device_number = thr->dev->target_id;
+    }
+  else
+    {
+      prof_info->device_type = -1;
+      prof_info->device_number = -1;
+    }
+  prof_info->thread_id = -1;
+  prof_info->async = acc_async_sync;
+  prof_info->async_queue = prof_info->async;
+  prof_info->src_file = NULL;
+  prof_info->func_name = NULL;
+  prof_info->line_no = -1;
+  prof_info->end_line_no = -1;
+  prof_info->func_line_no = -1;
+  prof_info->func_end_line_no = -1;
+
+  api_info->device_api = acc_device_api_none;
+  api_info->valid_bytes = _ACC_API_INFO_VALID_BYTES;
+  api_info->device_type = prof_info->device_type;
+  api_info->vendor = -1;
+  api_info->device_handle = NULL;
+  api_info->context_handle = NULL;
+  api_info->async_handle = NULL;
+
+  return true;
+}
+
+/* Dispatch events.
+
+   This must only be called if 'GOACC_PROFILING_DISPATCH_P' or
+   'GOACC_PROFILING_SETUP_P' returned a true result.  */
+
+void
+goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info,
+			  acc_api_info *apt_info)
+{
+  acc_event_t event_type = event_info->event_type;
+  gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type);
+  assert (event_type > acc_ev_none
+	  && event_type < acc_ev_last);
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  if (!goacc_prof_callbacks_enabled[event_type])
+    {
+      gomp_debug (0, "  disabled for this event type\n");
+
+      goto out_unlock;
+    }
+
+  for (struct goacc_prof_callback_entry *e
+	 = goacc_prof_callback_entries[event_type];
+       e != NULL;
+       e = e->next)
+    {
+      if (!e->enabled)
+	{
+	  gomp_debug (0, "  disabled for callback %p\n", e->cb);
+	  continue;
+	}
+
+      gomp_debug (0, "  calling callback %p\n", e->cb);
+      e->cb (prof_info, event_info, apt_info);
+    }
+
+ out_unlock:
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in
index d7ac714..0d8f476 100644
--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -188,6 +188,18 @@
 extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW;
 extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW;
 
+/************************************************************************/
+/* Libgomp extensions.                                                  */
+
+#include <stdint.h>
+
+/* Print a message, and value, possibly from a GPU-offloaded function.
+   Primarily intended for debug messages.
+   Maximum message & value length is 128 bytes.  */
+void gomp_print_string (const char *msg, const char *value);
+void gomp_print_integer (const char *msg, int64_t value);
+void gomp_print_double (const char *msg, double value);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in
index c4e7c0d..65aec54 100644
--- a/libgomp/omp_lib.f90.in
+++ b/libgomp/omp_lib.f90.in
@@ -484,4 +484,30 @@
           end function
         end interface
 
+        ! Libgomp extensions.
+
+        interface
+          subroutine gomp_print_string (msg, str) bind(C)
+            use iso_c_binding, only: c_char
+            character (kind=c_char) :: msg(*)
+            character (kind=c_char) :: str(*)
+          end subroutine gomp_print_string
+        end interface
+
+        interface
+          subroutine gomp_print_integer (msg, i) bind(C)
+            use iso_c_binding, only: c_char, c_int64_t
+            character (kind=c_char) :: msg(*)
+            integer (kind=c_int64_t), value :: i
+          end subroutine gomp_print_integer
+        end interface
+
+        interface
+          subroutine gomp_print_double (msg, d) bind(C)
+            use iso_c_binding, only: c_char, c_double
+            character (kind=c_char) :: msg(*)
+            real (kind=c_double), value :: d
+          end subroutine gomp_print_double
+        end interface
+
       end module omp_lib
diff --git a/libgomp/openacc.f90 b/libgomp/openacc.f90
index bc20545..e675843 100644
--- a/libgomp/openacc.f90
+++ b/libgomp/openacc.f90
@@ -28,7 +28,7 @@
 !  <http://www.gnu.org/licenses/>.
 
 module openacc_kinds
-  use iso_fortran_env, only: int32
+  use iso_fortran_env, only: int32, int64
   implicit none
 
   private :: int32
@@ -37,7 +37,7 @@
   integer, parameter :: acc_device_kind = int32
 
   public :: acc_device_none, acc_device_default, acc_device_host
-  public :: acc_device_not_host, acc_device_nvidia
+  public :: acc_device_not_host, acc_device_nvidia, acc_device_gcn
 
   ! Keep in sync with include/gomp-constants.h.
   integer (acc_device_kind), parameter :: acc_device_none = 0
@@ -46,6 +46,23 @@
   ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
   integer (acc_device_kind), parameter :: acc_device_not_host = 4
   integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+  integer (acc_device_kind), parameter :: acc_device_hsa = 7
+  integer (acc_device_kind), parameter :: acc_device_gcn = 8
+  integer (acc_device_kind), parameter :: acc_device_current = 9
+
+  public :: acc_device_property
+
+  integer, parameter :: acc_device_property = int64
+
+  public :: acc_property_memory, acc_property_free_memory
+  public :: acc_property_name, acc_property_vendor, acc_property_driver
+
+  ! Keep in sync with include/gomp-constants.h.
+  integer (acc_device_property), parameter :: acc_property_memory = 1
+  integer (acc_device_property), parameter :: acc_property_free_memory = 2
+  integer (acc_device_property), parameter :: acc_property_name = Z'10001'
+  integer (acc_device_property), parameter :: acc_property_vendor = Z'10002'
+  integer (acc_device_property), parameter :: acc_property_driver = Z'10003'
 
   public :: acc_handle_kind
 
@@ -86,6 +103,22 @@
       integer (acc_device_kind) d
     end subroutine
 
+    function acc_get_property_h (n, d, p)
+      import
+      integer (acc_device_property) :: acc_get_property_h
+      integer, value :: n
+      integer (acc_device_kind), value :: d
+      integer (acc_device_property), value :: p
+    end function
+
+    subroutine acc_get_property_string_h (n, d, p, s)
+      import
+      integer, value :: n
+      integer (acc_device_kind), value :: d
+      integer (acc_device_property), value :: p
+      character (*) :: s
+    end subroutine
+
     function acc_get_device_num_h (d)
       import
       integer acc_get_device_num_h
@@ -511,6 +544,24 @@
       integer (c_int), value :: d
     end function
 
+    function acc_get_property_l (n, d, p) &
+        bind (C, name = "acc_get_property")
+      use iso_c_binding, only: c_int, c_size_t
+      integer (c_size_t) :: acc_get_property_l
+      integer (c_int), value :: n
+      integer (c_int), value :: d
+      integer (c_int), value :: p
+    end function
+
+    function acc_get_property_string_l (n, d, p) &
+        bind (C, name = "acc_get_property_string")
+      use iso_c_binding, only: c_int, c_ptr
+      type (c_ptr) :: acc_get_property_string_l
+      integer (c_int), value :: n
+      integer (c_int), value :: d
+      integer (c_int), value :: p
+    end function
+
     function acc_async_test_l (a) &
         bind (C, name = "acc_async_test")
       use iso_c_binding, only: c_int
@@ -730,7 +781,7 @@
   public :: acc_copyin_async, acc_create_async, acc_copyout_async
   public :: acc_delete_async, acc_update_device_async, acc_update_self_async
 
-  integer, parameter :: openacc_version = 201306
+  integer, parameter :: openacc_version = 201711
 
   interface acc_get_num_devices
     procedure :: acc_get_num_devices_h
@@ -752,6 +803,14 @@
     procedure :: acc_get_device_num_h
   end interface
 
+  interface acc_get_property
+    procedure :: acc_get_property_h
+  end interface
+
+  interface acc_get_property_string
+    procedure :: acc_get_property_string_h
+  end interface
+
   interface acc_async_test
     procedure :: acc_async_test_h
   end interface
@@ -932,6 +991,19 @@
 
 end module
 
+module openacc_c_string
+  implicit none
+
+  interface
+    function strlen (s) bind (C, name = "strlen")
+      use iso_c_binding, only: c_ptr, c_size_t
+      type (c_ptr), intent(in), value :: s
+      integer (c_size_t) :: strlen
+    end function
+  end interface
+
+end module
+
 function acc_get_num_devices_h (d)
   use openacc_internal, only: acc_get_num_devices_l
   use openacc_kinds
@@ -970,6 +1042,50 @@
   acc_get_device_num_h = acc_get_device_num_l (d)
 end function
 
+function acc_get_property_h (n, d, p)
+  use iso_c_binding, only: c_int
+  use openacc_internal, only: acc_get_property_l
+  use openacc_kinds
+  integer (acc_device_property) :: acc_get_property_h
+  integer, value :: n
+  integer (acc_device_kind), value :: d
+  integer (acc_device_property), value :: p
+
+  integer (c_int) :: pint
+
+  pint = int (p, c_int)
+  acc_get_property_h = acc_get_property_l (n, d, pint)
+end function
+
+subroutine acc_get_property_string_h (n, d, p, s)
+  use iso_c_binding, only: c_char, c_int, c_ptr, c_f_pointer
+  use openacc_internal, only: acc_get_property_string_l
+  use openacc_c_string, only: strlen
+  use openacc_kinds
+  integer, value :: n
+  integer (acc_device_kind), value :: d
+  integer (acc_device_property), value :: p
+  character (*) :: s
+
+  integer (c_int) :: pint
+  type (c_ptr) :: cptr
+  integer :: clen
+  character (kind=c_char, len=1), pointer :: sptr (:)
+  integer :: slen
+  integer :: i
+
+  pint = int (p, c_int)
+  cptr = acc_get_property_string_l (n, d, pint)
+  clen = int (strlen (cptr))
+  call c_f_pointer (cptr, sptr, [clen])
+
+  s = ""
+  slen = min (clen, len (s))
+  do i = 1, slen
+    s (i:i) = sptr (i)
+  end do
+end subroutine
+
 function acc_async_test_h (a)
   use openacc_internal, only: acc_async_test_l
   logical acc_async_test_h
diff --git a/libgomp/openacc.h b/libgomp/openacc.h
index 1bbe6c9..83e7c46 100644
--- a/libgomp/openacc.h
+++ b/libgomp/openacc.h
@@ -55,12 +55,26 @@
   /* acc_device_host_nonshm = 3 removed.  */
   acc_device_not_host = 4,
   acc_device_nvidia = 5,
+  /* not supported */ _acc_device_intel_mic = 6,
+  /* not supported */ _acc_device_hsa = 7,
+  acc_device_gcn = 8,
+  acc_device_current = 9,
   _ACC_device_hwm,
   /* Ensure enumeration is layout compatible with int.  */
   _ACC_highest = __INT_MAX__,
   _ACC_neg = -1
 } acc_device_t;
 
+typedef enum acc_device_property_t {
+  /* Keep in sync with include/gomp-constants.h.  */
+  /* Start from 1 to catch uninitialized use.  */
+  acc_property_memory = 1,
+  acc_property_free_memory = 2,
+  acc_property_name = 0x10001,
+  acc_property_vendor = 0x10002,
+  acc_property_driver = 0x10003
+} acc_device_property_t;
+
 typedef enum acc_async_t {
   /* Keep in sync with include/gomp-constants.h.  */
   acc_async_noval = -1,
@@ -72,6 +86,10 @@
 acc_device_t acc_get_device_type (void) __GOACC_NOTHROW;
 void acc_set_device_num (int, acc_device_t) __GOACC_NOTHROW;
 int acc_get_device_num (acc_device_t) __GOACC_NOTHROW;
+size_t acc_get_property
+  (int, acc_device_t, acc_device_property_t) __GOACC_NOTHROW;
+const char *acc_get_property_string
+  (int, acc_device_t, acc_device_property_t) __GOACC_NOTHROW;
 int acc_async_test (int) __GOACC_NOTHROW;
 int acc_async_test_all (void) __GOACC_NOTHROW;
 void acc_wait (int) __GOACC_NOTHROW;
@@ -108,12 +126,18 @@
 int acc_is_present (void *, size_t) __GOACC_NOTHROW;
 void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
 void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
+void acc_attach (void **) __GOACC_NOTHROW;
+void acc_attach_async (void **, int) __GOACC_NOTHROW;
+void acc_detach (void **) __GOACC_NOTHROW;
+void acc_detach_async (void **, int) __GOACC_NOTHROW;
 
 /* Finalize versions of copyout/delete functions, specified in OpenACC 2.5.  */
 void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW;
 void acc_copyout_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW;
 void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_detach_finalize (void **) __GOACC_NOTHROW;
+void acc_detach_finalize_async (void **, int) __GOACC_NOTHROW;
 
 /* Async functions, specified in OpenACC 2.5.  */
 void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW;
diff --git a/libgomp/openacc_lib.h b/libgomp/openacc_lib.h
index fbd8f5e..3284b96 100644
--- a/libgomp/openacc_lib.h
+++ b/libgomp/openacc_lib.h
@@ -42,6 +42,8 @@
 !     removed.
       integer (acc_device_kind), parameter :: acc_device_not_host = 4
       integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+      integer (acc_device_kind), parameter :: acc_device_hsa = 7
+      integer (acc_device_kind), parameter :: acc_device_gcn = 8
 
       integer, parameter :: acc_handle_kind = 4
 
@@ -49,7 +51,7 @@
       integer (acc_handle_kind), parameter :: acc_async_noval = -1
       integer (acc_handle_kind), parameter :: acc_async_sync = -2
 
-      integer, parameter :: openacc_version = 201306
+      integer, parameter :: openacc_version = 201711
 
       interface acc_get_num_devices
         function acc_get_num_devices_h (d)
diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am
index 168ef59..45ed043 100644
--- a/libgomp/plugin/Makefrag.am
+++ b/libgomp/plugin/Makefrag.am
@@ -52,3 +52,17 @@
 libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
 libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
 endif
+
+if PLUGIN_GCN
+# AMD GCN plugin
+libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la
+libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_GCN_CPPFLAGS) \
+	-D_GNU_SOURCE
+libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \
+	$(lt_host_flags)
+libgomp_plugin_gcn_la_LDFLAGS += $(PLUGIN_GCN_LDFLAGS)
+libgomp_plugin_gcn_la_LIBADD = libgomp.la $(PLUGIN_GCN_LIBS)
+libgomp_plugin_gcn_la_LIBTOOLFLAGS = --tag=disable-static
+endif
diff --git a/libgomp/plugin/configfrag.ac b/libgomp/plugin/configfrag.ac
index 9718ac7..1ea67c9 100644
--- a/libgomp/plugin/configfrag.ac
+++ b/libgomp/plugin/configfrag.ac
@@ -137,6 +137,15 @@
 AC_SUBST(PLUGIN_HSA_LDFLAGS)
 AC_SUBST(PLUGIN_HSA_LIBS)
 
+PLUGIN_GCN=0
+PLUGIN_GCN_CPPFLAGS=
+PLUGIN_GCN_LDFLAGS=
+PLUGIN_GCN_LIBS=
+AC_SUBST(PLUGIN_GCN)
+AC_SUBST(PLUGIN_GCN_CPPFLAGS)
+AC_SUBST(PLUGIN_GCN_LDFLAGS)
+AC_SUBST(PLUGIN_GCN_LIBS)
+
 # Parse '--enable-offload-targets', figure out the corresponding libgomp
 # plugins, and configure to find the corresponding offload compilers.
 # 'offload_plugins' and 'offload_targets' will be populated in the same order.
@@ -237,6 +246,30 @@
             ;;
         esac
         ;;
+
+      amdgcn*)
+	case "${target}" in
+	  x86_64-*-*)
+	    case " ${CC} ${CFLAGS} " in
+	      *" -m32 "*)
+		PLUGIN_GCN=0
+		;;
+	      *)
+		tgt_name=gcn
+		tgt_plugin=gcn
+		PLUGIN_GCN=$tgt
+		PLUGIN_GCN_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
+		PLUGIN_GCN_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
+		PLUGIN_GCN_LIBS="-ldl"
+		PLUGIN_GCN=1
+		;;
+	      esac
+	    ;;
+	  *-*-*)
+	    PLUGIN_GCN=0
+	     ;;
+	esac
+	;;
       *)
 	AC_MSG_ERROR([unknown offload target specified])
 	;;
@@ -249,8 +282,8 @@
       offload_plugins=$tgt_plugin
       offload_targets=$tgt
     else
-      offload_plugins=$offload_plugins,$tgt_plugin
-      offload_targets=$offload_targets,$tgt
+      offload_plugins=$offload_plugins:$tgt_plugin
+      offload_targets=$offload_targets:$tgt
     fi
     # Configure additional search paths.
     if test "$tgt_plugin" = hsa; then
@@ -265,8 +298,8 @@
     fi
   done
 fi
-AC_DEFINE_UNQUOTED(OFFLOAD_PLUGINS, "$offload_plugins",
-  [Define to offload plugins, separated by commas.])
+AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets",
+  [Define to offload targets, separated by colons.])
 AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
 AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
   [Define to 1 if the NVIDIA plugin is built, 0 if not.])
@@ -275,6 +308,9 @@
 AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
 AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
   [Define to 1 if the HSA plugin is built, 0 if not.])
+AM_CONDITIONAL([PLUGIN_GCN], [test $PLUGIN_GCN = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_GCN], [$PLUGIN_GCN],
+  [Define to 1 if the GCN plugin is built, 0 if not.])
 
 if test "$HSA_RUNTIME_LIB" != ""; then
   HSA_RUNTIME_LIB="$HSA_RUNTIME_LIB/"
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index b2a4c21..cd91b39 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -8,6 +8,9 @@
 CUDA_ONE_CALL (cuDeviceGet)
 CUDA_ONE_CALL (cuDeviceGetAttribute)
 CUDA_ONE_CALL (cuDeviceGetCount)
+CUDA_ONE_CALL (cuDeviceGetName)
+CUDA_ONE_CALL (cuDeviceTotalMem)
+CUDA_ONE_CALL (cuDriverGetVersion)
 CUDA_ONE_CALL (cuEventCreate)
 CUDA_ONE_CALL (cuEventDestroy)
 CUDA_ONE_CALL (cuEventElapsedTime)
@@ -35,6 +38,7 @@
 CUDA_ONE_CALL (cuMemFree)
 CUDA_ONE_CALL (cuMemFreeHost)
 CUDA_ONE_CALL (cuMemGetAddressRange)
+CUDA_ONE_CALL (cuMemGetInfo)
 CUDA_ONE_CALL (cuMemHostGetDevicePointer)
 CUDA_ONE_CALL (cuModuleGetFunction)
 CUDA_ONE_CALL (cuModuleGetGlobal)
@@ -42,6 +46,7 @@
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
 CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
+CUDA_ONE_CALL (cuStreamAddCallback)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 7c0afaf..e65f972 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -54,7 +54,11 @@
   CUDA_ERROR_INVALID_CONTEXT = 201,
   CUDA_ERROR_NOT_FOUND = 500,
   CUDA_ERROR_NOT_READY = 600,
-  CUDA_ERROR_LAUNCH_FAILED = 719
+  CUDA_ERROR_LAUNCH_FAILED = 719,
+  CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
+  CUDA_ERROR_NOT_PERMITTED = 800,
+  CUDA_ERROR_NOT_SUPPORTED = 801,
+  CUDA_ERROR_UNKNOWN = 999
 } CUresult;
 
 typedef enum {
@@ -173,6 +177,8 @@
 CUresult cuModuleUnload (CUmodule);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
 					  CUoccupancyB2DSize, size_t, int);
+typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
new file mode 100644
index 0000000..b5995af
--- /dev/null
+++ b/libgomp/plugin/plugin-gcn.c
@@ -0,0 +1,3726 @@
+/* Plugin for AMD GCN execution.
+
+   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <hsa.h>
+#include <dlfcn.h>
+#include <signal.h>
+#include "libgomp-plugin.h"
+#include "gomp-constants.h"
+#include <elf.h>
+#include "oacc-plugin.h"
+#include "oacc-int.h"
+#include <assert.h>
+
+/* Additional definitions not in HSA 1.1.
+   FIXME: this needs to be updated in hsa.h for upstream, but the only source
+          right now is the ROCr source which may cause license issues.  */
+#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002
+
+/* These probably won't be in elf.h for a while.  */
+#define R_AMDGPU_NONE		0
+#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
+#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
+#define R_AMDGPU_ABS64		3	/* S + A  */
+#define R_AMDGPU_REL32		4	/* S + A - P  */
+#define R_AMDGPU_REL64		5	/* S + A - P  */
+#define R_AMDGPU_ABS32		6	/* S + A  */
+#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
+#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
+#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
+#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
+#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
+#define reserved		12
+#define R_AMDGPU_RELATIVE64	13	/* B + A  */
+
+/* Secure getenv() which returns NULL if running as SUID/SGID.  */
+#ifndef HAVE_SECURE_GETENV
+#ifdef HAVE___SECURE_GETENV
+#define secure_getenv __secure_getenv
+#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
+  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
+
+#include <unistd.h>
+
+/* Implementation of secure_getenv() for targets where it is not provided but
+   we have at least means to test real and effective IDs. */
+
+static char *
+secure_getenv (const char *name)
+{
+  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
+    return getenv (name);
+  else
+    return NULL;
+}
+
+#else
+#define secure_getenv getenv
+#endif
+#endif
+
+struct gcn_thread
+{
+  int async;
+};
+
+static inline struct gcn_thread *
+gcn_thread (void)
+{
+  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
+/* As an HSA runtime is dlopened, following structure defines function
+   pointers utilized by the HSA plug-in.  */
+
+struct hsa_runtime_fn_info
+{
+  /* HSA runtime.  */
+  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
+					const char **status_string);
+  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
+					  void *value);
+  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
+					 hsa_agent_info_t attribute,
+					 void *value);
+  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
+				      hsa_isa_info_t attribute,
+				      uint32_t index,
+				      void *value);
+  hsa_status_t (*hsa_init_fn) (void);
+  hsa_status_t (*hsa_iterate_agents_fn)
+    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
+  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
+					  hsa_region_info_t attribute,
+					  void *value);
+  hsa_status_t (*hsa_queue_create_fn)
+    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
+     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
+     void *data, uint32_t private_segment_size,
+     uint32_t group_segment_size, hsa_queue_t **queue);
+  hsa_status_t (*hsa_agent_iterate_regions_fn)
+    (hsa_agent_t agent,
+     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
+  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
+  hsa_status_t (*hsa_executable_create_fn)
+    (hsa_profile_t profile, hsa_executable_state_t executable_state,
+     const char *options, hsa_executable_t *executable);
+  hsa_status_t (*hsa_executable_global_variable_define_fn)
+    (hsa_executable_t executable, const char *variable_name, void *address);
+  hsa_status_t (*hsa_executable_load_code_object_fn)
+    (hsa_executable_t executable, hsa_agent_t agent,
+     hsa_code_object_t code_object, const char *options);
+  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
+					   const char *options);
+  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
+					uint32_t num_consumers,
+					const hsa_agent_t *consumers,
+					hsa_signal_t *signal);
+  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
+					  void **ptr);
+  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
+					      hsa_access_permission_t access);
+  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
+  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
+  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
+  hsa_status_t (*hsa_executable_get_symbol_fn)
+    (hsa_executable_t executable, const char *module_name,
+     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
+     hsa_executable_symbol_t *symbol);
+  hsa_status_t (*hsa_executable_symbol_get_info_fn)
+    (hsa_executable_symbol_t executable_symbol,
+     hsa_executable_symbol_info_t attribute, void *value);
+  hsa_status_t (*hsa_executable_iterate_symbols_fn)
+    (hsa_executable_t executable,
+     hsa_status_t (*callback)(hsa_executable_t executable,
+			      hsa_executable_symbol_t symbol, void *data),
+     void *data);
+  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
+						    uint64_t value);
+  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
+  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
+				       hsa_signal_value_t value);
+  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
+				       hsa_signal_value_t value);
+  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
+    (hsa_signal_t signal, hsa_signal_condition_t condition,
+     hsa_signal_value_t compare_value, uint64_t timeout_hint,
+     hsa_wait_state_t wait_state_hint);
+  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
+  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
+
+  hsa_status_t (*hsa_code_object_deserialize_fn)
+    (void *serialized_code_object, size_t serialized_code_object_size,
+     const char *options, hsa_code_object_t *code_object);
+};
+
+/* HSA runtime functions that are initialized in init_hsa_context.  */
+
+static struct hsa_runtime_fn_info hsa_fns;
+
+/* Keep the following GOMP prefixed structures in sync with respective parts of
+   the compiler.  */
+
+/* Structure describing the run-time and grid properties of an HSA kernel
+   lauch.  */
+
+struct GOMP_kernel_launch_attributes
+{
+  /* Number of dimensions the workload has.  Maximum number is 3.  */
+  uint32_t ndim;
+  /* Size of the grid in the three respective dimensions.  */
+  uint32_t gdims[3];
+  /* Size of work-groups in the respective dimensions.  */
+  uint32_t wdims[3];
+};
+
+/* Collection of information needed for a dispatch of a kernel from a
+   kernel.  */
+
+struct GOMP_hsa_kernel_dispatch
+{
+  /* Pointer to a command queue associated with a kernel dispatch agent.  */
+  void *queue;
+  /* Pointer to a memory space used for kernel arguments passing.  */
+  void *kernarg_address;
+  /* Kernel object.  */
+  uint64_t object;
+  /* Synchronization signal used for dispatch synchronization.  */
+  uint64_t signal;
+  /* Private segment size.  */
+  uint32_t private_segment_size;
+  /* Group segment size.  */
+  uint32_t group_segment_size;
+};
+
+/* Structure of the default kernargs segment, supporting gomp_print_*.
+   This will only be used if the requested space is less than 9 bytes.  */
+
+struct kernargs {
+  /* Leave space for the real kernel arguments.
+     OpenACC and OpenMP only use one pointer.  */
+  int64_t dummy1;
+  int64_t dummy2;
+
+  /* A pointer to struct output, below, for console output data.  */
+  int64_t out_ptr;
+
+  /* A pointer to struct heap, below.  */
+  int64_t heap_ptr;
+
+  /* Output data.  */
+  struct output {
+    int return_value;
+    unsigned int next_output;
+    struct printf_data {
+      int written;
+      char msg[128];
+      int type;
+      union {
+	int64_t ivalue;
+	double dvalue;
+	char text[128];
+      };
+    } queue[1024];
+    unsigned int consumed;
+  } output_data;
+};
+
+/* Heap space, allocated target-side, provided for use of newlib malloc.
+   Each module should have it's own heap allocated.
+   Beware that heap usage increases with OpenMP teams.  */
+static size_t gcn_kernel_heap_size = 100*1024*1024;  /* 100MB.  */
+struct heap {
+  int64_t size;
+  char data[0];
+};
+
+/* GCN specific definition of asynchronous queues.  */
+
+#define ASYNC_QUEUE_SIZE 64
+#define DRAIN_QUEUE_SYNCHRONOUS_P false
+#define DEBUG_QUEUES 0
+#define DEBUG_THREAD_SLEEP 0
+#define DEBUG_THREAD_SIGNAL 0
+
+struct kernel_launch
+{
+  struct kernel_info *kernel;
+  void *vars;
+  struct GOMP_kernel_launch_attributes kla;
+};
+
+struct callback
+{
+  void (*fn)(void *);
+  void *data;
+};
+
+struct placeholder
+{
+  int executed;
+  pthread_cond_t cond;
+  pthread_mutex_t mutex;
+};
+
+struct asyncwait_info
+{
+  struct placeholder *placeholderp;
+};
+
+enum entry_type
+{
+  KERNEL_LAUNCH,
+  CALLBACK,
+  ASYNC_WAIT,
+  ASYNC_PLACEHOLDER
+};
+
+struct queue_entry
+{
+  enum entry_type type;
+  union {
+    struct kernel_launch launch;
+    struct callback callback;
+    struct asyncwait_info asyncwait;
+    struct placeholder placeholder;
+  } u;
+};
+
+struct goacc_asyncqueue
+{
+  struct agent_info *agent;
+  hsa_queue_t *hsa_queue;
+
+  pthread_t thread_drain_queue;
+  pthread_mutex_t mutex;
+  pthread_cond_t queue_cond_in;
+  pthread_cond_t queue_cond_out;
+  struct queue_entry queue[ASYNC_QUEUE_SIZE];
+  int queue_first;
+  int queue_n;
+  int drain_queue_stop;
+
+  int id;
+  struct goacc_asyncqueue *prev;
+  struct goacc_asyncqueue *next;
+};
+
+/* Part of the libgomp plugin interface.  Return the name of the accelerator,
+   which is "gcn".  */
+
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+  return "gcn";
+}
+
+/* Part of the libgomp plugin interface.  Return the specific capabilities the
+   HSA accelerator have.  */
+
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
+  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
+	    | GOMP_OFFLOAD_CAP_OPENACC_200;
+}
+
+/* Part of the libgomp plugin interface.  Identify as HSA accelerator.  */
+
+int
+GOMP_OFFLOAD_get_type (void)
+{
+  return OFFLOAD_TARGET_TYPE_GCN;
+}
+
+/* Return the libgomp version number we're compatible with.  There is
+   no requirement for cross-version compatibility.  */
+
+unsigned
+GOMP_OFFLOAD_version (void)
+{
+  return GOMP_VERSION;
+}
+
+/* Flag to decide whether print to stderr information about what is going on.
+   Set in init_debug depending on environment variables.  */
+
+static bool debug;
+
+/* Flag to decide if the runtime should suppress a possible fallback to host
+   execution.  */
+
+static bool suppress_host_fallback;
+
+/* Flag to locate HSA runtime shared library that is dlopened
+   by this plug-in.  */
+
+static const char *hsa_runtime_lib;
+
+/* Flag to decide if the runtime should support also CPU devices (can be
+   a simulator).  */
+
+static bool support_cpu_devices;
+
+/* Runtime dimension overrides.  Zero indicates default.  */
+
+static int override_x_dim = 0;
+static int override_z_dim = 0;
+
+/* Initialize debug and suppress_host_fallback according to the environment.  */
+
+static void
+init_environment_variables (void)
+{
+  if (secure_getenv ("GCN_DEBUG"))
+    debug = true;
+  else
+    debug = false;
+
+  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
+    suppress_host_fallback = true;
+  else
+    suppress_host_fallback = false;
+
+  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
+  if (hsa_runtime_lib == NULL)
+    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
+
+  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
+
+  const char *x = secure_getenv ("GCN_NUM_TEAMS");
+  if (!x)
+    x = secure_getenv ("GCN_NUM_GANGS");
+  if (x)
+    override_x_dim = atoi (x);
+
+  const char *z = secure_getenv ("GCN_NUM_THREADS");
+  if (!z)
+    z = secure_getenv ("GCN_NUM_WORKERS");
+  if (z)
+    override_z_dim = atoi (z);
+
+  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
+  if (heap)
+    {
+      size_t tmp = atol (heap);
+      if (tmp)
+	gcn_kernel_heap_size = tmp;
+    }
+}
+
+/* Print a message to stderr if HSA_DEBUG value is set to true.  */
+
+#define HSA_DPRINT(...) \
+  do \
+  { \
+    if (debug) \
+      { \
+	fprintf (stderr, __VA_ARGS__); \
+      } \
+  } \
+  while (false);
+
+/* Flush stderr if GCN_DEBUG value is set to true.  */
+
+#define HSA_FLUSH()				\
+  do {						\
+    if (debug)					\
+      fflush (stderr);				\
+  } while (0)
+
+/* Print a logging message with PREFIX to stderr if HSA_DEBUG value
+   is set to true.  */
+
+#define HSA_LOG(prefix, ...)			\
+  do						\
+    {						\
+      HSA_DPRINT (prefix);			\
+      HSA_DPRINT (__VA_ARGS__);			\
+      HSA_FLUSH ();				\
+    } while (false)
+
+/* Print a debugging message to stderr.  */
+
+#define HSA_DEBUG(...) HSA_LOG ("GCN debug: ", __VA_ARGS__)
+
+/* Print a warning message to stderr.  */
+
+#define HSA_WARNING(...) HSA_LOG ("GCN warning: ", __VA_ARGS__)
+
+/* Print HSA warning STR with an HSA STATUS code.  */
+
+static void
+hsa_warn (const char *str, hsa_status_t status)
+{
+  if (!debug)
+    return;
+
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+
+  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
+	   hsa_error_msg);
+}
+
+/* Report a fatal error STR together with the HSA error corresponding to STATUS
+   and terminate execution of the current process.  */
+
+static void
+hsa_fatal (const char *str, hsa_status_t status)
+{
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
+		     hsa_error_msg);
+}
+
+/* Like hsa_fatal, except only report error message, and return FALSE
+   for propagating error processing to outside of plugin.  */
+
+static bool
+hsa_error (const char *str, hsa_status_t status)
+{
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
+		     hsa_error_msg);
+  return false;
+}
+
+struct hsa_kernel_description
+{
+  const char *name;
+  int oacc_dims[3];  /* Only present for GCN kernels.  */
+};
+
+struct global_var_info
+{
+  const char *name;
+  void *address;
+};
+
+/* Data passed by the static initializer of a compilation unit containing GCN
+   object code to GOMP_offload_register.  */
+
+struct gcn_image_desc
+{
+  union {
+    struct gcn_image {
+      char magic[4];  /* Will be "GCN" for GCN code objects.  */
+      size_t size;
+      void *image;
+    } *gcn_image;
+  };
+  const unsigned kernel_count;
+  struct hsa_kernel_description *kernel_infos;
+  const unsigned global_variable_count;
+  struct global_var_info *global_variables;
+};
+
+struct agent_info;
+
+/* Information required to identify, finalize and run any given kernel.  */
+
+struct kernel_info
+{
+  /* Name of the kernel, required to locate it within the GCN object-code
+     module.  */
+  const char *name;
+  /* The specific agent the kernel has been or will be finalized for and run
+     on.  */
+  struct agent_info *agent;
+  /* The specific module where the kernel takes place.  */
+  struct module_info *module;
+  /* Mutex enforcing that at most once thread ever initializes a kernel for
+     use.  A thread should have locked agent->module_rwlock for reading before
+     acquiring it.  */
+  pthread_mutex_t init_mutex;
+  /* Flag indicating whether the kernel has been initialized and all fields
+     below it contain valid data.  */
+  bool initialized;
+  /* Flag indicating that the kernel has a problem that blocks an execution.  */
+  bool initialization_failed;
+  /* The object to be put into the dispatch queue.  */
+  uint64_t object;
+  /* Required size of kernel arguments.  */
+  uint32_t kernarg_segment_size;
+  /* Required size of group segment.  */
+  uint32_t group_segment_size;
+  /* Required size of private segment.  */
+  uint32_t private_segment_size;
+};
+
+/* Information about a particular GCN module, its image and kernels.  */
+
+struct module_info
+{
+  /* The description with which the program has registered the image.  */
+  struct gcn_image_desc *image_desc;
+  /* GCN heap allocation.  */
+  struct heap *heap;
+  /* Physical boundaries of the loaded module.  */
+  Elf64_Addr phys_address_start;
+  Elf64_Addr phys_address_end;
+
+  bool constructors_run_p;
+  struct kernel_info *init_array_func, *fini_array_func;
+
+  /* Number of kernels in this module.  */
+  int kernel_count;
+  /* An array of kernel_info structures describing each kernel in this
+     module.  */
+  struct kernel_info kernels[];
+};
+
+/* Description of an HSA GPU agent and the program associated with it.  */
+
+struct agent_info
+{
+  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
+  hsa_agent_t id;
+  /* The user-visible device number.  */
+  int device_id;
+  /* Whether the agent has been initialized.  The fields below are usable only
+     if it has been.  */
+  bool initialized;
+  /* Precomuted check for problem architectures.  */
+  bool gfx900_p;
+
+  /* Command queues of the agent.  */
+  hsa_queue_t *sync_queue;
+  struct goacc_asyncqueue *async_queues, *omp_async_queue;
+  pthread_mutex_t async_queues_mutex;
+
+  /* The HSA memory region from which to allocate kernel arguments.  */
+  hsa_region_t kernarg_region;
+
+  /* The HSA memory region from which to allocate device data.  */
+  hsa_region_t data_region;
+
+  /* Read-write lock that protects kernels which are running or about to be run
+     from interference with loading and unloading of images.  Needs to be
+     locked for reading while a kernel is being run, and for writing if the
+     list of modules is manipulated (and thus the HSA program invalidated).  */
+  pthread_rwlock_t module_rwlock;
+
+  /* The module associated with this kernel.  */
+  struct module_info *module;
+
+  /* Mutex enforcing that only one thread will finalize the HSA program.  A
+     thread should have locked agent->module_rwlock for reading before
+     acquiring it.  */
+  pthread_mutex_t prog_mutex;
+  /* Flag whether the HSA program that consists of all the modules has been
+     finalized.  */
+  bool prog_finalized;
+  /* HSA executable - the finalized program that is used to locate kernels.  */
+  hsa_executable_t executable;
+};
+
+static bool create_and_finalize_hsa_program (struct agent_info *);
+
+/* Information about the whole HSA environment and all of its agents.  */
+
+struct hsa_context_info
+{
+  /* Whether the structure has been initialized.  */
+  bool initialized;
+  /* Number of usable GPU HSA agents in the system.  */
+  int agent_count;
+  /* Array of agent_info structures describing the individual HSA agents.  */
+  struct agent_info *agents;
+};
+
+/* Information about the whole HSA environment and all of its agents.  */
+
+static struct hsa_context_info hsa_context;
+
+static bool
+init_hsa_runtime_functions (void)
+{
+#define DLSYM_FN(function) \
+  hsa_fns.function##_fn = dlsym (handle, #function); \
+  if (hsa_fns.function##_fn == NULL) \
+    return false;
+  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
+  if (handle == NULL)
+    return false;
+
+  DLSYM_FN (hsa_status_string)
+  DLSYM_FN (hsa_system_get_info)
+  DLSYM_FN (hsa_agent_get_info)
+  DLSYM_FN (hsa_init)
+  DLSYM_FN (hsa_iterate_agents)
+  DLSYM_FN (hsa_region_get_info)
+  DLSYM_FN (hsa_queue_create)
+  DLSYM_FN (hsa_agent_iterate_regions)
+  DLSYM_FN (hsa_executable_destroy)
+  DLSYM_FN (hsa_executable_create)
+  DLSYM_FN (hsa_executable_global_variable_define)
+  DLSYM_FN (hsa_executable_load_code_object)
+  DLSYM_FN (hsa_executable_freeze)
+  DLSYM_FN (hsa_signal_create)
+  DLSYM_FN (hsa_memory_allocate)
+  DLSYM_FN (hsa_memory_assign_agent)
+  DLSYM_FN (hsa_memory_copy)
+  DLSYM_FN (hsa_memory_free)
+  DLSYM_FN (hsa_signal_destroy)
+  DLSYM_FN (hsa_executable_get_symbol)
+  DLSYM_FN (hsa_executable_symbol_get_info)
+  DLSYM_FN (hsa_executable_iterate_symbols)
+  DLSYM_FN (hsa_queue_add_write_index_release)
+  DLSYM_FN (hsa_queue_load_read_index_acquire)
+  DLSYM_FN (hsa_signal_wait_acquire)
+  DLSYM_FN (hsa_signal_store_relaxed)
+  DLSYM_FN (hsa_signal_store_release)
+  DLSYM_FN (hsa_signal_load_acquire)
+  DLSYM_FN (hsa_queue_destroy)
+  DLSYM_FN (hsa_code_object_deserialize)
+  return true;
+#undef DLSYM_FN
+}
+
+static void
+dump_hsa_system_info (void)
+{
+  hsa_status_t status;
+
+  hsa_endianness_t endianness;
+  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
+					   &endianness);
+  if (status == HSA_STATUS_SUCCESS)
+    switch (endianness)
+      {
+      case HSA_ENDIANNESS_LITTLE:
+	HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
+	break;
+      case HSA_ENDIANNESS_BIG:
+	HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
+	break;
+      default:
+	HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
+      }
+  else
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
+
+  uint8_t extensions[128];
+  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
+					   &extensions);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
+	HSA_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
+    }
+  else
+    HSA_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+}
+
+static void
+dump_machine_model (hsa_machine_model_t machine_model, const char *s)
+{
+  switch (machine_model)
+    {
+    case HSA_MACHINE_MODEL_SMALL:
+      HSA_DEBUG ("%s: SMALL\n", s);
+      break;
+    case HSA_MACHINE_MODEL_LARGE:
+      HSA_DEBUG ("%s: LARGE\n", s);
+      break;
+    default:
+      HSA_DEBUG ("%s: UNKNOWN\n", s);
+      break;
+    }
+}
+
+static void
+dump_profile (hsa_profile_t profile, const char *s)
+{
+  switch (profile)
+    {
+    case HSA_PROFILE_FULL:
+      HSA_DEBUG ("%s: FULL\n", s);
+      break;
+    case HSA_PROFILE_BASE:
+      HSA_DEBUG ("%s: BASE\n", s);
+      break;
+    default:
+      HSA_DEBUG ("%s: UNKNOWN\n", s);
+      break;
+    }
+}
+
+static void dump_hsa_regions (hsa_agent_t agent);
+
+static hsa_status_t
+dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
+					  &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_NAME: FAILED\n");
+
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
+					  &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
+
+  hsa_machine_model_t machine_model;
+  status
+    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
+				     &machine_model);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
+
+  hsa_profile_t profile;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
+					  &profile);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_PROFILE: FAILED\n");
+
+  hsa_device_type_t device_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+					  &device_type);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      switch (device_type)
+	{
+	case HSA_DEVICE_TYPE_CPU:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
+	  break;
+	case HSA_DEVICE_TYPE_GPU:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
+	  break;
+	case HSA_DEVICE_TYPE_DSP:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
+	  break;
+	default:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
+	  break;
+	}
+    }
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: FAILED\n");
+
+  uint32_t cu_count;
+  status = hsa_fns.hsa_agent_get_info_fn
+    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
+  else
+    HSA_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
+
+  uint32_t size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
+					  &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
+
+  uint32_t max_dim;
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+					  &max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
+
+  uint32_t max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
+					  &max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
+
+  uint32_t grid_max_dim;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
+					  &grid_max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
+
+  uint32_t grid_max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
+					  &grid_max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
+
+  dump_hsa_regions (agent);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Return true if the agent is a GPU and acceptable of concurrent submissions
+   from different threads.  */
+
+static bool
+suitable_hsa_agent_p (hsa_agent_t agent)
+{
+  hsa_device_type_t device_type;
+  hsa_status_t status
+    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+				     &device_type);
+  if (status != HSA_STATUS_SUCCESS)
+    return false;
+
+  switch (device_type)
+    {
+    case HSA_DEVICE_TYPE_GPU:
+      break;
+    case HSA_DEVICE_TYPE_CPU:
+      if (!support_cpu_devices)
+	return false;
+      break;
+    default:
+      return false;
+    }
+
+  uint32_t features = 0;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
+					  &features);
+  if (status != HSA_STATUS_SUCCESS
+      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
+    return false;
+  hsa_queue_type_t queue_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
+					  &queue_type);
+  if (status != HSA_STATUS_SUCCESS
+      || (queue_type != HSA_QUEUE_TYPE_MULTI))
+    return false;
+
+  return true;
+}
+
+/* Callback of hsa_iterate_agents, if AGENT is a GPU device, increment
+   agent_count in hsa_context.  */
+
+static hsa_status_t
+count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
+{
+  if (suitable_hsa_agent_p (agent))
+    hsa_context.agent_count++;
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Callback of hsa_iterate_agents, if AGENT is a GPU device, assign the agent
+   id to the describing structure in the hsa context.  The index of the
+   structure is pointed to by DATA, increment it afterwards.  */
+
+static hsa_status_t
+assign_agent_ids (hsa_agent_t agent, void *data)
+{
+  if (suitable_hsa_agent_p (agent))
+    {
+      int *agent_index = (int *) data;
+      hsa_context.agents[*agent_index].id = agent;
+      ++*agent_index;
+    }
+  return HSA_STATUS_SUCCESS;
+}
+
+static void
+finalize_async_thread (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+  if (aq->drain_queue_stop == 2)
+    {
+      pthread_mutex_unlock (&aq->mutex);
+      return;
+    }
+
+  aq->drain_queue_stop = 1;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("Signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  while (aq->drain_queue_stop != 2)
+    {
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
+		   " to sleep\n", aq->agent->device_id, aq->id);
+      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
+		   aq->agent->device_id, aq->id);
+    }
+
+  HSA_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
+	     aq->id);
+  pthread_mutex_unlock (&aq->mutex);
+
+  int err = pthread_join (aq->thread_drain_queue, NULL);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
+		       aq->agent->device_id, aq->id, strerror (err));
+  HSA_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
+}
+
+/* Initialize hsa_context if it has not already been done.
+   Return TRUE on success.  */
+
+static bool
+init_hsa_context (void)
+{
+  hsa_status_t status;
+  int agent_index = 0;
+
+  if (hsa_context.initialized)
+    return true;
+  init_environment_variables ();
+  if (!init_hsa_runtime_functions ())
+    {
+      HSA_DEBUG ("Run-time could not be dynamically opened\n");
+      if (suppress_host_fallback)
+	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+      return false;
+    }
+  status = hsa_fns.hsa_init_fn ();
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Run-time could not be initialized", status);
+  HSA_DEBUG ("HSA run-time initialized for GCN\n");
+
+  if (debug)
+    dump_hsa_system_info ();
+
+  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("GCN GPU devices could not be enumerated", status);
+  HSA_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
+
+  hsa_context.agents
+    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
+				  * sizeof (struct agent_info));
+  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
+  if (agent_index != hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
+      return false;
+    }
+
+  if (debug)
+    {
+      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
+      if (status != HSA_STATUS_SUCCESS)
+	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
+    }
+
+  hsa_context.initialized = true;
+  return true;
+}
+
+/* Verify that hsa_context has already been initialized and return the
+   agent_info structure describing device number N.  Return NULL on error.  */
+
+static struct agent_info *
+get_agent_info (int n)
+{
+  if (!hsa_context.initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
+      return NULL;
+    }
+  if (n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
+      return NULL;
+    }
+  if (!hsa_context.agents[n].initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
+      return NULL;
+    }
+  return &hsa_context.agents[n];
+}
+
+/* Callback of dispatch queues to report errors.  */
+
+static void
+queue_callback (hsa_status_t status,
+		hsa_queue_t *queue __attribute__ ((unused)),
+		void *data __attribute__ ((unused)))
+{
+  hsa_fatal ("Asynchronous queue error", status);
+}
+
+static hsa_status_t
+dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  hsa_region_segment_t segment;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
+					   &segment);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      if (segment == HSA_REGION_SEGMENT_GLOBAL)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
+      else if (segment == HSA_REGION_SEGMENT_READONLY)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
+      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
+      else if (segment == HSA_REGION_SEGMENT_GROUP)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
+      else
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
+    }
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: FAILED\n");
+
+  if (segment == HSA_REGION_SEGMENT_GLOBAL)
+    {
+      uint32_t flags;
+      status
+	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+					  &flags);
+      if (status == HSA_STATUS_SUCCESS)
+	{
+	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
+	    HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
+	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
+	    HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
+	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
+	    HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
+	}
+      else
+	HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
+    }
+
+  size_t size;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_SIZE: FAILED\n");
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
+				      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
+
+  bool alloc_allowed;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
+				      &alloc_allowed);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
+
+  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
+    return HSA_STATUS_SUCCESS;
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
+				      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
+
+  size_t align;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
+				      &align);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+static void
+dump_hsa_regions (hsa_agent_t agent)
+{
+  hsa_status_t status;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
+						 dump_hsa_region,
+						 NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_error ("Dumping hsa regions failed", status);
+}
+
+/* Return malloc'd string with name of SYMBOL.  */
+
+static char *
+get_executable_symbol_name (hsa_executable_symbol_t symbol)
+{
+  hsa_status_t status;
+  char *res;
+  uint32_t len;
+  const hsa_executable_symbol_info_t info_name_length
+    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
+						      &len);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not get length of symbol name", status);
+      return NULL;
+    }
+
+  res = GOMP_PLUGIN_malloc (len + 1);
+
+  const hsa_executable_symbol_info_t info_name
+    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
+
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not get symbol name", status);
+      free (res);
+      return NULL;
+    }
+
+  res[len] = '\0';
+
+  return res;
+}
+
+/* Helper function for dump_executable_symbols.  */
+
+static hsa_status_t
+dump_executable_symbol (hsa_executable_t executable,
+			hsa_executable_symbol_t symbol,
+			void *data __attribute__((unused)))
+{
+  char *name = get_executable_symbol_name (symbol);
+
+  if (name)
+    {
+      HSA_DEBUG ("executable symbol: %s\n", name);
+      free (name);
+    }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Dump all global symbol in executable.  */
+
+static void
+dump_executable_symbols (hsa_executable_t executable)
+{
+  hsa_status_t status;
+  status
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+						 dump_executable_symbol,
+						 NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not dump HSA executable symbols", status);
+}
+
+/* Helper function for find_executable_symbol.  */
+
+static hsa_status_t
+find_executable_symbol_1 (hsa_executable_t executable,
+			  hsa_executable_symbol_t symbol,
+			  void *data)
+{
+  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
+  *res = symbol;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
+   found, return false.  */
+
+static bool
+find_executable_symbol (hsa_executable_t executable,
+			hsa_executable_symbol_t *symbol)
+{
+  hsa_status_t status;
+
+  status
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+						 find_executable_symbol_1,
+						 symbol);
+  if (status != HSA_STATUS_INFO_BREAK)
+    {
+      hsa_error ("Could not find executable symbol", status);
+      return false;
+    }
+
+  return true;
+}
+
+/* Callback of hsa_agent_iterate_regions.  Determine if a memory REGION can be
+   used for allocations of KIND and if so write it to the memory pointed to by
+   DATA and break the query.  */
+
+static hsa_status_t
+get_memory_region (hsa_region_t region, hsa_region_t *retval,
+		   hsa_region_global_flag_t kind)
+{
+  hsa_status_t status;
+  hsa_region_segment_t segment;
+
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
+					   &segment);
+  if (status != HSA_STATUS_SUCCESS)
+    return status;
+  if (segment != HSA_REGION_SEGMENT_GLOBAL)
+    return HSA_STATUS_SUCCESS;
+
+  uint32_t flags;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+					   &flags);
+  if (status != HSA_STATUS_SUCCESS)
+    return status;
+  if (flags & kind)
+    {
+      *retval = region;
+      return HSA_STATUS_INFO_BREAK;
+    }
+  return HSA_STATUS_SUCCESS;
+}
+
+static hsa_status_t
+get_kernarg_memory_region (hsa_region_t region, void *data)
+{
+  return get_memory_region (region, (hsa_region_t *)data,
+			    HSA_REGION_GLOBAL_FLAG_KERNARG);
+}
+
+static hsa_status_t
+get_data_memory_region (hsa_region_t region, void *data)
+{
+  return get_memory_region (region, (hsa_region_t *)data,
+			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
+}
+
+/* Part of the libgomp plugin interface.  Return the number of HSA devices on
+   the system.  */
+
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+  if (!init_hsa_context ())
+    return 0;
+  return hsa_context.agent_count;
+}
+
+union gomp_device_property_value
+GOMP_OFFLOAD_get_property (int device, int prop)
+{
+  struct agent_info *agent = get_agent_info (device);
+  hsa_region_t region = agent->data_region;
+
+  union gomp_device_property_value propval = { .val = 0 };
+
+  static char buf[64];
+  buf[0] = '\0';
+  size_t size;
+  hsa_status_t status;
+
+  switch (prop)
+    {
+    case GOMP_DEVICE_PROPERTY_FREE_MEMORY:
+      /* Not known: fall through.  */
+    case GOMP_DEVICE_PROPERTY_MEMORY:
+      status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE,
+					       &size);
+      propval.val = size;
+      break;
+      break;
+    case GOMP_DEVICE_PROPERTY_NAME:
+      status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
+					      buf);
+      if (status == HSA_STATUS_SUCCESS)
+	propval.ptr = buf;
+      break;
+    case GOMP_DEVICE_PROPERTY_VENDOR:
+      status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+					      HSA_AGENT_INFO_VENDOR_NAME,
+					      buf);
+      if (status == HSA_STATUS_SUCCESS)
+	propval.ptr = buf;
+      break;
+    case GOMP_DEVICE_PROPERTY_DRIVER:
+      propval.ptr = "HSA Runtime";
+      break;
+    }
+
+  return propval;
+}
+
+static void
+wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
+{
+  if (aq->queue_n == ASYNC_QUEUE_SIZE)
+    {
+      /* Queue is full.  Wait for it to not be full.  */
+      while (aq->queue_n == ASYNC_QUEUE_SIZE)
+	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+    }
+}
+
+static void
+queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
+		   void *vars, struct GOMP_kernel_launch_attributes *kla)
+{
+  assert (aq->agent == kernel->agent);
+
+  pthread_mutex_lock (&aq->mutex);
+
+  wait_for_queue_nonfull (aq);
+
+  int queue_last = ((aq->queue_first + aq->queue_n)
+		    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = KERNEL_LAUNCH;
+  aq->queue[queue_last].u.launch.kernel = kernel;
+  aq->queue[queue_last].u.launch.vars = vars;
+  aq->queue[queue_last].u.launch.kla = *kla;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+static void
+queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
+		     void *data)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  wait_for_queue_nonfull (aq);
+
+  int queue_last = ((aq->queue_first + aq->queue_n)
+		    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = CALLBACK;
+  aq->queue[queue_last].u.callback.fn = fn;
+  aq->queue[queue_last].u.callback.data = data;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Push an entry on AQ to wait for the event described by PLACEHOLDERP (on
+   another queue) to execute.  */
+
+static void
+queue_push_asyncwait (struct goacc_asyncqueue *aq,
+		      struct placeholder *placeholderp)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  wait_for_queue_nonfull (aq);
+
+  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = ASYNC_WAIT;
+  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+static struct placeholder *
+queue_push_placeholder (struct goacc_asyncqueue *aq)
+{
+  struct placeholder *placeholderp;
+
+  pthread_mutex_lock (&aq->mutex);
+
+  wait_for_queue_nonfull (aq);
+
+  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
+  placeholderp = &aq->queue[queue_last].u.placeholder;
+
+  if (pthread_mutex_init (&placeholderp->mutex, NULL))
+    {
+      pthread_mutex_unlock (&aq->mutex);
+      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
+    }
+
+  if (pthread_cond_init (&placeholderp->cond, NULL))
+    {
+      pthread_mutex_unlock (&aq->mutex);
+      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
+    }
+
+  placeholderp->executed = 0;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+
+  return placeholderp;
+}
+
+static void run_kernel (struct kernel_info *kernel, void *vars,
+			struct GOMP_kernel_launch_attributes *kla,
+			struct goacc_asyncqueue *aq, bool module_locked);
+
+static void wait_queue (struct goacc_asyncqueue *aq);
+
+static void
+execute_queue_entry (struct goacc_asyncqueue *aq, int index)
+{
+  struct queue_entry *entry = &aq->queue[index];
+
+  switch (entry->type)
+    {
+    case KERNEL_LAUNCH:
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
+		   aq->agent->device_id, aq->id, index);
+      run_kernel (entry->u.launch.kernel,
+		  entry->u.launch.vars,
+		  &entry->u.launch.kla, aq, false);
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
+		   aq->agent->device_id, aq->id, index);
+      break;
+
+    case CALLBACK:
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
+		   aq->agent->device_id, aq->id, index);
+      entry->u.callback.fn (entry->u.callback.data);
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
+		   aq->agent->device_id, aq->id, index);
+      break;
+
+    case ASYNC_WAIT:
+      {
+        struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
+
+	if (DEBUG_QUEUES)
+          HSA_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
+		     aq->agent->device_id, aq->id, index);
+
+	pthread_mutex_lock (&placeholderp->mutex);
+
+	while (!placeholderp->executed)
+          pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
+
+	pthread_mutex_unlock (&placeholderp->mutex);
+
+	if (pthread_cond_destroy (&placeholderp->cond))
+	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");
+
+	if (pthread_mutex_destroy (&placeholderp->mutex))
+	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
+
+	if (DEBUG_QUEUES)
+          HSA_DEBUG ("Async thread %d:%d: Executing async wait "
+		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
+      }
+      break;
+
+    case ASYNC_PLACEHOLDER:
+      pthread_mutex_lock (&entry->u.placeholder.mutex);
+      entry->u.placeholder.executed = 1;
+      pthread_cond_signal (&entry->u.placeholder.cond);
+      pthread_mutex_unlock (&entry->u.placeholder.mutex);
+      break;
+
+    default:
+      GOMP_PLUGIN_fatal ("Unknown queue element");
+    }
+}
+
+static void *
+drain_queue (void *thread_arg)
+{
+  struct goacc_asyncqueue *aq = thread_arg;
+
+  if (DRAIN_QUEUE_SYNCHRONOUS_P)
+    {
+      aq->drain_queue_stop = 2;
+      return NULL;
+    }
+
+  pthread_mutex_lock (&aq->mutex);
+
+  while (true)
+    {
+      if (aq->drain_queue_stop)
+	break;
+
+      if (aq->queue_n > 0)
+	{
+	  pthread_mutex_unlock (&aq->mutex);
+	  execute_queue_entry (aq, aq->queue_first);
+
+	  pthread_mutex_lock (&aq->mutex);
+	  aq->queue_first = ((aq->queue_first + 1)
+			     % ASYNC_QUEUE_SIZE);
+	  aq->queue_n--;
+
+	  if (DEBUG_THREAD_SIGNAL)
+	    HSA_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
+		       aq->agent->device_id, aq->id);
+	  pthread_cond_broadcast (&aq->queue_cond_out);
+	  pthread_mutex_unlock (&aq->mutex);
+
+	  if (DEBUG_QUEUES)
+	    HSA_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
+		       aq->id);
+	  pthread_mutex_lock (&aq->mutex);
+	}
+      else
+	{
+	  if (DEBUG_THREAD_SLEEP)
+	    HSA_DEBUG ("Async thread %d:%d: going to sleep\n",
+		       aq->agent->device_id, aq->id);
+	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
+	  if (DEBUG_THREAD_SLEEP)
+	    HSA_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
+		       aq->agent->device_id, aq->id);
+	}
+    }
+
+  aq->drain_queue_stop = 2;
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_broadcast (&aq->queue_cond_out);
+  pthread_mutex_unlock (&aq->mutex);
+
+  HSA_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
+  return NULL;
+}
+
+static void
+drain_queue_synchronous (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  while (aq->queue_n > 0)
+    {
+      execute_queue_entry (aq, aq->queue_first);
+
+      aq->queue_first = ((aq->queue_first + 1)
+			 % ASYNC_QUEUE_SIZE);
+      aq->queue_n--;
+    }
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Part of the libgomp plugin interface.  Initialize agent number N so that it
+   can be used for computation.  Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_init_device (int n)
+{
+  if (!init_hsa_context ())
+    return false;
+  if (n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
+      return false;
+    }
+  struct agent_info *agent = &hsa_context.agents[n];
+
+  if (agent->initialized)
+    return true;
+
+  agent->device_id = n;
+
+  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->prog_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
+      return false;
+    }
+  agent->async_queues = NULL;
+  agent->omp_async_queue = NULL;
+
+  uint32_t queue_size;
+  hsa_status_t status;
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
+					  &queue_size);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error requesting maximum queue size of the GCN agent",
+		      status);
+
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
+					  &buf);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error querying the name of the agent", status);
+  agent->gfx900_p = (strncmp (buf, "gfx900", 6) == 0);
+
+  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
+					HSA_QUEUE_TYPE_MULTI, queue_callback,
+					NULL, UINT32_MAX, UINT32_MAX,
+					&agent->sync_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error creating command queue", status);
+
+  agent->kernarg_region.handle = (uint64_t) -1;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
+						 get_kernarg_memory_region,
+						 &agent->kernarg_region);
+  if (agent->kernarg_region.handle == (uint64_t) -1)
+    {
+      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
+			 "arguments");
+      return false;
+    }
+  HSA_DEBUG ("Selected kernel arguments memory region:\n");
+  dump_hsa_region (agent->kernarg_region, NULL);
+
+  agent->data_region.handle = (uint64_t) -1;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
+						 get_data_memory_region,
+						 &agent->data_region);
+  if (agent->data_region.handle == (uint64_t) -1)
+    {
+      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
+			 "data");
+      return false;
+    }
+  HSA_DEBUG ("Selected device data memory region:\n");
+  dump_hsa_region (agent->data_region, NULL);
+
+  HSA_DEBUG ("GCN agent %d initialized\n", n);
+
+  agent->initialized = true;
+  return true;
+}
+
+/* Free the HSA program in agent and everything associated with it and set
+   agent->prog_finalized and the initialized flags of all kernels to false.
+   Return TRUE on success.  */
+
+static bool
+destroy_hsa_program (struct agent_info *agent)
+{
+  if (!agent->prog_finalized)
+    return true;
+
+  hsa_status_t status;
+
+  HSA_DEBUG ("Destroying the current GCN program.\n");
+
+  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Could not destroy GCN executable", status);
+
+  if (agent->module)
+    {
+      int i;
+      for (i = 0; i < agent->module->kernel_count; i++)
+	agent->module->kernels[i].initialized = false;
+
+      if (agent->module->heap)
+	{
+	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
+	  agent->module->heap = NULL;
+	}
+    }
+  agent->prog_finalized = false;
+  return true;
+}
+
+/* Initialize KERNEL from D and other parameters.  Return true on success. */
+
+static bool
+init_basic_kernel_info (struct kernel_info *kernel,
+			struct hsa_kernel_description *d,
+			struct agent_info *agent,
+			struct module_info *module)
+{
+  kernel->agent = agent;
+  kernel->module = module;
+  kernel->name = d->name;
+  if (pthread_mutex_init (&kernel->init_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
+      return false;
+    }
+  return true;
+}
+
+static void init_kernel (struct kernel_info *kernel);
+
+/* Part of the libgomp plugin interface.  Load GCN object-code module
+   described by struct gcn_image_desc in TARGET_DATA and return references to
+   kernel descriptors in TARGET_TABLE.  */
+
+int
+GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
+			 struct addr_pair **target_table)
+{
+  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_GCN)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return -1;
+    }
+
+  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
+  struct agent_info *agent;
+  struct addr_pair *pair;
+  struct module_info *module;
+  struct kernel_info *kernel;
+  int kernel_count = image_desc->kernel_count;
+  unsigned var_count = image_desc->global_variable_count;
+
+  agent = get_agent_info (ord);
+  if (!agent)
+    return -1;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return -1;
+    }
+  if (agent->prog_finalized
+      && !destroy_hsa_program (agent))
+    return -1;
+
+  HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
+  HSA_DEBUG ("Encountered %u global variables in an image\n", var_count);
+  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
+			     * sizeof (struct addr_pair));
+  *target_table = pair;
+  module = (struct module_info *)
+    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
+				+ kernel_count * sizeof (struct kernel_info));
+  module->image_desc = image_desc;
+  module->kernel_count = kernel_count;
+  module->heap = NULL;
+  module->constructors_run_p = false;
+
+  kernel = &module->kernels[0];
+
+  /* We have the magic code for a native GCN ELF kernel, not something
+     else.  */
+  if (strcmp (image_desc->gcn_image->magic, "GCN") != 0)
+    return -1;
+
+  /* Allocate memory for kernel dependencies.  */
+  for (unsigned i = 0; i < kernel_count; i++)
+    {
+      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
+      if (!init_basic_kernel_info (kernel, d, agent, module))
+	return -1;
+      if (strcmp (d->name, "_init_array") == 0)
+	module->init_array_func = kernel;
+      else if (strcmp (d->name, "_fini_array") == 0)
+        module->fini_array_func = kernel;
+      else
+	{
+	  pair->start = (uintptr_t) kernel;
+	  pair->end = (uintptr_t) (kernel + 1);
+	  pair++;
+	}
+      kernel++;
+    }
+
+  agent->module = module;
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return -1;
+    }
+
+  if (!create_and_finalize_hsa_program (agent))
+    return -1;
+
+  for (unsigned i = 0; i < var_count; i++)
+    {
+      struct global_var_info *v = &image_desc->global_variables[i];
+      HSA_DEBUG ("Looking for variable %s\n", v->name);
+
+      hsa_status_t status;
+      hsa_executable_symbol_t var_symbol;
+      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+						     v->name, agent->id,
+						     0, &var_symbol);
+
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not find symbol for variable in the code object",
+		   status);
+
+      uint64_t var_addr;
+      uint32_t var_size;
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable from its symbol", status);
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable size from its symbol", status);
+
+      pair->start = var_addr;
+      pair->end = var_addr + var_size;
+      HSA_DEBUG ("Found variable %s at %p with size %u\n", v->name,
+		 (void *)var_addr, var_size);
+      pair++;
+    }
+
+  /* Ensure that constructors are run first.  */
+  struct GOMP_kernel_launch_attributes kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  if (module->init_array_func)
+    {
+      init_kernel (module->init_array_func);
+      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
+    }
+  module->constructors_run_p = true;
+
+  return kernel_count + var_count;
+}
+
+/* Find the load_offset for MODULE, savte to *LOAD_OFFSET, and return true.  If
+   not found, return false.  */
+
+static bool
+find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
+		  struct module_info *module, Elf64_Ehdr *image,
+		  Elf64_Shdr *sections)
+{
+  bool res = false;
+
+  hsa_status_t status;
+
+  hsa_executable_symbol_t symbol;
+  if (!find_executable_symbol (agent->executable, &symbol))
+    return false;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not extract symbol address", status);
+      return false;
+    }
+
+  char *symbol_name = get_executable_symbol_name (symbol);
+  if (symbol_name == NULL)
+    return false;
+
+  /* Find the kernel function in ELF, and calculate actual load offset.  */
+  for (int i = 0; i < image->e_shnum; i++)
+    if (sections[i].sh_type == SHT_SYMTAB)
+      {
+	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
+	char *strings = (char *)image + strtab->sh_offset;
+
+	for (size_t offset = 0;
+	     offset < sections[i].sh_size;
+	     offset += sections[i].sh_entsize)
+	  {
+	    Elf64_Sym *sym = (Elf64_Sym*)((char*)image
+					  + sections[i].sh_offset
+					  + offset);
+	    if (strcmp (symbol_name, strings + sym->st_name) == 0)
+	      {
+		*load_offset -= sym->st_value;
+		res = true;
+		break;
+	      }
+	  }
+      }
+
+  free (symbol_name);
+  return res;
+}
+
+/* Create and finalize the program consisting of all loaded modules.  */
+
+static bool
+create_and_finalize_hsa_program (struct agent_info *agent)
+{
+  hsa_status_t status;
+  int reloc_count = 0;
+  bool res = true;
+  if (pthread_mutex_lock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+      return false;
+    }
+  if (agent->prog_finalized)
+    goto final;
+
+  status
+    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
+					HSA_EXECUTABLE_STATE_UNFROZEN,
+					"", &agent->executable);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not create GCN executable", status);
+      goto fail;
+    }
+
+  /* Load any GCN modules.  */
+  struct module_info *module = agent->module;
+  if (module)
+    {
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+
+      /* Hide relocations from the HSA runtime loader.
+	 Keep a copy of the unmodified section headers to use later.  */
+      Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
+						  + image->e_shoff);
+      for (int i = image->e_shnum - 1; i >= 0; i--)
+	{
+	  if (image_sections[i].sh_type == SHT_RELA
+	      || image_sections[i].sh_type == SHT_REL)
+	    /* Change section type to something harmless.  */
+	    image_sections[i].sh_type |= 0x80;
+	}
+
+      hsa_code_object_t co = { 0 };
+      status = hsa_fns.hsa_code_object_deserialize_fn
+	(module->image_desc->gcn_image->image,
+	 module->image_desc->gcn_image->size,
+	 NULL, &co);
+      if (status != HSA_STATUS_SUCCESS)
+	{
+	  hsa_error ("Could not deserialize GCN code object", status);
+	  goto fail;
+	}
+
+      status = hsa_fns.hsa_executable_load_code_object_fn
+	(agent->executable, agent->id, co, "");
+      if (status != HSA_STATUS_SUCCESS)
+	{
+	  hsa_error ("Could not load GCN code object", status);
+	  goto fail;
+	}
+
+      if (!module->heap)
+	{
+	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
+						   gcn_kernel_heap_size,
+						   (void**)&module->heap);
+	  if (status != HSA_STATUS_SUCCESS)
+	    {
+	      hsa_error ("Could not allocate memory for GCN heap", status);
+	      goto fail;
+	    }
+
+	  status = hsa_fns.hsa_memory_assign_agent_fn
+			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
+	  if (status != HSA_STATUS_SUCCESS)
+	    {
+	      hsa_error ("Could not assign GCN heap memory to device", status);
+	      goto fail;
+	    }
+
+	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
+				      &gcn_kernel_heap_size,
+				      sizeof (gcn_kernel_heap_size));
+	}
+
+    }
+
+  if (debug)
+    dump_executable_symbols (agent->executable);
+
+  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not freeze the GCN executable", status);
+      goto fail;
+    }
+
+  if (agent->module)
+    {
+      struct module_info *module = agent->module;
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+      Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
+
+      Elf64_Addr load_offset;
+      if (!find_load_offset (&load_offset, agent, module, image, sections))
+	goto fail;
+
+      /* Record the physical load address range.
+	 We need this for data copies later.  */
+      Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
+      Elf64_Addr low = ~0, high = 0;
+      for (int i = 0; i < image->e_phnum; i++)
+	if (segments[i].p_memsz > 0)
+	  {
+	    if (segments[i].p_paddr < low)
+	      low = segments[i].p_paddr;
+	    if (segments[i].p_paddr > high)
+	      high = segments[i].p_paddr + segments[i].p_memsz - 1;
+	  }
+      module->phys_address_start = low + load_offset;
+      module->phys_address_end = high + load_offset;
+
+      // Find dynamic symbol table
+      Elf64_Shdr *dynsym = NULL;
+      for (int i = 0; i < image->e_shnum; i++)
+	if (sections[i].sh_type == SHT_DYNSYM)
+	  {
+	    dynsym = &sections[i];
+	    break;
+	  }
+
+      /* Fix up relocations.  */
+      for (int i = 0; i < image->e_shnum; i++)
+	{
+	  if (sections[i].sh_type == (SHT_RELA | 0x80))
+	    for (size_t offset = 0;
+		 offset < sections[i].sh_size;
+		 offset += sections[i].sh_entsize)
+	      {
+		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
+						  + sections[i].sh_offset
+						  + offset);
+		Elf64_Sym *sym =
+		  (dynsym
+		   ? (Elf64_Sym*)((char*)image
+				  + dynsym->sh_offset
+				  + (dynsym->sh_entsize
+				     * ELF64_R_SYM (reloc->r_info)))
+		   : NULL);
+
+		int64_t S = (sym ? sym->st_value : 0);
+		int64_t P = reloc->r_offset + load_offset;
+		int64_t A = reloc->r_addend;
+		int64_t B = load_offset;
+		int64_t V, size;
+		switch (ELF64_R_TYPE (reloc->r_info))
+		  {
+		  case R_AMDGPU_ABS32_LO:
+		    V = (S + A) & 0xFFFFFFFF;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_ABS32_HI:
+		    V = (S + A) >> 32;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_ABS64:
+		    V = S + A;
+		    size = 8;
+		    break;
+		  case R_AMDGPU_REL32:
+		    V = S + A - P;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_REL64:
+		    /* FIXME
+		       LLD seems to emit REL64 where the the assembler has
+		       ABS64.  This is clearly wrong because it's not what the
+		       compiler is expecting.  Let's assume, for now, that
+		       it's a bug.  In any case, GCN kernels are always self
+		       contained and therefore relative relocations will have
+		       been resolved already, so this should be a safe
+		       workaround.  */
+		    V = S + A/* - P*/;
+		    size = 8;
+		    break;
+		  case R_AMDGPU_ABS32:
+		    V = S + A;
+		    size = 4;
+		    break;
+		    /* TODO R_AMDGPU_GOTPCREL */
+		    /* TODO R_AMDGPU_GOTPCREL32_LO */
+		    /* TODO R_AMDGPU_GOTPCREL32_HI */
+		  case R_AMDGPU_REL32_LO:
+		    V = (S + A - P) & 0xFFFFFFFF;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_REL32_HI:
+		    V = (S + A - P) >> 32;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_RELATIVE64:
+		    V = B + A;
+		    size = 8;
+		    break;
+		  default:
+		    fprintf (stderr, "Error: unsupported relocation type.\n");
+		    exit (1);
+		  }
+		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
+		if (status != HSA_STATUS_SUCCESS)
+		  {
+		    hsa_error ("Failed to fix up relocation", status);
+		    goto fail;
+		  }
+		reloc_count++;
+	      }
+	}
+    }
+
+  HSA_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
+	     agent->device_id, reloc_count);
+
+final:
+  agent->prog_finalized = true;
+
+  if (pthread_mutex_unlock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
+      res = false;
+    }
+
+  return res;
+
+fail:
+  res = false;
+  goto final;
+}
+
+/* Create kernel dispatch data structure for given KERNEL.  */
+
+static struct GOMP_hsa_kernel_dispatch *
+create_single_kernel_dispatch (struct kernel_info *kernel)
+{
+  struct agent_info *agent = kernel->agent;
+  struct GOMP_hsa_kernel_dispatch *shadow
+    = GOMP_PLUGIN_malloc_cleared (sizeof (struct GOMP_hsa_kernel_dispatch));
+
+  shadow->object = kernel->object;
+
+  hsa_signal_t sync_signal;
+  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating the GCN sync signal", status);
+
+  shadow->signal = sync_signal.handle;
+  shadow->private_segment_size = kernel->private_segment_size;
+  shadow->group_segment_size = kernel->group_segment_size;
+
+  /* Ensure that there is space for the gomp_print data.
+     See also gcn-run.c, in GCC.  */
+  size_t kss = kernel->kernarg_segment_size;
+  bool use_gomp_print = false;
+  if (kss <= 8)
+    {
+      kss = sizeof (struct kernargs);
+      use_gomp_print = true;
+    }
+
+  status
+    = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
+				      kss,
+				      &shadow->kernarg_address);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
+
+  struct kernargs *kernargs = shadow->kernarg_address;
+  if (use_gomp_print)
+    {
+      /* Zero-initialize the output_data (minimum needed).  */
+      kernargs->out_ptr = (int64_t)&kernargs->output_data;
+      kernargs->output_data.next_output = 0;
+      for (unsigned i = 0;
+	   i < (sizeof (kernargs->output_data.queue)
+		/ sizeof (kernargs->output_data.queue[0]));
+	   i++)
+	kernargs->output_data.queue[i].written = 0;
+      kernargs->output_data.consumed = 0;
+
+      /* Pass in the heap location.  */
+      kernargs->heap_ptr = (int64_t)kernel->module->heap;
+    }
+
+  kernargs->output_data.return_value = 0xcafe0000;
+
+  return shadow;
+}
+
+/* Output any data written by gomp_print_*.
+   Only enabled when the requested kernarg_segment_size would not
+   overwrite the gomp_print data.
+   We print all entries from print_index to the next entry without a "written"
+   flag.  Subsequent calls should use the returned print_index value to resume
+   from the same point.  */
+static void
+gomp_print_output (struct kernel_info *kernel, struct kernargs *kernargs,
+		   bool final)
+{
+  if (kernel->kernarg_segment_size <= 8)
+    {
+      unsigned int limit = (sizeof (kernargs->output_data.queue)
+			    / sizeof (kernargs->output_data.queue[0]));
+
+      unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
+					   __ATOMIC_ACQUIRE);
+      unsigned int to = kernargs->output_data.next_output;
+
+      if (from > to)
+	{
+	  /* Overflow.  */
+	  if (final)
+	    printf ("GCN print buffer overflowed.\n");
+	  return;
+	}
+
+      unsigned int i;
+      for (i = from; i < to; i++)
+	{
+	  struct printf_data *data = &kernargs->output_data.queue[i%limit];
+
+	  if (!data->written && !final)
+	    break;
+
+	  switch (data->type)
+	    {
+	    case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
+	    case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
+	    case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
+	    case 3: printf ("%.128s%.128s", data->msg, data->text); break;
+	    default: printf ("GCN print buffer error!\n"); break;
+	    }
+	  data->written = 0;
+	  __atomic_store_n (&kernargs->output_data.consumed, i+1,
+			    __ATOMIC_RELEASE);
+	}
+      fflush (stdout);
+    }
+}
+
+/* Release data structure created for a kernel dispatch in SHADOW argument.  */
+
+static void
+release_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *shadow)
+{
+  HSA_DEBUG ("Released kernel dispatch: %p\n", shadow);
+
+  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+
+  hsa_signal_t s;
+  s.handle = shadow->signal;
+  hsa_fns.hsa_signal_destroy_fn (s);
+
+  free (shadow);
+}
+
+/* Initialize a KERNEL without its dependencies.  MAX_OMP_DATA_SIZE is used
+   to calculate maximum necessary memory for OMP data allocation.  */
+
+static void
+init_single_kernel (struct kernel_info *kernel)
+{
+  hsa_status_t status;
+  struct agent_info *agent = kernel->agent;
+  hsa_executable_symbol_t kernel_symbol;
+  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+						 kernel->name, agent->id,
+						 0, &kernel_symbol);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_warn ("Could not find symbol for kernel in the code object", status);
+      fprintf (stderr, "not found name: '%s'\n", kernel->name);
+      dump_executable_symbols (agent->executable);
+      goto failure;
+    }
+  HSA_DEBUG ("Located kernel %s\n", kernel->name);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not extract a kernel object from its symbol", status);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+     &kernel->kernarg_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel argument size", status);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+     &kernel->group_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel group segment size", status);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+     &kernel->private_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel private segment size",
+	       status);
+
+  HSA_DEBUG ("Kernel structure for %s fully initialized with "
+	     "following segment sizes: \n", kernel->name);
+  HSA_DEBUG ("  group_segment_size: %u\n",
+	     (unsigned) kernel->group_segment_size);
+  HSA_DEBUG ("  private_segment_size: %u\n",
+	     (unsigned) kernel->private_segment_size);
+  HSA_DEBUG ("  kernarg_segment_size: %u\n",
+	     (unsigned) kernel->kernarg_segment_size);
+  return;
+
+failure:
+  kernel->initialization_failed = true;
+}
+
+/* Indent stream F by INDENT spaces.  */
+
+static void
+indent_stream (FILE *f, unsigned indent)
+{
+  fprintf (f, "%*s", indent, "");
+}
+
+/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
+
+static void
+print_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *dispatch,
+		       unsigned indent)
+{
+  indent_stream (stderr, indent);
+  fprintf (stderr, "this: %p\n", dispatch);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "queue: %p\n", dispatch->queue);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "kernarg_address: %p\n", dispatch->kernarg_address);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "object: %lu\n", dispatch->object);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "private_segment_size: %u\n",
+	   dispatch->private_segment_size);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "group_segment_size: %u\n",
+	   dispatch->group_segment_size);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "\n");
+}
+
+/* Create kernel dispatch data structure for a KERNEL and all its
+   dependencies.  */
+
+static struct GOMP_hsa_kernel_dispatch *
+create_kernel_dispatch (struct kernel_info *kernel)
+{
+  struct GOMP_hsa_kernel_dispatch *shadow
+    = create_single_kernel_dispatch (kernel);
+
+  return shadow;
+}
+
+/* Do all the work that is necessary before running KERNEL for the first time.
+   The function assumes the program has been created, finalized and frozen by
+   create_and_finalize_hsa_program.  */
+
+static void
+init_kernel (struct kernel_info *kernel)
+{
+  if (pthread_mutex_lock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
+  if (kernel->initialized)
+    {
+      if (pthread_mutex_unlock (&kernel->init_mutex))
+	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
+			   "mutex");
+
+      return;
+    }
+
+  /* Precomputed maximum size of OMP data necessary for a kernel from kernel
+     dispatch operation.  */
+  init_single_kernel (kernel);
+
+  if (!kernel->initialization_failed)
+    {
+      HSA_DEBUG ("\n");
+
+      kernel->initialized = true;
+    }
+  if (pthread_mutex_unlock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
+		       "mutex");
+}
+
+static int
+get_cu_count (struct agent_info *agent)
+{
+  uint32_t cu_count;
+  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
+    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  if (status == HSA_STATUS_SUCCESS)
+    return cu_count;
+  else
+    return 64;  /* The usual number for older devices.  */
+}
+
+/* Calculate the maximum grid size for OMP threads / OACC workers.
+   This depends on the kernel's resource usage levels.  */
+
+static int
+limit_worker_threads (int threads)
+{
+  /* FIXME Do something more inteligent here.
+     GCN can always run 4 threads within a Compute Unit, but
+     more than that depends on register usage.  */
+  if (threads > 16)
+    threads = 16;
+  return threads;
+}
+
+/* Parse the target attributes INPUT provided by the compiler and return true
+   if we should run anything all.  If INPUT is NULL, fill DEF with default
+   values, then store INPUT or DEF into *RESULT.  */
+
+static bool
+parse_target_attributes (void **input,
+			 struct GOMP_kernel_launch_attributes *def,
+			 struct GOMP_kernel_launch_attributes **result,
+			 struct agent_info *agent)
+{
+  if (!input)
+    GOMP_PLUGIN_fatal ("No target arguments provided");
+
+  bool grid_attrs_found = false;
+  bool gcn_dims_found = false;
+  int gcn_teams = 0;
+  int gcn_threads = 0;
+  while (*input)
+    {
+      intptr_t id = (intptr_t) *input++, val;
+
+      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+	val = (intptr_t) *input++;
+      else
+	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+
+      val = (val > INT_MAX) ? INT_MAX : val;
+
+      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
+	  && ((id & GOMP_TARGET_ARG_ID_MASK)
+	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
+	{
+	  grid_attrs_found = true;
+	  break;
+	}
+      else if ((id & GOMP_TARGET_ARG_DEVICE_ALL) == GOMP_TARGET_ARG_DEVICE_ALL)
+	{
+	  gcn_dims_found = true;
+	  switch (id & GOMP_TARGET_ARG_ID_MASK)
+	    {
+	    case GOMP_TARGET_ARG_NUM_TEAMS:
+	      gcn_teams = val;
+	      break;
+	    case GOMP_TARGET_ARG_THREAD_LIMIT:
+	      gcn_threads = limit_worker_threads (val);
+	      break;
+	    default:
+	      ;
+	    }
+	}
+    }
+
+  if (gcn_dims_found)
+    {
+      if (agent->gfx900_p && gcn_threads == 0 && override_z_dim == 0)
+	{
+	  gcn_threads = 4;
+	  HSA_DEBUG ("VEGA BUG WORKAROUND: reducing default number of "
+		     "threads to 4 per team.\n");
+	  HSA_DEBUG (" - If this is not a Vega 10 device, please use "
+		     "GCN_NUM_THREADS=16\n");
+	}
+
+      def->ndim = 3;
+      /* Fiji has 64 CUs, but Vega20 has 60.  */
+      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
+      /* Each thread is 64 work items wide.  */
+      def->gdims[1] = 64;
+      /* A work group can have 16 wavefronts.  */
+      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
+      def->wdims[0] = 1; /* Single team per work-group.  */
+      def->wdims[1] = 64;
+      def->wdims[2] = 16;
+      *result = def;
+      return true;
+    }
+  else if (!grid_attrs_found)
+    {
+      def->ndim = 1;
+      def->gdims[0] = 1;
+      def->gdims[1] = 1;
+      def->gdims[2] = 1;
+      def->wdims[0] = 1;
+      def->wdims[1] = 1;
+      def->wdims[2] = 1;
+      *result = def;
+      HSA_DEBUG ("GOMP_OFFLOAD_run called with no launch attributes\n");
+      return true;
+    }
+
+  struct GOMP_kernel_launch_attributes *kla;
+  kla = (struct GOMP_kernel_launch_attributes *) *input;
+  *result = kla;
+  if (kla->ndim == 0 || kla->ndim > 3)
+    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
+
+  HSA_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
+  unsigned i;
+  for (i = 0; i < kla->ndim; i++)
+    {
+      HSA_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
+		 kla->gdims[i], kla->wdims[i]);
+      if (kla->gdims[i] == 0)
+	return false;
+    }
+  return true;
+}
+
+/* Return the group size given the requested GROUP size, GRID size and number
+   of grid dimensions NDIM.  */
+
+static uint32_t
+get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
+{
+  if (group == 0)
+    {
+      /* TODO: Provide a default via environment or device characteristics.  */
+      if (ndim == 1)
+	group = 64;
+      else if (ndim == 2)
+	group = 8;
+      else
+	group = 4;
+    }
+
+  if (group > grid)
+    group = grid;
+  return group;
+}
+
+/* Return true if the HSA runtime can run function FN_PTR.  */
+
+bool
+GOMP_OFFLOAD_can_run (void *fn_ptr)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  init_kernel (kernel);
+  if (kernel->initialization_failed)
+    goto failure;
+
+  return true;
+
+failure:
+  if (suppress_host_fallback)
+    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+  HSA_DEBUG ("GCN target cannot be launched, doing a host fallback\n");
+  return false;
+}
+
+/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
+
+void
+packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
+{
+  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
+}
+
+/* Run KERNEL on its agent, pass VARS to it as arguments and take
+   launchattributes from KLA.  MODULE_LOCKED indicates that the caller
+   already holds the lock and run_kernel need not lock it again.
+   If AQ is NULL then agent->sync_queue will be used.  */
+
+static void
+run_kernel (struct kernel_info *kernel, void *vars,
+	    struct GOMP_kernel_launch_attributes *kla,
+	    struct goacc_asyncqueue *aq, bool module_locked)
+{
+  HSA_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
+	     (aq ? aq->id : 0));
+  HSA_DEBUG ("GCN launch attribs: gdims:[");
+  int i;
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	HSA_DPRINT (", ");
+      HSA_DPRINT ("%u", kla->gdims[i]);
+    }
+  HSA_DPRINT ("], normalized gdims:[");
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	HSA_DPRINT (", ");
+      HSA_DPRINT ("%u", kla->gdims[i] / kla->wdims[i]);
+    }
+  HSA_DPRINT ("], wdims:[");
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	HSA_DPRINT (", ");
+      HSA_DPRINT ("%u", kla->wdims[i]);
+    }
+  HSA_DPRINT ("]\n");
+  HSA_FLUSH ();
+
+  struct agent_info *agent = kernel->agent;
+  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
+    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
+
+  if (!agent->initialized)
+    GOMP_PLUGIN_fatal ("Agent must be initialized");
+
+  if (!kernel->initialized)
+    GOMP_PLUGIN_fatal ("Called kernel must be initialized");
+
+  struct GOMP_hsa_kernel_dispatch *shadow = create_kernel_dispatch (kernel);
+
+  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
+  shadow->queue = command_q;
+
+  if (debug)
+    {
+      fprintf (stderr, "\nKernel has following dependencies:\n");
+      print_kernel_dispatch (shadow, 2);
+    }
+
+  uint64_t index
+    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
+  HSA_DEBUG ("Got AQL index %llu\n", (long long int) index);
+
+  /* Wait until the queue is not full before writing the packet.   */
+  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
+	 >= command_q->size)
+    ;
+
+  /* Do not allow the dimensions to be overridden when running
+     constructors or destructors.  */
+  struct module_info *module = kernel->module;
+  bool init_fini_p = kernel == module->init_array_func
+		     || kernel == module->fini_array_func;
+  int override_x = init_fini_p ? 0 : override_x_dim;
+  int override_z = init_fini_p ? 0 : override_z_dim;
+
+  hsa_kernel_dispatch_packet_t *packet;
+  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
+	   + index % command_q->size;
+
+  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
+  packet->grid_size_x = override_x ? : kla->gdims[0];
+  packet->workgroup_size_x = get_group_size (kla->ndim,
+					     packet->grid_size_x,
+					     kla->wdims[0]);
+
+  if (kla->ndim >= 2)
+    {
+      packet->grid_size_y = kla->gdims[1];
+      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
+						 kla->wdims[1]);
+    }
+  else
+    {
+      packet->grid_size_y = 1;
+      packet->workgroup_size_y = 1;
+    }
+
+  if (kla->ndim == 3)
+    {
+      packet->grid_size_z = limit_worker_threads (override_z
+						  ? : kla->gdims[2]);
+      packet->workgroup_size_z = get_group_size (kla->ndim,
+						 packet->grid_size_z,
+						 kla->wdims[2]);
+    }
+  else
+    {
+      packet->grid_size_z = 1;
+      packet->workgroup_size_z = 1;
+    }
+
+  HSA_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
+	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
+	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
+	     packet->grid_size_x / packet->workgroup_size_x,
+	     packet->grid_size_y / packet->workgroup_size_y,
+	     packet->grid_size_z / packet->workgroup_size_z,
+	     packet->workgroup_size_x, packet->workgroup_size_y,
+	     packet->workgroup_size_z);
+
+  packet->private_segment_size = kernel->private_segment_size;
+  packet->group_segment_size = kernel->group_segment_size;
+  packet->kernel_object = kernel->object;
+  packet->kernarg_address = shadow->kernarg_address;
+  hsa_signal_t s;
+  s.handle = shadow->signal;
+  packet->completion_signal = s;
+  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
+  memcpy (shadow->kernarg_address, &vars, sizeof (vars));
+
+  /* PR hsa/70337.  */
+  size_t vars_size = sizeof (vars);
+  if (kernel->kernarg_segment_size > vars_size)
+    {
+      if (kernel->kernarg_segment_size != vars_size
+	  + sizeof (struct hsa_kernel_runtime *))
+	GOMP_PLUGIN_fatal ("Kernel segment size has an unexpected value");
+      memcpy (packet->kernarg_address + vars_size, &shadow,
+	      sizeof (struct hsa_kernel_runtime *));
+    }
+
+  HSA_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
+
+  uint16_t header;
+  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+  HSA_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
+	     agent->device_id);
+
+  packet_store_release ((uint32_t *) packet, header,
+			(uint16_t) kla->ndim
+			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
+
+  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
+				       index);
+
+  HSA_DEBUG ("Kernel dispatched, waiting for completion\n");
+
+  /* Root signal waits with 1ms timeout.  */
+  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
+					     1000 * 1000,
+					     HSA_WAIT_STATE_BLOCKED) != 0)
+    {
+      gomp_print_output (kernel, shadow->kernarg_address, false);
+    }
+  gomp_print_output (kernel, shadow->kernarg_address, true);
+
+  struct kernargs *kernargs = shadow->kernarg_address;
+  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
+
+  release_kernel_dispatch (shadow);
+
+  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
+    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
+
+  unsigned int upper = (return_value & ~0xffff) >> 16;
+  if (upper == 0xcafe)
+    ; // exit not called, normal termination.
+  else if (upper == 0xffff)
+    ; // exit called.
+  else
+    {
+      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
+			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
+			 return_value);
+      abort ();
+    }
+
+  if (upper == 0xffff)
+    {
+      unsigned int signal = (return_value >> 8) & 0xff;
+
+      if (signal == SIGABRT)
+	{
+	  HSA_DEBUG ("GCN Kernel aborted\n");
+	  abort ();
+	}
+      else if (signal != 0)
+	{
+	  HSA_DEBUG ("GCN Kernel received unknown signal\n");
+	  abort ();
+	}
+
+      HSA_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
+      exit (return_value & 0xff);
+    }
+}
+
+/* Part of the libgomp plugin interface.  Run a kernel on device N (the number
+   is actually ignored, we assume the FN_PTR has been mapped using the correct
+   device) and pass it an array of pointers in VARS as a parameter.  The kernel
+   is identified by FN_PTR which must point to a kernel_info structure.  */
+
+void
+GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
+{
+  struct agent_info *agent = get_agent_info (device);
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+  struct GOMP_kernel_launch_attributes def;
+  struct GOMP_kernel_launch_attributes *kla;
+  assert (agent == kernel->agent);
+
+  if (!parse_target_attributes (args, &def, &kla, agent))
+    {
+      HSA_DEBUG ("Will not run GCN kernel because the grid size is zero\n");
+      return;
+    }
+  run_kernel (kernel, vars, kla, NULL, false);
+}
+
+/* Set up an async queue for OpenMP.  There will be only one.
+   FIXME: is this thread-safe if two threads call this function?  */
+static void
+maybe_init_omp_async (struct agent_info *agent)
+{
+  if (!agent->omp_async_queue)
+    agent->omp_async_queue
+      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
+}
+
+/* Part of the libgomp plugin interface.  Run a kernel like GOMP_OFFLOAD_run
+   does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
+   has finished.  */
+
+void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+			void **args, void *async_data)
+{
+  HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
+  struct agent_info *agent = get_agent_info (device);
+  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
+  struct GOMP_kernel_launch_attributes def;
+  struct GOMP_kernel_launch_attributes *kla;
+  assert (agent == kernel->agent);
+
+  if (!parse_target_attributes (args, &def, &kla, agent))
+    {
+      HSA_DEBUG ("Will not run GCN kernel because the grid size is zero\n");
+      return;
+    }
+
+  maybe_init_omp_async (agent);
+  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
+  queue_push_callback (agent->omp_async_queue,
+		       GOMP_PLUGIN_target_task_completion, async_data);
+}
+
+/* Deinitialize all information associated with MODULE and kernels within
+   it.  Return TRUE on success.  */
+
+static bool
+destroy_module (struct module_info *module, bool locked)
+{
+  /* Run destructors before destroying module.  */
+  struct GOMP_kernel_launch_attributes kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  if (module->fini_array_func)
+    {
+      init_kernel (module->fini_array_func);
+      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
+    }
+  module->constructors_run_p = false;
+
+  int i;
+  for (i = 0; i < module->kernel_count; i++)
+    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
+      {
+	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
+			   "mutex");
+	return false;
+      }
+
+  return true;
+}
+
+/* Part of the libgomp plugin interface.  Unload GCN object-code module
+   described by struct gcn_image_desc in TARGET_DATA from agent number N.
+   Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
+{
+  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return false;
+    }
+
+  struct agent_info *agent;
+  agent = get_agent_info (n);
+  if (!agent)
+    return false;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return false;
+    }
+
+  if (!agent->module || agent->module->image_desc != target_data)
+    {
+      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
+			 "loaded before");
+      return false;
+    }
+
+  if (!destroy_module (agent->module, true))
+    return false;
+  free (agent->module);
+  agent->module = NULL;
+  if (!destroy_hsa_program (agent))
+    return false;
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return false;
+    }
+  return true;
+}
+
+/* Part of the libgomp plugin interface.  Deinitialize all information and
+   status associated with agent number N.  We do not attempt any
+   synchronization, assuming the user and libgomp will not attempt
+   deinitialization of a device that is in any way being used at the same
+   time.  Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_fini_device (int n)
+{
+  struct agent_info *agent = get_agent_info (n);
+  if (!agent)
+    return false;
+
+  if (!agent->initialized)
+    return true;
+
+  if (agent->omp_async_queue)
+    {
+      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
+      agent->omp_async_queue = NULL;
+    }
+
+  if (agent->module)
+    {
+      if (!destroy_module (agent->module, false))
+        return false;
+      free (agent->module);
+      agent->module = NULL;
+    }
+
+  if (!destroy_hsa_program (agent))
+    return false;
+
+  /*release_agent_shared_libraries (agent);*/
+
+  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error destroying command queue", status);
+
+  if (pthread_mutex_destroy (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
+      return false;
+    }
+  if (pthread_rwlock_destroy (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
+      return false;
+    }
+
+  if (pthread_mutex_destroy (&agent->async_queues_mutex))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
+      return false;
+    }
+  agent->initialized = false;
+  return true;
+}
+
+static void *
+GOMP_OFFLOAD_alloc_by_agent (struct agent_info *agent, size_t size)
+{
+  HSA_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
+
+  /* Zero-size allocations are invalid, so in order to return a valid pointer
+     we need to pass a valid size.  One source of zero-size allocations is
+     kernargs for kernels that have no inputs or outputs (the kernel may
+     only use gomp_print, for example).  */
+  if (size == 0)
+    size = 4;
+
+  void *ptr;
+  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
+							size, &ptr);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not allocate device memory", status);
+      return NULL;
+    }
+
+  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
+					       HSA_ACCESS_PERMISSION_RW);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not assign data memory to device", status);
+      return NULL;
+    }
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_dispatch_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      acc_prof_info *prof_info = thr->prof_info;
+      acc_event_info data_event_info;
+      acc_api_info *api_info = thr->api_info;
+
+      prof_info->event_type = acc_ev_alloc;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+	= acc_construct_parallel;
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL;
+      data_event_info.data_event.bytes = size;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) ptr;
+
+      api_info->device_api = acc_device_api_other;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
+
+  return ptr;
+}
+
+void *
+GOMP_OFFLOAD_alloc (int n, size_t size)
+{
+  struct agent_info *agent = get_agent_info (n);
+  return GOMP_OFFLOAD_alloc_by_agent (agent, size);
+}
+
+bool
+GOMP_OFFLOAD_free (int device, void *ptr)
+{
+  HSA_DEBUG ("Freeing memory on device %d\n", device);
+
+  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not free device memory", status);
+      return false;
+    }
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_dispatch_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      acc_prof_info *prof_info = thr->prof_info;
+      acc_event_info data_event_info;
+      acc_api_info *api_info = thr->api_info;
+
+      prof_info->event_type = acc_ev_free;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+	= acc_construct_parallel;
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL;
+      data_event_info.data_event.bytes = 0;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) ptr;
+
+      api_info->device_api = acc_device_api_other;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
+
+  return true;
+}
+
+struct copy_data
+{
+  void *dst;
+  const void *src;
+  size_t len;
+  bool using_src_copy;
+  struct goacc_asyncqueue *aq;
+};
+
+static void
+copy_data (void *data_)
+{
+  struct copy_data *data = (struct copy_data *)data_;
+  HSA_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
+	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
+	     data->dst);
+  hsa_fns.hsa_memory_copy_fn (data->dst, data->src, data->len);
+  if (data->using_src_copy)
+    free ((void *) data->src);
+  free (data);
+}
+
+static void
+queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
+		 size_t len, bool using_src_copy)
+{
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
+	       aq->agent->device_id, aq->id, len, src, dst);
+  struct copy_data *data
+    = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
+  data->dst = dst;
+  data->src = src;
+  data->len = len;
+  data->using_src_copy = using_src_copy;
+  data->aq = aq;
+  queue_push_callback (aq, copy_data, data);
+}
+
+bool
+GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
+{
+  HSA_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
+	     src, dst);
+  hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
+{
+  HSA_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
+	     device, dst);
+  hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  return true;
+}
+
+/* Part of the libgomp plugin interface.  */
+
+bool
+GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
+{
+  struct gcn_thread *thread_data = gcn_thread ();
+
+  if (thread_data && !async_synchronous_p (thread_data->async))
+    {
+      struct agent_info *agent = get_agent_info (device);
+      maybe_init_omp_async (agent);
+      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
+      return true;
+    }
+
+  HSA_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
+	     device, src, device, dst);
+  hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  return true;
+}
+
+static int
+queue_empty (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+  int res = aq->queue_n == 0 ? 1 : 0;
+  pthread_mutex_unlock (&aq->mutex);
+
+  return res;
+}
+
+static void
+wait_queue (struct goacc_asyncqueue *aq)
+{
+  if (DRAIN_QUEUE_SYNCHRONOUS_P)
+    {
+      drain_queue_synchronous (aq);
+      return;
+    }
+
+  pthread_mutex_lock (&aq->mutex);
+
+  while (aq->queue_n > 0)
+    {
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
+		   aq->agent->device_id, aq->id);
+      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
+		   aq->id);
+    }
+
+  pthread_mutex_unlock (&aq->mutex);
+  HSA_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
+}
+
+static void
+gomp_offload_free (void *ptr)
+{
+  HSA_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
+  GOMP_OFFLOAD_free (0, ptr);
+}
+
+static void
+gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
+	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
+	  struct goacc_asyncqueue *aq)
+{
+  if (!GOMP_OFFLOAD_can_run (kernel))
+    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
+
+  // For some reason, devaddrs must be double-indirect on the target
+  void **ind_da = GOMP_OFFLOAD_alloc_by_agent (kernel->agent,
+					       sizeof (void*) * mapnum);
+  for (size_t i = 0; i < mapnum; i++)
+    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
+				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
+				sizeof (void *));
+
+  struct hsa_kernel_description *hsa_kernel_desc = NULL;
+  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
+    {
+      struct hsa_kernel_description *d
+	= &kernel->module->image_desc->kernel_infos[i];
+      if (d->name == kernel->name)
+	{
+	  hsa_kernel_desc = d;
+	  break;
+	}
+    }
+
+  /* We may have statically-determined dimensions in
+     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
+     invocation at runtime in dims[].  We allow static dimensions to take
+     priority over dynamic dimensions when present (non-zero).  */
+  if (hsa_kernel_desc->oacc_dims[0] > 0)
+    dims[0] = hsa_kernel_desc->oacc_dims[0];
+  if (hsa_kernel_desc->oacc_dims[1] > 0)
+    dims[1] = hsa_kernel_desc->oacc_dims[1];
+  if (hsa_kernel_desc->oacc_dims[2] > 0)
+    dims[2] = hsa_kernel_desc->oacc_dims[2];
+
+  /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
+     There isn't really a correct answer for this without a clue about the
+     problem size, so let's do a reasonable number of single-worker gangs.
+     64 gangs matches a typical Fiji device.  */
+
+  if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
+  if (dims[1] == 0) dims[1] = 16; /* Workers.  */
+
+  /* The incoming dimensions are expressed in terms of gangs, workers, and
+     vectors.  The HSA dimensions are expressed in terms of "work-items",
+     which means multiples of vector lanes.
+
+     The "grid size" specifies the size of the problem space, and the
+     "work-group size" specifies how much of that we want a single compute
+     unit to chew on at once.
+
+     The three dimensions do not really correspond to hardware, but the
+     important thing is that the HSA runtime will launch as many
+     work-groups as it takes to process the entire grid, and each
+     work-group will contain as many wave-fronts as it takes to process
+     the work-items in that group.
+
+     Essentially, as long as we set the Y dimension to 64 (the number of
+     vector lanes in hardware), and the Z group size to the maximum (16),
+     then we will get the gangs (X) and workers (Z) launched as we expect.
+
+     The reason for the apparent reversal of vector and worker dimension
+     order is to do with the way the run-time distributes work-items across
+     v1 and v2.  */
+  struct GOMP_kernel_launch_attributes kla =
+    {3,
+     /* Grid size.  */
+     {dims[0], 64, dims[1]},
+     /* Work-group size.  */
+     {1,       64, 16}
+    };
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info enqueue_launch_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_start;
+
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      enqueue_launch_event_info.launch_event.valid_bytes
+	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
+      enqueue_launch_event_info.launch_event.parent_construct
+	= acc_construct_parallel;
+      enqueue_launch_event_info.launch_event.implicit = 1;
+      enqueue_launch_event_info.launch_event.tool_info = NULL;
+      enqueue_launch_event_info.launch_event.kernel_name
+	= (char *) kernel->name;
+      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
+      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
+      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
+
+      api_info->device_api = acc_device_api_other;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
+	&enqueue_launch_event_info, api_info);
+    }
+
+  if (!async)
+    {
+      run_kernel (kernel, ind_da, &kla, NULL, false);
+      gomp_offload_free (ind_da);
+    }
+  else
+    {
+      queue_push_launch (aq, kernel, ind_da, &kla);
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
+		   aq->agent->device_id, aq->id, ind_da);
+      queue_push_callback (aq, gomp_offload_free, ind_da);
+    }
+
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_end;
+      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
+					    &enqueue_launch_event_info,
+					    api_info);
+    }
+}
+
+void
+GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
+			   void **hostaddrs, void **devaddrs, unsigned *dims,
+			   void *targ_mem_desc)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
+	    NULL);
+}
+
+void
+GOMP_OFFLOAD_openacc_exec_params (void (*fn_ptr) (void *), size_t mapnum,
+				  void **hostaddrs, void **devaddrs,
+				  unsigned *dims, void *targ_mem_desc)
+{
+  GOMP_PLUGIN_fatal ("OpenACC exec params unimplemented.");
+}
+
+void
+GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
+				 void **hostaddrs, void **devaddrs,
+				 unsigned *dims, void *targ_mem_desc,
+				 struct goacc_asyncqueue *aq)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
+	    aq);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_exec_params (void (*fn) (void *), size_t mapnum,
+					void **hostaddrs, void **devaddrs,
+					unsigned *dims, void *targ_mem_desc,
+					struct goacc_asyncqueue *aq)
+{
+  GOMP_PLUGIN_fatal ("OpenACC async exec params unimplemented.");
+}
+
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device)
+{
+  struct agent_info *agent = get_agent_info (device);
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+
+  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
+  aq->agent = get_agent_info (device);
+  aq->prev = NULL;
+  aq->next = agent->async_queues;
+  if (aq->next)
+    {
+      aq->next->prev = aq;
+      aq->id = aq->next->id + 1;
+    }
+  else
+    aq->id = 1;
+  agent->async_queues = aq;
+
+  aq->queue_first = 0;
+  aq->queue_n = 0;
+  aq->drain_queue_stop = 0;
+
+  if (pthread_mutex_init (&aq->mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
+      return false;
+    }
+  if (pthread_cond_init (&aq->queue_cond_in, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
+      return false;
+    }
+  if (pthread_cond_init (&aq->queue_cond_out, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
+      return false;
+    }
+
+  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
+						     ASYNC_QUEUE_SIZE,
+						     HSA_QUEUE_TYPE_MULTI,
+						     queue_callback, NULL,
+						     UINT32_MAX, UINT32_MAX,
+						     &aq->hsa_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating command queue", status);
+
+  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
+		       strerror (err));
+  HSA_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
+	     aq->id);
+
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+
+  return aq;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = aq->agent;
+
+  finalize_async_thread (aq);
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+
+  int err;
+  if ((err = pthread_mutex_destroy (&aq->mutex)))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
+      goto fail;
+    }
+  if (pthread_cond_destroy (&aq->queue_cond_in))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+      goto fail;
+    }
+  if (pthread_cond_destroy (&aq->queue_cond_out))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+      goto fail;
+    }
+  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Error destroying command queue", status);
+      goto fail;
+    }
+
+  if (aq->prev)
+    aq->prev->next = aq->next;
+  if (aq->next)
+    aq->next->prev = aq->prev;
+  if (agent->async_queues == aq)
+    agent->async_queues = aq->next;
+
+  HSA_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
+
+  free (aq);
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return true;
+
+fail:
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return false;
+}
+
+int
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
+{
+  return queue_empty (aq);
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  wait_queue (aq);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+				      struct goacc_asyncqueue *aq2)
+{
+  /* For serialize, stream aq2 waits for aq1 to complete work that has been
+     scheduled to run on it up to this point.  */
+  if (aq1 != aq2)
+    {
+      struct placeholder *placeholderp = queue_push_placeholder (aq1);
+      queue_push_asyncwait (aq2, placeholderp);
+    }
+  return true;
+}
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+					   void (*fn) (void *), void *data)
+{
+  queue_push_callback (aq, fn, data);
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
+				     size_t n, bool ephemeral,
+				     struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+  assert (agent == aq->agent);
+
+  if (ephemeral)
+    {
+      /* The source data is on the stack or otherwise may be deallocated
+	 before the asynchronous copy takes place.  Take a copy of the source
+	 data.  */
+      void *src_copy = GOMP_PLUGIN_malloc (n);
+      memcpy (src_copy, src, n);
+      src = src_copy;
+    }
+  queue_push_copy (aq, dst, src, n, ephemeral);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+  assert (agent == aq->agent);
+  queue_push_copy (aq, dst, src, n, false);
+  return true;
+}
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
+{
+  struct gcn_thread *thread_data
+    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
+
+  thread_data->async = GOMP_ASYNC_SYNC;
+
+  return (void *) thread_data;
+}
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+  free (data);
+}
diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c
index e0bc87c..8b4dbfb 100644
--- a/libgomp/plugin/plugin-hsa.c
+++ b/libgomp/plugin/plugin-hsa.c
@@ -689,6 +689,32 @@
   return hsa_context.agent_count;
 }
 
+/* Part of the libgomp plugin interface.  Return the value of property
+   PROP of agent number N.  */
+
+union gomp_device_property_value
+GOMP_OFFLOAD_get_property (int n, int prop)
+{
+  union gomp_device_property_value nullval = { .val = 0 };
+
+  if (!init_hsa_context ())
+    return nullval;
+  if (n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error
+	("Request for a property of a non-existing HSA device %i", n);
+      return nullval;
+    }
+
+  switch (prop)
+    {
+    case GOMP_DEVICE_PROPERTY_VENDOR:
+      return (union gomp_device_property_value) { .ptr = "AMD" };
+    default:
+      return nullval;
+    }
+}
+
 /* Part of the libgomp plugin interface.  Initialize agent number N so that it
    can be used for computation.  Return TRUE on success.  */
 
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 387e7cc..452415e 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -37,6 +37,7 @@
 #include "libgomp-plugin.h"
 #include "oacc-plugin.h"
 #include "gomp-constants.h"
+#include "oacc-int.h"
 
 #include <pthread.h>
 #include <cuda.h>
@@ -192,175 +193,30 @@
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
-struct cuda_map
+/* NVPTX/CUDA specific definition of asynchronous queues.  */
+struct goacc_asyncqueue
 {
-  CUdeviceptr d;
-  size_t size;
-  bool active;
-  struct cuda_map *next;
+  CUstream cuda_stream;
 };
 
-struct ptx_stream
+struct nvptx_callback
 {
-  CUstream stream;
-  pthread_t host_thread;
-  bool multithreaded;
-  struct cuda_map *map;
-  struct ptx_stream *next;
+  void (*fn) (void *);
+  void *ptr;
+  struct goacc_asyncqueue *aq;
+  struct nvptx_callback *next;
 };
 
 /* Thread-specific data for PTX.  */
 
 struct nvptx_thread
 {
-  struct ptx_stream *current_stream;
+  /* We currently have this embedded inside the plugin because libgomp manages
+     devices through integer target_ids.  This might be better if using an
+     opaque target-specific pointer directly from gomp_device_descr.  */
   struct ptx_device *ptx_dev;
 };
 
-static struct cuda_map *
-cuda_map_create (size_t size)
-{
-  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
-
-  assert (map);
-
-  map->next = NULL;
-  map->size = size;
-  map->active = false;
-
-  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
-  assert (map->d);
-
-  return map;
-}
-
-static void
-cuda_map_destroy (struct cuda_map *map)
-{
-  if (map->active)
-    /* Possible reasons for the map to be still active:
-       - the associated async kernel might still be running.
-       - the associated async kernel might have finished, but the
-         corresponding event that should trigger the pop_map has not been
-	 processed by event_gc.
-       - the associated sync kernel might have aborted
-
-       The async cases could happen if the user specified an async region
-       without adding a corresponding wait that is guaranteed to be executed
-       (before returning from main, or in an atexit handler).
-       We do not want to deallocate a device pointer that is still being
-       used, so skip it.
-
-       In the sync case, the device pointer is no longer used, but deallocating
-       it using cuMemFree will not succeed, so skip it.
-
-       TODO: Handle this in a more constructive way, by f.i. waiting for streams
-       to finish before de-allocating them (PR88981), or by ensuring the CUDA
-       lib atexit handler is called before rather than after the libgomp plugin
-       atexit handler (PR83795).  */
-    ;
-  else
-    CUDA_CALL_NOCHECK (cuMemFree, map->d);
-
-  free (map);
-}
-
-/* The following map_* routines manage the CUDA device memory that
-   contains the data mapping arguments for cuLaunchKernel.  Each
-   asynchronous PTX stream may have multiple pending kernel
-   invocations, which are launched in a FIFO order.  As such, the map
-   routines maintains a queue of cuLaunchKernel arguments.
-
-   Calls to map_push and map_pop must be guarded by ptx_event_lock.
-   Likewise, calls to map_init and map_fini are guarded by
-   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
-   GOMP_OFFLOAD_fini_device, respectively.  */
-
-static bool
-map_init (struct ptx_stream *s)
-{
-  int size = getpagesize ();
-
-  assert (s);
-
-  s->map = cuda_map_create (size);
-
-  return true;
-}
-
-static bool
-map_fini (struct ptx_stream *s)
-{
-  assert (s->map->next == NULL);
-
-  cuda_map_destroy (s->map);
-
-  return true;
-}
-
-static void
-map_pop (struct ptx_stream *s)
-{
-  struct cuda_map *next;
-
-  assert (s != NULL);
-
-  if (s->map->next == NULL)
-    {
-      s->map->active = false;
-      return;
-    }
-
-  next = s->map->next;
-  cuda_map_destroy (s->map);
-  s->map = next;
-}
-
-static CUdeviceptr
-map_push (struct ptx_stream *s, size_t size)
-{
-  struct cuda_map *map = NULL;
-  struct cuda_map **t;
-
-  assert (s);
-  assert (s->map);
-
-  /* Select an element to push.  */
-  if (s->map->active)
-    map = cuda_map_create (size);
-  else
-    {
-      /* Pop the inactive front element.  */
-      struct cuda_map *pop = s->map;
-      s->map = pop->next;
-      pop->next = NULL;
-
-      if (pop->size < size)
-	{
-	  cuda_map_destroy (pop);
-
-	  map = cuda_map_create (size);
-	}
-      else
-	map = pop;
-    }
-
-  /* Check that the element is as expected.  */
-  assert (map->next == NULL);
-  assert (!map->active);
-
-  /* Mark the element active.  */
-  map->active = true;
-
-  /* Push the element to the back of the list.  */
-  for (t = &s->map; (*t) != NULL; t = &(*t)->next)
-    ;
-  assert (t != NULL && *t == NULL);
-  *t = map;
-
-  return map->d;
-}
-
 /* Target data function launch information.  */
 
 struct targ_fn_launch
@@ -412,22 +268,18 @@
   struct ptx_image_data *next;
 };
 
+struct ptx_free_block
+{
+  void *ptr;
+  struct ptx_free_block *next;
+};
+
 struct ptx_device
 {
   CUcontext ctx;
   bool ctx_shared;
   CUdevice dev;
-  struct ptx_stream *null_stream;
-  /* All non-null streams associated with this device (actually context),
-     either created implicitly or passed in from the user (via
-     acc_set_cuda_stream).  */
-  struct ptx_stream *active_streams;
-  struct {
-    struct ptx_stream **arr;
-    int size;
-  } async_streams;
-  /* A lock for use when manipulating the above stream list and array.  */
-  pthread_mutex_t stream_lock;
+
   int ord;
   bool overlap;
   bool map;
@@ -445,32 +297,13 @@
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
-  
+
+  struct ptx_free_block *free_blocks;
+  pthread_mutex_t free_blocks_lock;
+
   struct ptx_device *next;
 };
 
-enum ptx_event_type
-{
-  PTX_EVT_MEM,
-  PTX_EVT_KNL,
-  PTX_EVT_SYNC,
-  PTX_EVT_ASYNC_CLEANUP
-};
-
-struct ptx_event
-{
-  CUevent *evt;
-  int type;
-  void *addr;
-  int ord;
-  int val;
-
-  struct ptx_event *next;
-};
-
-static pthread_mutex_t ptx_event_lock;
-static struct ptx_event *ptx_events;
-
 static struct ptx_device **ptx_devices;
 
 static inline struct nvptx_thread *
@@ -479,193 +312,6 @@
   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 }
 
-static bool
-init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
-{
-  int i;
-  struct ptx_stream *null_stream
-    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
-  null_stream->stream = NULL;
-  null_stream->host_thread = pthread_self ();
-  null_stream->multithreaded = true;
-  if (!map_init (null_stream))
-    return false;
-
-  ptx_dev->null_stream = null_stream;
-  ptx_dev->active_streams = NULL;
-  pthread_mutex_init (&ptx_dev->stream_lock, NULL);
-
-  if (concurrency < 1)
-    concurrency = 1;
-
-  /* This is just a guess -- make space for as many async streams as the
-     current device is capable of concurrently executing.  This can grow
-     later as necessary.  No streams are created yet.  */
-  ptx_dev->async_streams.arr
-    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
-  ptx_dev->async_streams.size = concurrency;
-
-  for (i = 0; i < concurrency; i++)
-    ptx_dev->async_streams.arr[i] = NULL;
-
-  return true;
-}
-
-static bool
-fini_streams_for_device (struct ptx_device *ptx_dev)
-{
-  free (ptx_dev->async_streams.arr);
-
-  bool ret = true;
-  while (ptx_dev->active_streams != NULL)
-    {
-      struct ptx_stream *s = ptx_dev->active_streams;
-      ptx_dev->active_streams = ptx_dev->active_streams->next;
-
-      ret &= map_fini (s);
-
-      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
-      if (r != CUDA_SUCCESS)
-	{
-	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
-	  ret = false;
-	}
-      free (s);
-    }
-
-  ret &= map_fini (ptx_dev->null_stream);
-  free (ptx_dev->null_stream);
-  return ret;
-}
-
-/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
-   thread THREAD (and also current device/context).  If CREATE is true, create
-   the stream if it does not exist (or use EXISTING if it is non-NULL), and
-   associate the stream with the same thread argument.  Returns stream to use
-   as result.  */
-
-static struct ptx_stream *
-select_stream_for_async (int async, pthread_t thread, bool create,
-			 CUstream existing)
-{
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  /* Local copy of TLS variable.  */
-  struct ptx_device *ptx_dev = nvthd->ptx_dev;
-  struct ptx_stream *stream = NULL;
-  int orig_async = async;
-
-  /* The special value acc_async_noval (-1) maps (for now) to an
-     implicitly-created stream, which is then handled the same as any other
-     numbered async stream.  Other options are available, e.g. using the null
-     stream for anonymous async operations, or choosing an idle stream from an
-     active set.  But, stick with this for now.  */
-  if (async > acc_async_sync)
-    async++;
-
-  if (create)
-    pthread_mutex_lock (&ptx_dev->stream_lock);
-
-  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
-     null stream, and in fact better performance may be obtainable if it doesn't
-     (because the null stream enforces overly-strict synchronisation with
-     respect to other streams for legacy reasons, and that's probably not
-     needed with OpenACC).  Maybe investigate later.  */
-  if (async == acc_async_sync)
-    stream = ptx_dev->null_stream;
-  else if (async >= 0 && async < ptx_dev->async_streams.size
-	   && ptx_dev->async_streams.arr[async] && !(create && existing))
-    stream = ptx_dev->async_streams.arr[async];
-  else if (async >= 0 && create)
-    {
-      if (async >= ptx_dev->async_streams.size)
-	{
-	  int i, newsize = ptx_dev->async_streams.size * 2;
-
-	  if (async >= newsize)
-	    newsize = async + 1;
-
-	  ptx_dev->async_streams.arr
-	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
-				   newsize * sizeof (struct ptx_stream *));
-
-	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
-	    ptx_dev->async_streams.arr[i] = NULL;
-
-	  ptx_dev->async_streams.size = newsize;
-	}
-
-      /* Create a new stream on-demand if there isn't one already, or if we're
-	 setting a particular async value to an existing (externally-provided)
-	 stream.  */
-      if (!ptx_dev->async_streams.arr[async] || existing)
-        {
-	  CUresult r;
-	  struct ptx_stream *s
-	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
-	  if (existing)
-	    s->stream = existing;
-	  else
-	    {
-	      r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
-				     CU_STREAM_DEFAULT);
-	      if (r != CUDA_SUCCESS)
-		{
-		  pthread_mutex_unlock (&ptx_dev->stream_lock);
-		  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
-				     cuda_error (r));
-		}
-	    }
-
-	  /* If CREATE is true, we're going to be queueing some work on this
-	     stream.  Associate it with the current host thread.  */
-	  s->host_thread = thread;
-	  s->multithreaded = false;
-
-	  if (!map_init (s))
-	    {
-	      pthread_mutex_unlock (&ptx_dev->stream_lock);
-	      GOMP_PLUGIN_fatal ("map_init fail");
-	    }
-
-	  s->next = ptx_dev->active_streams;
-	  ptx_dev->active_streams = s;
-	  ptx_dev->async_streams.arr[async] = s;
-	}
-
-      stream = ptx_dev->async_streams.arr[async];
-    }
-  else if (async < 0)
-    {
-      if (create)
-	pthread_mutex_unlock (&ptx_dev->stream_lock);
-      GOMP_PLUGIN_fatal ("bad async %d", async);
-    }
-
-  if (create)
-    {
-      assert (stream != NULL);
-
-      /* If we're trying to use the same stream from different threads
-	 simultaneously, set stream->multithreaded to true.  This affects the
-	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
-	 only wait for asynchronous launches from the same host thread they are
-	 invoked on.  If multiple threads use the same async value, we make note
-	 of that here and fall back to testing/waiting for all threads in those
-	 functions.  */
-      if (thread != stream->host_thread)
-        stream->multithreaded = true;
-
-      pthread_mutex_unlock (&ptx_dev->stream_lock);
-    }
-  else if (stream && !stream->multithreaded
-	   && !pthread_equal (stream->host_thread, thread))
-    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
-
-  return stream;
-}
-
 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
    should be locked on entry and remains locked on exit.  */
 
@@ -677,9 +323,6 @@
   if (instantiated_devices != 0)
     return true;
 
-  ptx_events = NULL;
-  pthread_mutex_init (&ptx_event_lock, NULL);
-
   if (!init_cuda_lib ())
     return false;
 
@@ -703,6 +346,11 @@
   CUcontext thd_ctx;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
+  if (r == CUDA_ERROR_NOT_PERMITTED)
+    {
+      /* Assume we're in a CUDA callback, just return true.  */
+      return true;
+    }
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     {
       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -847,8 +495,8 @@
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
-  if (!init_streams_for_device (ptx_dev, async_engines))
-    return NULL;
+  ptx_dev->free_blocks = NULL;
+  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 
   return ptx_dev;
 }
@@ -859,9 +507,15 @@
   if (!ptx_dev)
     return true;
 
-  if (!fini_streams_for_device (ptx_dev))
-    return false;
-  
+  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
+    {
+      struct ptx_free_block *b_next = b->next;
+      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
+      free (b);
+      b = b_next;
+    }
+
+  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
   if (!ptx_dev->ctx_shared)
@@ -1041,139 +695,18 @@
 }
 
 static void
-event_gc (bool memmap_lockable)
-{
-  struct ptx_event *ptx_event = ptx_events;
-  struct ptx_event *async_cleanups = NULL;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  pthread_mutex_lock (&ptx_event_lock);
-
-  while (ptx_event != NULL)
-    {
-      CUresult r;
-      struct ptx_event *e = ptx_event;
-
-      ptx_event = ptx_event->next;
-
-      if (e->ord != nvthd->ptx_dev->ord)
-	continue;
-
-      r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
-      if (r == CUDA_SUCCESS)
-	{
-	  bool append_async = false;
-	  CUevent *te;
-
-	  te = e->evt;
-
-	  switch (e->type)
-	    {
-	    case PTX_EVT_MEM:
-	    case PTX_EVT_SYNC:
-	      break;
-
-	    case PTX_EVT_KNL:
-	      map_pop (e->addr);
-	      break;
-
-	    case PTX_EVT_ASYNC_CLEANUP:
-	      {
-		/* The function gomp_plugin_async_unmap_vars needs to claim the
-		   memory-map splay tree lock for the current device, so we
-		   can't call it when one of our callers has already claimed
-		   the lock.  In that case, just delay the GC for this event
-		   until later.  */
-		if (!memmap_lockable)
-		  continue;
-
-		append_async = true;
-	      }
-	      break;
-	    }
-
-	  CUDA_CALL_NOCHECK (cuEventDestroy, *te);
-	  free ((void *)te);
-
-	  /* Unlink 'e' from ptx_events list.  */
-	  if (ptx_events == e)
-	    ptx_events = ptx_events->next;
-	  else
-	    {
-	      struct ptx_event *e_ = ptx_events;
-	      while (e_->next != e)
-		e_ = e_->next;
-	      e_->next = e_->next->next;
-	    }
-
-	  if (append_async)
-	    {
-	      e->next = async_cleanups;
-	      async_cleanups = e;
-	    }
-	  else
-	    free (e);
-	}
-    }
-
-  pthread_mutex_unlock (&ptx_event_lock);
-
-  /* We have to do these here, after ptx_event_lock is released.  */
-  while (async_cleanups)
-    {
-      struct ptx_event *e = async_cleanups;
-      async_cleanups = async_cleanups->next;
-
-      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
-      free (e);
-    }
-}
-
-static void
-event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
-{
-  struct ptx_event *ptx_event;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
-	  || type == PTX_EVT_ASYNC_CLEANUP);
-
-  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
-  ptx_event->type = type;
-  ptx_event->evt = e;
-  ptx_event->addr = h;
-  ptx_event->ord = nvthd->ptx_dev->ord;
-  ptx_event->val = val;
-
-  pthread_mutex_lock (&ptx_event_lock);
-
-  ptx_event->next = ptx_events;
-  ptx_events = ptx_event;
-
-  pthread_mutex_unlock (&ptx_event_lock);
-}
-
-static void
 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-	    int async, unsigned *dims, void *targ_mem_desc)
+	    unsigned *dims, void *targ_mem_desc,
+	    void **kargs, CUstream stream)
 {
   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
   CUfunction function;
-  CUresult r;
   int i;
-  struct ptx_stream *dev_str;
-  void *kargs[1];
-  void *hp;
-  CUdeviceptr dp = 0;
   struct nvptx_thread *nvthd = nvptx_thread ();
   int warp_size = nvthd->ptx_dev->warp_size;
-  const char *maybe_abort_msg = "(perhaps abort was called)";
 
   function = targ_fn->fn;
 
-  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
-  assert (dev_str == nvthd->current_stream);
-
   /* Initialize the launch dimensions.  Typically this is constant,
      provided by the device compiler, but we must permit runtime
      values.  */
@@ -1361,27 +894,6 @@
 			   dims[GOMP_DIM_VECTOR]);
     }
 
-  if (mapnum > 0)
-    {
-      /* This reserves a chunk of a pre-allocated page of memory mapped on both
-	 the host and the device. HP is a host pointer to the new chunk, and DP is
-	 the corresponding device pointer.  */
-      pthread_mutex_lock (&ptx_event_lock);
-      dp = map_push (dev_str, mapnum * sizeof (void *));
-      pthread_mutex_unlock (&ptx_event_lock);
-
-      GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-
-      /* Copy the array of arguments to the mapped page.  */
-      hp = alloca(sizeof(void *) * mapnum);
-      for (i = 0; i < mapnum; i++)
-	((void **) hp)[i] = devaddrs[i];
-
-      /* Copy the (device) pointers to arguments to the device */
-      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
-			mapnum * sizeof (void *));
-    }
-
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
 		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
@@ -1393,77 +905,136 @@
   // num_workers	ntid.y
   // vector length	ntid.x
 
-  kargs[0] = &dp;
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info enqueue_launch_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_start;
+
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      enqueue_launch_event_info.launch_event.valid_bytes
+	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
+      enqueue_launch_event_info.launch_event.parent_construct
+	= acc_construct_parallel;
+      enqueue_launch_event_info.launch_event.implicit = 1;
+      enqueue_launch_event_info.launch_event.tool_info = NULL;
+      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
+      enqueue_launch_event_info.launch_event.num_gangs
+	= dims[GOMP_DIM_GANG];
+      enqueue_launch_event_info.launch_event.num_workers
+	= dims[GOMP_DIM_WORKER];
+      enqueue_launch_event_info.launch_event.vector_length
+	= dims[GOMP_DIM_VECTOR];
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+					    api_info);
+    }
+
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
 		    dims[GOMP_DIM_GANG], 1, 1,
 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
-		    0, dev_str->stream, kargs, 0);
+		    0, stream, kargs, 0);
 
-#ifndef DISABLE_ASYNC
-  if (async < acc_async_noval)
+  if (profiling_p)
     {
-      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
-      if (r == CUDA_ERROR_LAUNCH_FAILED)
-	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
-			   maybe_abort_msg);
-      else if (r != CUDA_SUCCESS)
-        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+      prof_info->event_type = acc_ev_enqueue_launch_end;
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+					    api_info);
     }
-  else
-    {
-      CUevent *e;
-
-      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-      if (r == CUDA_ERROR_LAUNCH_FAILED)
-	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
-			   maybe_abort_msg);
-      else if (r != CUDA_SUCCESS)
-        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
-      event_gc (true);
-
-      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
-
-      if (mapnum > 0)
-	event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
-    }
-#else
-  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (r == CUDA_ERROR_LAUNCH_FAILED)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
-		       maybe_abort_msg);
-  else if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-#endif
 
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
 		     targ_fn->launch->fn);
-
-#ifndef DISABLE_ASYNC
-  if (async < acc_async_noval)
-#endif
-    {
-      if (mapnum > 0)
-	map_pop (dev_str);
-    }
 }
 
 void * openacc_get_current_cuda_context (void);
 
+static void
+goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
+{
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+
+  prof_info->event_type = acc_ev_alloc;
+
+  data_event_info.data_event.event_type = prof_info->event_type;
+  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+  data_event_info.data_event.parent_construct = acc_construct_parallel;
+  data_event_info.data_event.implicit = 1;
+  data_event_info.data_event.tool_info = NULL;
+  data_event_info.data_event.var_name = NULL;
+  data_event_info.data_event.bytes = s;
+  data_event_info.data_event.host_ptr = NULL;
+  data_event_info.data_event.device_ptr = dp;
+
+  api_info->device_api = acc_device_api_cuda;
+
+  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
+}
+
 static void *
 nvptx_alloc (size_t s)
 {
   CUdeviceptr d;
 
   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_p)
+    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
+
   return (void *) d;
 }
 
-static bool
-nvptx_free (void *p)
+static void
+goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
 {
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+
+  prof_info->event_type = acc_ev_free;
+
+  data_event_info.data_event.event_type = prof_info->event_type;
+  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+  data_event_info.data_event.parent_construct = acc_construct_parallel;
+  data_event_info.data_event.implicit = 1;
+  data_event_info.data_event.tool_info = NULL;
+  data_event_info.data_event.var_name = NULL;
+  data_event_info.data_event.bytes = -1;
+  data_event_info.data_event.host_ptr = NULL;
+  data_event_info.data_event.device_ptr = p;
+
+  api_info->device_api = acc_device_api_cuda;
+
+  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
+}
+
+static bool
+nvptx_free (void *p, struct ptx_device *ptx_dev)
+{
+  /* Assume callback context if this is null.  */
+  if (GOMP_PLUGIN_acc_thread () == NULL)
+    {
+      struct ptx_free_block *n
+	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+      n->ptr = p;
+      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+      n->next = ptx_dev->free_blocks;
+      ptx_dev->free_blocks = n;
+      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+      return true;
+    }
+
   CUdeviceptr pb;
   size_t ps;
 
@@ -1475,308 +1046,15 @@
     }
 
   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
-  return true;
-}
-
-
-static bool
-nvptx_host2dev (void *d, const void *h, size_t s)
-{
-  CUdeviceptr pb;
-  size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  if (!s)
-    return true;
-  if (!d)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-
-  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
-  if (!pb)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-  if (!h)
-    {
-      GOMP_PLUGIN_error ("invalid host address");
-      return false;
-    }
-  if (d == h)
-    {
-      GOMP_PLUGIN_error ("invalid host or device address");
-      return false;
-    }
-  if ((void *)(d + s) > (void *)(pb + ps))
-    {
-      GOMP_PLUGIN_error ("invalid size");
-      return false;
-    }
-
-#ifndef DISABLE_ASYNC
-  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
-    {
-      CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-      event_gc (false);
-      CUDA_CALL (cuMemcpyHtoDAsync,
-		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
-      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
-      event_add (PTX_EVT_MEM, e, (void *)h, 0);
-    }
-  else
-#endif
-    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_p)
+    goacc_profiling_acc_ev_free (thr, p);
 
   return true;
 }
 
-static bool
-nvptx_dev2host (void *h, const void *d, size_t s)
-{
-  CUdeviceptr pb;
-  size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  if (!s)
-    return true;
-  if (!d)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-
-  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
-  if (!pb)
-    {
-      GOMP_PLUGIN_error ("invalid device address");
-      return false;
-    }
-  if (!h)
-    {
-      GOMP_PLUGIN_error ("invalid host address");
-      return false;
-    }
-  if (d == h)
-    {
-      GOMP_PLUGIN_error ("invalid host or device address");
-      return false;
-    }
-  if ((void *)(d + s) > (void *)(pb + ps))
-    {
-      GOMP_PLUGIN_error ("invalid size");
-      return false;
-    }
-
-#ifndef DISABLE_ASYNC
-  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
-    {
-      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-      event_gc (false);
-      CUDA_CALL (cuMemcpyDtoHAsync,
-		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
-      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
-      event_add (PTX_EVT_MEM, e, (void *)h, 0);
-    }
-  else
-#endif
-    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
-
-  return true;
-}
-
-static void
-nvptx_set_async (int async)
-{
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  nvthd->current_stream
-    = select_stream_for_async (async, pthread_self (), true, NULL);
-}
-
-static int
-nvptx_async_test (int async)
-{
-  CUresult r;
-  struct ptx_stream *s;
-
-  s = select_stream_for_async (async, pthread_self (), false, NULL);
-  if (!s)
-    return 1;
-
-  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
-  if (r == CUDA_SUCCESS)
-    {
-      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
-	 whether all work has completed on this stream, and if so omits the call
-	 to the wait hook.  If that happens, event_gc might not get called
-	 (which prevents variables from getting unmapped and their associated
-	 device storage freed), so call it here.  */
-      event_gc (true);
-      return 1;
-    }
-  else if (r == CUDA_ERROR_NOT_READY)
-    return 0;
-
-  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
-  return 0;
-}
-
-static int
-nvptx_async_test_all (void)
-{
-  struct ptx_stream *s;
-  pthread_t self = pthread_self ();
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
-    {
-      if ((s->multithreaded || pthread_equal (s->host_thread, self))
-	  && CUDA_CALL_NOCHECK (cuStreamQuery,
-				s->stream) == CUDA_ERROR_NOT_READY)
-	{
-	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-	  return 0;
-	}
-    }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
-  event_gc (true);
-
-  return 1;
-}
-
-static void
-nvptx_wait (int async)
-{
-  struct ptx_stream *s;
-
-  s = select_stream_for_async (async, pthread_self (), false, NULL);
-  if (!s)
-    return;
-
-  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
-
-  event_gc (true);
-}
-
-static void
-nvptx_wait_async (int async1, int async2)
-{
-  CUevent *e;
-  struct ptx_stream *s1, *s2;
-  pthread_t self = pthread_self ();
-
-  s1 = select_stream_for_async (async1, self, false, NULL);
-  if (!s1)
-    return;
-
-  /* The stream that is waiting (rather than being waited for) doesn't
-     necessarily have to exist already.  */
-  s2 = select_stream_for_async (async2, self, true, NULL);
-
-  /* A stream is always synchronized with itself.  */
-  if (s1 == s2)
-    return;
-
-  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
-  event_gc (true);
-
-  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
-
-  event_add (PTX_EVT_SYNC, e, NULL, 0);
-
-  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
-}
-
-static void
-nvptx_wait_all (void)
-{
-  CUresult r;
-  struct ptx_stream *s;
-  pthread_t self = pthread_self ();
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  /* Wait for active streams initiated by this thread (or by multiple threads)
-     to complete.  */
-  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
-    {
-      if (s->multithreaded || pthread_equal (s->host_thread, self))
-	{
-	  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
-	  if (r == CUDA_SUCCESS)
-	    continue;
-	  else if (r != CUDA_ERROR_NOT_READY)
-	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
-	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
-	}
-    }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
-  event_gc (true);
-}
-
-static void
-nvptx_wait_all_async (int async)
-{
-  struct ptx_stream *waiting_stream, *other_stream;
-  CUevent *e;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  pthread_t self = pthread_self ();
-
-  /* The stream doing the waiting.  This could be the first mention of the
-     stream, so create it if necessary.  */
-  waiting_stream
-    = select_stream_for_async (async, pthread_self (), true, NULL);
-
-  /* Launches on the null stream already block on other streams in the
-     context.  */
-  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
-    return;
-
-  event_gc (true);
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  for (other_stream = nvthd->ptx_dev->active_streams;
-       other_stream != NULL;
-       other_stream = other_stream->next)
-    {
-      if (!other_stream->multithreaded
-	  && !pthread_equal (other_stream->host_thread, self))
-	continue;
-
-      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-      CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
-      /* Record an event on the waited-for stream.  */
-      CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
-
-      event_add (PTX_EVT_SYNC, e, NULL, 0);
-
-      CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
-   }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-}
-
 static void *
 nvptx_get_current_cuda_device (void)
 {
@@ -1799,75 +1077,6 @@
   return nvthd->ptx_dev->ctx;
 }
 
-static void *
-nvptx_get_cuda_stream (int async)
-{
-  struct ptx_stream *s;
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  if (!nvthd || !nvthd->ptx_dev)
-    return NULL;
-
-  s = select_stream_for_async (async, pthread_self (), false, NULL);
-
-  return s ? s->stream : NULL;
-}
-
-static int
-nvptx_set_cuda_stream (int async, void *stream)
-{
-  struct ptx_stream *oldstream;
-  pthread_t self = pthread_self ();
-  struct nvptx_thread *nvthd = nvptx_thread ();
-
-  /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
-     to change the stream handle associated with "acc_async_sync".  */
-  if (async == acc_async_sync)
-    {
-      GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
-			 " with \"acc_async_sync\"\n");
-      return 0;
-    }
-
-  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
-  /* We have a list of active streams and an array mapping async values to
-     entries of that list.  We need to take "ownership" of the passed-in stream,
-     and add it to our list, removing the previous entry also (if there was one)
-     in order to prevent resource leaks.  Note the potential for surprise
-     here: maybe we should keep track of passed-in streams and leave it up to
-     the user to tidy those up, but that doesn't work for stream handles
-     returned from acc_get_cuda_stream above...  */
-
-  oldstream = select_stream_for_async (async, self, false, NULL);
-
-  if (oldstream)
-    {
-      if (nvthd->ptx_dev->active_streams == oldstream)
-	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
-      else
-	{
-	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
-	  while (s->next != oldstream)
-	    s = s->next;
-	  s->next = s->next->next;
-	}
-
-      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
-
-      if (!map_fini (oldstream))
-	GOMP_PLUGIN_fatal ("error when freeing host memory");
-
-      free (oldstream);
-    }
-
-  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
-  (void) select_stream_for_async (async, self, true, (CUstream) stream);
-
-  return 1;
-}
-
 /* Plugin entry points.  */
 
 const char *
@@ -1894,6 +1103,93 @@
   return nvptx_get_num_devices ();
 }
 
+union gomp_device_property_value
+GOMP_OFFLOAD_get_property (int n, int prop)
+{
+  union gomp_device_property_value propval = { .val = 0 };
+
+  pthread_mutex_lock (&ptx_dev_lock);
+
+  if (!nvptx_init () || n >= nvptx_get_num_devices ())
+    {
+      pthread_mutex_unlock (&ptx_dev_lock);
+      return propval;
+    }
+
+  switch (prop)
+    {
+    case GOMP_DEVICE_PROPERTY_MEMORY:
+      {
+	size_t total_mem;
+	CUdevice dev;
+
+	CUDA_CALL_ERET (propval, cuDeviceGet, &dev, n);
+	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, dev);
+	propval.val = total_mem;
+      }
+      break;
+    case GOMP_DEVICE_PROPERTY_FREE_MEMORY:
+      {
+	size_t total_mem;
+	size_t free_mem;
+	CUdevice ctxdev;
+	CUdevice dev;
+
+	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
+	CUDA_CALL_ERET (propval, cuDeviceGet, &dev, n);
+	if (dev == ctxdev)
+	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
+	else if (ptx_devices[n])
+	  {
+	    CUcontext old_ctx;
+
+	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_devices[n]->ctx);
+	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
+	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
+	  }
+	else
+	  {
+	    CUcontext new_ctx;
+
+	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
+			    dev);
+	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
+	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
+	  }
+	propval.val = free_mem;
+      }
+      break;
+    case GOMP_DEVICE_PROPERTY_NAME:
+      {
+	static char name[256];
+	CUdevice dev;
+
+	CUDA_CALL_ERET (propval, cuDeviceGet, &dev, n);
+	CUDA_CALL_ERET (propval, cuDeviceGetName, name, sizeof (name), dev);
+	propval.ptr = name;
+      }
+      break;
+    case GOMP_DEVICE_PROPERTY_VENDOR:
+      propval.ptr = "Nvidia";
+      break;
+    case GOMP_DEVICE_PROPERTY_DRIVER:
+      {
+	static char ver[11];
+	int v;
+
+	CUDA_CALL_ERET (propval, cuDriverGetVersion, &v);
+	snprintf (ver, sizeof (ver) - 1, "%u.%u", v / 1000, v % 1000 / 10);
+	propval.ptr = ver;
+      }
+      break;
+    default:
+      break;
+    }
+
+  pthread_mutex_unlock (&ptx_dev_lock);
+  return propval;
+}
+
 bool
 GOMP_OFFLOAD_init_device (int n)
 {
@@ -2107,6 +1403,23 @@
 {
   if (!nvptx_attach_host_thread_to_device (ord))
     return NULL;
+
+  struct ptx_device *ptx_dev = ptx_devices[ord];
+  struct ptx_free_block *blocks, *tmp;
+
+  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+  blocks = ptx_dev->free_blocks;
+  ptx_dev->free_blocks = NULL;
+  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+
+  while (blocks)
+    {
+      tmp = blocks->next;
+      nvptx_free (blocks->ptr, ptx_dev);
+      free (blocks);
+      blocks = tmp;
+    }
+
   return nvptx_alloc (size);
 }
 
@@ -2114,93 +1427,245 @@
 GOMP_OFFLOAD_free (int ord, void *ptr)
 {
   return (nvptx_attach_host_thread_to_device (ord)
-	  && nvptx_free (ptr));
+	  && nvptx_free (ptr, ptx_devices[ord]));
 }
 
-bool
-GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
+static void
+openacc_exec_internal (void (*fn) (void *), int params, size_t mapnum,
+		       void **hostaddrs, void **devaddrs,
+		       unsigned *dims, void *targ_mem_desc)
 {
-  return (nvptx_attach_host_thread_to_device (ord)
-	  && nvptx_dev2host (dst, src, n));
+  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_p = __builtin_expect (prof_info != NULL, false);
+
+  void **hp = alloca (mapnum * sizeof (void *));
+  CUdeviceptr dp = 0;
+
+  if (mapnum > 0)
+    {
+      if (params)
+	{
+	  for (int i = 0; i < mapnum; i++)
+	    hp[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]);
+	}
+      else
+	{
+	  size_t s = mapnum * sizeof (void *);
+	  for (int i = 0; i < mapnum; i++)
+	    hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+	  CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
+	  if (profiling_p)
+	    goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
+	}
+    }
+
+  /* Copy the (device) pointers to arguments to the device (dp and hp might in
+     fact have the same value on a unified-memory system).  */
+  if (!params && mapnum > 0)
+    {
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_start;
+
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  data_event_info.data_event.valid_bytes
+	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+	  data_event_info.data_event.parent_construct
+	    = acc_construct_parallel;
+	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
+	  data_event_info.data_event.tool_info = NULL;
+	  data_event_info.data_event.var_name = NULL;
+	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
+	  data_event_info.data_event.host_ptr = hp;
+	  data_event_info.data_event.device_ptr = (const void *) dp;
+
+	  api_info->device_api = acc_device_api_cuda;
+
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
+      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
+			mapnum * sizeof (void *));
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_end;
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
+    }
+
+  if (params)
+    nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+		hp, NULL);
+  else
+    {
+      void *kargs[1] = { &dp };
+      nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+		  kargs, NULL);
+    }
+
+  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
+  const char *maybe_abort_msg = "(perhaps abort was called)";
+  if (r == CUDA_ERROR_LAUNCH_FAILED)
+    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
+		       maybe_abort_msg);
+  else if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+
+  if (!params)
+    {
+      CUDA_CALL_ASSERT (cuMemFree, dp);
+      if (profiling_p)
+	goacc_profiling_acc_ev_free (thr, (void *) dp);
+    }
 }
 
-bool
-GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
+void
+GOMP_OFFLOAD_openacc_exec_params (void (*fn) (void *), size_t mapnum,
+			   void **hostaddrs, void **devaddrs,
+			   unsigned *dims, void *targ_mem_desc)
 {
-  return (nvptx_attach_host_thread_to_device (ord)
-	  && nvptx_host2dev (dst, src, n));
+  openacc_exec_internal (fn, 1, mapnum, hostaddrs, devaddrs, dims,
+			 targ_mem_desc);
 }
 
-bool
-GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
-{
-  struct ptx_device *ptx_dev = ptx_devices[ord];
-  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
-				ptx_dev->null_stream->stream);
-  return true;
-}
-
-void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
-
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
 			   void **hostaddrs, void **devaddrs,
-			   int async, unsigned *dims, void *targ_mem_desc)
+			   unsigned *dims, void *targ_mem_desc)
 {
-  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
+  openacc_exec_internal (fn, 0, mapnum, hostaddrs, devaddrs, dims,
+			 targ_mem_desc);
+}
+
+static void
+cuda_free_argmem (void *ptr)
+{
+  void **block = (void **) ptr;
+  nvptx_free (block[0], (struct ptx_device *) block[1]);
+  free (block);
+}
+
+static void
+openacc_async_exec_internal (void (*fn) (void *), int params, size_t mapnum,
+			     void **hostaddrs, void **devaddrs,
+			     unsigned *dims, void *targ_mem_desc,
+			     struct goacc_asyncqueue *aq)
+{
+  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_p = __builtin_expect (prof_info != NULL, false);
+
+  void **hp = NULL;
+  CUdeviceptr dp = 0;
+  void **block = NULL;
+
+  if (mapnum > 0)
+    {
+      if (params)
+	{
+	  hp = alloca (sizeof (void *) * mapnum);
+	  for (int i = 0; i < mapnum; i++)
+	    hp[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]);
+	}
+      else
+	{
+	  size_t s = mapnum * sizeof (void *);
+	  block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
+	  hp = block + 2;
+	  for (int i = 0; i < mapnum; i++)
+	    hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+	  CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
+	  if (profiling_p)
+	    goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
+	}
+    }
+
+  /* Copy the (device) pointers to arguments to the device (dp and hp might in
+     fact have the same value on a unified-memory system).  */
+  if (!params && mapnum > 0)
+    {
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_start;
+
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  data_event_info.data_event.valid_bytes
+	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+	  data_event_info.data_event.parent_construct
+	    = acc_construct_parallel;
+	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
+	  data_event_info.data_event.tool_info = NULL;
+	  data_event_info.data_event.var_name = NULL;
+	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
+	  data_event_info.data_event.host_ptr = hp;
+	  data_event_info.data_event.device_ptr = (const void *) dp;
+
+	  api_info->device_api = acc_device_api_cuda;
+
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
+
+      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
+			mapnum * sizeof (void *), aq->cuda_stream);
+      block[0] = (void *) dp;
+
+      struct nvptx_thread *nvthd =
+	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+      block[1] = (void *) nvthd->ptx_dev;
+
+      if (profiling_p)
+	{
+	  prof_info->event_type = acc_ev_enqueue_upload_end;
+	  data_event_info.data_event.event_type = prof_info->event_type;
+	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+						api_info);
+	}
+    }
+
+  if (params)
+    nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+		hp, aq->cuda_stream);
+  else
+    {
+      void *kargs[1] = { &dp };
+      nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+		  kargs, aq->cuda_stream);
+    }
+
+  if (!params && mapnum > 0)
+    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
 }
 
 void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
+GOMP_OFFLOAD_openacc_async_exec_params (void (*fn) (void *), size_t mapnum,
+				 void **hostaddrs, void **devaddrs,
+				 unsigned *dims, void *targ_mem_desc,
+				 struct goacc_asyncqueue *aq)
 {
-  struct nvptx_thread *nvthd = nvptx_thread ();
-  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
-  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
-  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
-}
-
-int
-GOMP_OFFLOAD_openacc_async_test (int async)
-{
-  return nvptx_async_test (async);
-}
-
-int
-GOMP_OFFLOAD_openacc_async_test_all (void)
-{
-  return nvptx_async_test_all ();
+  openacc_async_exec_internal (fn, 1, mapnum, hostaddrs, devaddrs, dims,
+			       targ_mem_desc, aq);
 }
 
 void
-GOMP_OFFLOAD_openacc_async_wait (int async)
+GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
+				 void **hostaddrs, void **devaddrs,
+				 unsigned *dims, void *targ_mem_desc,
+				 struct goacc_asyncqueue *aq)
 {
-  nvptx_wait (async);
-}
-
-void
-GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
-{
-  nvptx_wait_async (async1, async2);
-}
-
-void
-GOMP_OFFLOAD_openacc_async_wait_all (void)
-{
-  nvptx_wait_all ();
-}
-
-void
-GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
-{
-  nvptx_wait_all_async (async);
-}
-
-void
-GOMP_OFFLOAD_openacc_async_set_async (int async)
-{
-  nvptx_set_async (async);
+  openacc_async_exec_internal (fn, 0, mapnum, hostaddrs, devaddrs, dims,
+			       targ_mem_desc, aq);
 }
 
 void *
@@ -2222,7 +1687,6 @@
   if (!thd_ctx)
     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
 
-  nvthd->current_stream = ptx_dev->null_stream;
   nvthd->ptx_dev = ptx_dev;
 
   return (void *) nvthd;
@@ -2246,20 +1710,195 @@
   return nvptx_get_current_cuda_context ();
 }
 
-/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
-
+/* This returns a CUstream.  */
 void *
-GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
+GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
 {
-  return nvptx_get_cuda_stream (async);
+  return (void *) aq->cuda_stream;
 }
 
-/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
+/* This takes a CUstream.  */
+int
+GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
+{
+  if (aq->cuda_stream)
+    {
+      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
+      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
+    }
+
+  aq->cuda_stream = (CUstream) stream;
+  return 1;
+}
+
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+  CUstream stream = NULL;
+  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+
+  struct goacc_asyncqueue *aq
+    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
+  aq->cuda_stream = stream;
+  return aq;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
+  free (aq);
+  return true;
+}
 
 int
-GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
 {
-  return nvptx_set_cuda_stream (async, stream);
+  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
+  if (r == CUDA_SUCCESS)
+    return 1;
+  if (r == CUDA_ERROR_NOT_READY)
+    return 0;
+
+  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
+  return -1;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+				      struct goacc_asyncqueue *aq2)
+{
+  CUevent e;
+  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
+  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
+  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
+  return true;
+}
+
+static void
+cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
+{
+  if (res != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
+  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
+  cb->fn (cb->ptr);
+  free (ptr);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+					   void (*callback_fn)(void *),
+					   void *userptr)
+{
+  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
+  b->fn = callback_fn;
+  b->ptr = userptr;
+  b->aq = aq;
+  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
+		    cuda_callback_wrapper, (void *) b, 0);
+}
+
+static bool
+cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
+{
+  CUdeviceptr pb;
+  size_t ps;
+  if (!s)
+    return true;
+  if (!d)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
+  if (!pb)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  if (!h)
+    {
+      GOMP_PLUGIN_error ("invalid host address");
+      return false;
+    }
+  if (d == h)
+    {
+      GOMP_PLUGIN_error ("invalid host or device address");
+      return false;
+    }
+  if ((void *)(d + s) > (void *)(pb + ps))
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (src, dst, n))
+    return false;
+  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
+{
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
+  return true;
+}
+
+/* FIXME: It is unknown whether the cuMemcpyHtoDAsync API call caches source
+   data before the asynchronous copy takes place.  Either way there is a data
+   race associated with ignoring the EPHEMERAL parameter here -- either if it
+   is TRUE (because we are copying uncached data that may disappear before the
+   async copy takes place) or if it is FALSE (because the source data may be
+   cached/snapshotted here before it is modified by an earlier async operation,
+   so stale data gets copied to the target).
+   Neither problem has been observed in practice, so far.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
+				     size_t n,
+				     bool ephemeral __attribute__((unused)),
+				     struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (src, dst, n))
+    return false;
+  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
+  return true;
 }
 
 /* Adjust launch dimensions: pick good values for number of blocks and warps
@@ -2360,8 +1999,7 @@
     CU_LAUNCH_PARAM_END
   };
   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
-			 32, threads, 1, 0, ptx_dev->null_stream->stream,
-			 NULL, config);
+			 32, threads, 1, 0, NULL, NULL, config);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 97fc1ee..9a5b259 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -39,12 +39,17 @@
 #include <string.h>
 #include <assert.h>
 #include <errno.h>
+#if defined(RC_CHECKING)
+#include <stdio.h>
+#endif
 
 #ifdef PLUGIN_SUPPORT
 #include <dlfcn.h>
 #include "plugin-suffix.h"
 #endif
 
+#define FIELD_TGT_EMPTY (~(size_t) 0)
+
 static void gomp_target_init (void);
 
 /* The whole initialization code for offloading plugins is only run one.  */
@@ -106,13 +111,18 @@
 }
 
 static struct gomp_device_descr *
-resolve_device (int device_id)
+resolve_device (int device)
 {
-  if (device_id == GOMP_DEVICE_ICV)
+  gomp_debug (0, "%s (%d)\n", __FUNCTION__, device);
+
+  int device_id;
+  if (device == GOMP_DEVICE_ICV)
     {
       struct gomp_task_icv *icv = gomp_icv (false);
       device_id = icv->default_device_var;
     }
+  else
+    device_id = device;
 
   if (device_id < 0 || device_id >= gomp_get_num_devices ())
     return NULL;
@@ -127,6 +137,13 @@
     }
   gomp_mutex_unlock (&devices[device_id].lock);
 
+  /* If the device-var ICV does not actually have offload data available, don't
+     try use it (which will fail), and use host fallback instead.  */
+  if (device == GOMP_DEVICE_ICV
+      && !gomp_offload_target_available_p (devices[device_id].type))
+    return NULL;
+
+  gomp_debug (0, "  %s (%d): %d\n", __FUNCTION__, device, device_id);
   return &devices[device_id];
 }
 
@@ -177,8 +194,8 @@
     }
 }
 
-/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
-   host to device memory transfers.  */
+/* Infrastructure for coalescing adjacent or nearly adjacent (in device
+   addresses) host to device memory transfers.  */
 
 struct gomp_coalesce_chunk
 {
@@ -269,10 +286,18 @@
     }
 }
 
-static void
+/* Copy host memory to an offload device.  In asynchronous mode (if AQ is
+   non-NULL), when the source data is stack or may otherwise be deallocated
+   before the asynchronous copy takes place, EPHEMERAL must be passed as
+   TRUE.  The CBUF isn't used for non-ephemeral asynchronous copies, because
+   the host data might not be computed yet (by an earlier asynchronous compute
+   region).  */
+
+attribute_hidden void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
+		    struct goacc_asyncqueue *aq,
 		    void *d, const void *h, size_t sz,
-		    struct gomp_coalesce_buf *cbuf)
+		    bool ephemeral, struct gomp_coalesce_buf *cbuf)
 {
   if (cbuf)
     {
@@ -299,14 +324,37 @@
 	    }
 	}
     }
-  gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
+  if (__builtin_expect (aq != NULL, 0))
+    {
+      if (!devicep->openacc.async.host2dev_func (devicep->target_id, d, h, sz,
+						 ephemeral, aq))
+	{
+	  gomp_mutex_unlock (&devicep->lock);
+	  gomp_fatal ("Copying of host object [%p..%p) to dev object [%p..%p) "
+		      "failed", h, h + sz, d, d + sz);
+	}
+    }
+  else
+    gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
-static void
+attribute_hidden void
 gomp_copy_dev2host (struct gomp_device_descr *devicep,
+		    struct goacc_asyncqueue *aq,
 		    void *h, const void *d, size_t sz)
 {
-  gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
+  if (__builtin_expect (aq != NULL, 0))
+    {
+      if (!devicep->openacc.async.dev2host_func (devicep->target_id, h, d, sz,
+						 aq))
+	{
+	  gomp_mutex_unlock (&devicep->lock);
+	  gomp_fatal ("Copying of dev object [%p..%p) to host object [%p..%p) "
+		      "failed", d, d + sz, h, h + sz);
+	}
+    }
+  else
+    gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
 }
 
 static void
@@ -319,18 +367,199 @@
     }
 }
 
+#ifdef RC_CHECKING
+void
+dump_tgt (const char *where, struct target_mem_desc *tgt)
+{
+  if (!getenv ("GOMP_DEBUG_TGT"))
+    return;
+
+  fprintf (stderr, "%s: %s: tgt=%p\n", __FUNCTION__, where, (void*) tgt);
+  fprintf (stderr, "refcount=%d\n", (int) tgt->refcount);
+  fprintf (stderr, "tgt_start=%p\n", (void*) tgt->tgt_start);
+  fprintf (stderr, "tgt_end=%p\n", (void*) tgt->tgt_end);
+  fprintf (stderr, "to_free=%p\n", tgt->to_free);
+  fprintf (stderr, "list_count=%d\n", (int) tgt->list_count);
+  for (int i = 0; i < tgt->list_count; i++)
+    {
+      fprintf (stderr, "list item %d:\n", i);
+      fprintf (stderr, "  key: %p\n", (void*) tgt->list[i].key);
+      if (tgt->list[i].key)
+	{
+	  fprintf (stderr, "  key.host_start=%p\n",
+		   (void*) tgt->list[i].key->host_start);
+	  fprintf (stderr, "  key.host_end=%p\n",
+		   (void*) tgt->list[i].key->host_end);
+	  fprintf (stderr, "  key.tgt=%p\n", (void*) tgt->list[i].key->tgt);
+	  fprintf (stderr, "  key.offset=%d\n",
+		   (int) tgt->list[i].key->tgt_offset);
+	  fprintf (stderr, "  key.refcount=%d\n",
+		   (int) tgt->list[i].key->refcount);
+	  if (tgt->list[i].key->virtual_refcount == VREFCOUNT_LINK_KEY)
+	    fprintf (stderr, "  key.u.link_key=%p\n",
+		     (void*) tgt->list[i].key->u.link_key);
+	  else
+	    {
+	      fprintf (stderr, "  key.virtual_refcount=%d\n",
+		       (int) tgt->list[i].key->virtual_refcount);
+	      fprintf (stderr, "  key.u.attach_count=%p\n",
+		       (void*) tgt->list[i].key->u.attach_count);
+	    }
+	}
+    }
+  fprintf (stderr, "\n");
+}
+
+static void
+rc_check_clear (splay_tree_node node)
+{
+  splay_tree_key k = &node->key;
+
+  k->refcount_chk = 0;
+  k->tgt->refcount_chk = 0;
+  k->tgt->mark = false;
+
+  if (node->left)
+    rc_check_clear (node->left);
+  if (node->right)
+    rc_check_clear (node->right);
+}
+
+static void
+rc_check_count (splay_tree_node node)
+{
+  splay_tree_key k = &node->key;
+  struct target_mem_desc *t;
+
+  /* Add virtual reference counts ("acc enter data", etc.) for this key.  */
+  k->refcount_chk += k->virtual_refcount;
+
+  t = k->tgt;
+  t->refcount_chk++;
+
+  if (!t->mark)
+    {
+      for (int i = 0; i < t->list_count; i++)
+	if (t->list[i].key)
+	  t->list[i].key->refcount_chk++;
+
+      t->mark = true;
+    }
+
+  if (node->left)
+    rc_check_count (node->left);
+  if (node->right)
+    rc_check_count (node->right);
+}
+
+static bool
+rc_check_verify (splay_tree_node node, bool noisy, bool errors)
+{
+  splay_tree_key k = &node->key;
+  struct target_mem_desc *t;
+
+  if (k->refcount != REFCOUNT_INFINITY)
+    {
+      if (noisy)
+	fprintf (stderr, "key %p (%p..+%d): rc=%d/%d, virt_rc=%d\n", k,
+		 (void *) k->host_start, (int) (k->host_end - k->host_start),
+		 (int) k->refcount, (int) k->refcount_chk,
+		 (int) k->virtual_refcount);
+
+      if (k->refcount != k->refcount_chk)
+	{
+	  if (noisy)
+	    fprintf (stderr, "  -- key refcount mismatch!\n");
+	  errors = true;
+	}
+
+      t = k->tgt;
+
+      if (noisy)
+	fprintf (stderr, "tgt %p: rc=%d/%d\n", t, (int) t->refcount,
+		 (int) t->refcount_chk);
+
+      if (t->refcount != t->refcount_chk)
+	{
+	  if (noisy)
+	    fprintf (stderr,
+		     "  -- target memory descriptor refcount mismatch!\n");
+	  errors = true;
+	}
+    }
+
+  if (node->left)
+    errors |= rc_check_verify (node->left, noisy, errors);
+  if (node->right)
+    errors |= rc_check_verify (node->right, noisy, errors);
+
+  return errors;
+}
+
+/* Call with device locked.  */
+
+attribute_hidden void
+gomp_rc_check (struct gomp_device_descr *devicep, struct target_mem_desc *tgt)
+{
+  splay_tree sp = &devicep->mem_map;
+
+  bool noisy = getenv ("GOMP_DEBUG_TGT") != 0;
+
+  if (noisy)
+    fprintf (stderr, "\n*** GOMP_RC_CHECK ***\n\n");
+
+  if (sp->root)
+    {
+      rc_check_clear (sp->root);
+
+      for (struct target_mem_desc *t = tgt; t; t = t->prev)
+	{
+	  t->refcount_chk = 0;
+	  t->mark = false;
+	}
+
+      /* Add references for interconnected splay-tree keys.  */
+      rc_check_count (sp->root);
+
+      /* Add references for the tgt for a currently-executing kernel and/or
+	 any enclosing data directives.  */
+      for (struct target_mem_desc *t = tgt; t; t = t->prev)
+	{
+	  t->refcount_chk++;
+
+	  if (!t->mark)
+	    {
+	      for (int i = 0; i < t->list_count; i++)
+		if (t->list[i].key)
+		  t->list[i].key->refcount_chk++;
+
+	      t->mark = true;
+	    }
+	}
+
+      if (rc_check_verify (sp->root, noisy, false))
+	{
+	  gomp_mutex_unlock (&devicep->lock);
+	  gomp_fatal ("refcount checking failure");
+	}
+    }
+}
+#endif
+
 /* Handle the case where gomp_map_lookup, splay_tree_lookup or
    gomp_map_0len_lookup found oldn for newn.
    Helper function of gomp_map_vars.  */
 
 static inline void
-gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
+gomp_map_vars_existing (struct gomp_device_descr *devicep,
+			struct goacc_asyncqueue *aq, splay_tree_key oldn,
 			splay_tree_key newn, struct target_var_desc *tgt_var,
 			unsigned char kind, struct gomp_coalesce_buf *cbuf)
 {
   tgt_var->key = oldn;
   tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
   tgt_var->always_copy_from = GOMP_MAP_ALWAYS_FROM_P (kind);
+  tgt_var->do_detach = kind == GOMP_MAP_ATTACH;
   tgt_var->offset = newn->host_start - oldn->host_start;
   tgt_var->length = newn->host_end - newn->host_start;
 
@@ -346,11 +575,11 @@
     }
 
   if (GOMP_MAP_ALWAYS_TO_P (kind))
-    gomp_copy_host2dev (devicep,
+    gomp_copy_host2dev (devicep, aq,
 			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
 				  + newn->host_start - oldn->host_start),
 			(void *) newn->host_start,
-			newn->host_end - newn->host_start, cbuf);
+			newn->host_end - newn->host_start, false, cbuf);
 
   if (oldn->refcount != REFCOUNT_INFINITY)
     oldn->refcount++;
@@ -364,8 +593,8 @@
 }
 
 static void
-gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
-		  uintptr_t target_offset, uintptr_t bias,
+gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
+		  uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias,
 		  struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
@@ -376,10 +605,10 @@
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      gomp_copy_host2dev (devicep,
+      gomp_copy_host2dev (devicep, aq,
 			  (void *) (tgt->tgt_start + target_offset),
 			  (void *) &cur_node.tgt_offset,
-			  sizeof (void *), cbuf);
+			  sizeof (void *), true, cbuf);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -398,12 +627,14 @@
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
-		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
+  gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
+		      (void *) &cur_node.tgt_offset, sizeof (void *), true,
+		      cbuf);
 }
 
 static void
-gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
+gomp_map_fields_existing (struct target_mem_desc *tgt,
+			  struct goacc_asyncqueue *aq, splay_tree_key n,
 			  size_t first, size_t i, void **hostaddrs,
 			  size_t *sizes, void *kinds,
 			  struct gomp_coalesce_buf *cbuf)
@@ -423,7 +654,7 @@
       && n2->tgt == n->tgt
       && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
     {
-      gomp_map_vars_existing (devicep, n2, &cur_node,
+      gomp_map_vars_existing (devicep, aq, n2, &cur_node,
 			      &tgt->list[i], kind & typemask, cbuf);
       return;
     }
@@ -439,8 +670,8 @@
 	      && n2->host_start - n->host_start
 		 == n2->tgt_offset - n->tgt_offset)
 	    {
-	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				      kind & typemask, cbuf);
+	      gomp_map_vars_existing (devicep, aq, n2, &cur_node,
+				      &tgt->list[i], kind & typemask, cbuf);
 	      return;
 	    }
 	}
@@ -451,7 +682,7 @@
 	  && n2->tgt == n->tgt
 	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
 	{
-	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
+	  gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
 				  kind & typemask, cbuf);
 	  return;
 	}
@@ -463,30 +694,314 @@
 	      (void *) cur_node.host_end);
 }
 
-static inline uintptr_t
+void
+gomp_attach_pointer (struct gomp_device_descr *devicep,
+		     struct goacc_asyncqueue *aq, splay_tree mem_map,
+		     splay_tree_key n, uintptr_t attach_to, size_t bias,
+		     struct gomp_coalesce_buf *cbufp)
+{
+  struct splay_tree_key_s s;
+  size_t size, idx;
+
+  if (n == NULL)
+    {
+      gomp_mutex_unlock (&devicep->lock);
+      gomp_fatal ("enclosing struct not mapped for attach");
+    }
+
+  size = (n->host_end - n->host_start + sizeof (void *) - 1) / sizeof (void *);
+  /* We might have a pointer in a packed struct: however we cannot have more
+     than one such pointer in each pointer-sized portion of the struct, so
+     this is safe.  */
+  idx = (attach_to - n->host_start) / sizeof (void *);
+
+  assert (n->virtual_refcount != VREFCOUNT_LINK_KEY);
+
+  if (!n->u.attach_count)
+    n->u.attach_count
+      = gomp_malloc_cleared (sizeof (*n->u.attach_count) * size);
+
+  if (n->u.attach_count[idx] < UINTPTR_MAX)
+    n->u.attach_count[idx]++;
+  else
+    {
+      gomp_mutex_unlock (&devicep->lock);
+      gomp_fatal ("attach count overflow");
+    }
+
+  if (n->u.attach_count[idx] == 1)
+    {
+      uintptr_t devptr = n->tgt->tgt_start + n->tgt_offset + attach_to
+			 - n->host_start;
+      uintptr_t target = (uintptr_t) *(void **) attach_to;
+      splay_tree_key tn;
+      uintptr_t data;
+
+      if ((void *) target == NULL)
+	{
+	  gomp_mutex_unlock (&devicep->lock);
+	  gomp_fatal ("attempt to attach null pointer");
+	}
+
+      s.host_start = target + bias;
+      s.host_end = s.host_start + 1;
+      tn = splay_tree_lookup (mem_map, &s);
+
+      if (!tn)
+	{
+	  gomp_mutex_unlock (&devicep->lock);
+	  gomp_fatal ("pointer target not mapped for attach");
+	}
+
+      data = tn->tgt->tgt_start + tn->tgt_offset + target - tn->host_start;
+
+      gomp_debug (1,
+		  "%s: attaching host %p, target %p (struct base %p) to %p\n",
+		  __FUNCTION__, (void *) attach_to, (void *) devptr,
+		  (void *) (n->tgt->tgt_start + n->tgt_offset), (void *) data);
+
+      gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &data,
+			  sizeof (void *), true, cbufp);
+    }
+  else
+    gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
+		(void *) attach_to, (int) n->u.attach_count[idx]);
+}
+
+void
+gomp_detach_pointer (struct gomp_device_descr *devicep,
+		     struct goacc_asyncqueue *aq, splay_tree_key n,
+		     uintptr_t detach_from, bool finalize,
+		     struct gomp_coalesce_buf *cbufp)
+{
+  size_t idx;
+
+  if (n == NULL)
+    {
+      gomp_mutex_unlock (&devicep->lock);
+      gomp_fatal ("enclosing struct not mapped for detach");
+    }
+
+  idx = (detach_from - n->host_start) / sizeof (void *);
+
+  assert (n->virtual_refcount != VREFCOUNT_LINK_KEY);
+
+  if (!n->u.attach_count)
+    {
+      gomp_mutex_unlock (&devicep->lock);
+      gomp_fatal ("no attachment counters for struct");
+    }
+
+  if (finalize)
+    n->u.attach_count[idx] = 1;
+
+  if (n->u.attach_count[idx] == 0)
+    {
+      gomp_mutex_unlock (&devicep->lock);
+      gomp_fatal ("attach count underflow");
+    }
+  else
+    n->u.attach_count[idx]--;
+
+  if (n->u.attach_count[idx] == 0)
+    {
+      uintptr_t devptr = n->tgt->tgt_start + n->tgt_offset + detach_from
+			 - n->host_start;
+      uintptr_t target = (uintptr_t) *(void **) detach_from;
+
+      gomp_debug (1,
+		  "%s: detaching host %p, target %p (struct base %p) to %p\n",
+		  __FUNCTION__, (void *) detach_from, (void *) devptr,
+		  (void *) (n->tgt->tgt_start + n->tgt_offset),
+		  (void *) target);
+
+      gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &target,
+			  sizeof (void *), true, cbufp);
+    }
+  else
+    gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
+		(void *) detach_from, (int) n->u.attach_count[idx]);
+}
+
+uintptr_t
 gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i)
 {
   if (tgt->list[i].key != NULL)
     return tgt->list[i].key->tgt->tgt_start
 	   + tgt->list[i].key->tgt_offset
 	   + tgt->list[i].offset;
-  if (tgt->list[i].offset == ~(uintptr_t) 0)
-    return (uintptr_t) hostaddrs[i];
-  if (tgt->list[i].offset == ~(uintptr_t) 1)
-    return 0;
-  if (tgt->list[i].offset == ~(uintptr_t) 2)
-    return tgt->list[i + 1].key->tgt->tgt_start
-	   + tgt->list[i + 1].key->tgt_offset
-	   + tgt->list[i + 1].offset
-	   + (uintptr_t) hostaddrs[i]
-	   - (uintptr_t) hostaddrs[i + 1];
-  return tgt->tgt_start + tgt->list[i].offset;
+
+  switch (tgt->list[i].offset)
+    {
+    case OFFSET_INLINED:
+      return (uintptr_t) hostaddrs[i];
+
+    case OFFSET_POINTER:
+      return 0;
+
+    case OFFSET_STRUCT:
+      return tgt->list[i + 1].key->tgt->tgt_start
+	     + tgt->list[i + 1].key->tgt_offset
+	     + tgt->list[i + 1].offset
+	     + (uintptr_t) hostaddrs[i]
+	     - (uintptr_t) hostaddrs[i + 1];
+
+    default:
+      return tgt->tgt_start + tgt->list[i].offset;
+    }
 }
 
-attribute_hidden struct target_mem_desc *
-gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
-	       void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
-	       bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+/* Definitions for data structures describing dynamic, non-contiguous arrays
+   (Note: interfaces with compiler)
+
+   The compiler generates a descriptor for each such array, places the
+   descriptor on stack, and passes the address of the descriptor to the libgomp
+   runtime as a normal map argument. The runtime then processes the array
+   data structure setup, and replaces the argument with the new actual
+   array address for the child function.
+
+   Care must be taken such that the struct field and layout assumptions
+   of struct gomp_array_dim, gomp_array_descr_type inside the compiler
+   be consistant with the below declarations.  */
+
+struct gomp_array_dim {
+  size_t base;
+  size_t length;
+  size_t elem_size;
+  size_t is_array;
+};
+
+struct gomp_array_descr_type {
+  void *ptr;
+  size_t ndims;
+  struct gomp_array_dim dims[];
+};
+
+/* Internal dynamic array info struct, used only here inside the runtime. */
+
+struct da_info
+{
+  struct gomp_array_descr_type *descr;
+  size_t map_index;
+  size_t ptrblock_size;
+  size_t data_row_num;
+  size_t data_row_size;
+};
+
+static size_t
+gomp_dynamic_array_count_rows (struct gomp_array_descr_type *descr)
+{
+  size_t nrows = 1;
+  for (size_t d = 0; d < descr->ndims - 1; d++)
+    nrows *= descr->dims[d].length / sizeof (void *);
+  return nrows;
+}
+
+static void
+gomp_dynamic_array_compute_info (struct da_info *da)
+{
+  size_t d, n = 1;
+  struct gomp_array_descr_type *descr = da->descr;
+
+  da->ptrblock_size = 0;
+  for (d = 0; d < descr->ndims - 1; d++)
+    {
+      size_t dim_count = descr->dims[d].length / descr->dims[d].elem_size;
+      size_t dim_ptrblock_size = (descr->dims[d + 1].is_array
+				  ? 0 : descr->dims[d].length * n);
+      da->ptrblock_size += dim_ptrblock_size;
+      n *= dim_count;
+    }
+  da->data_row_num = n;
+  da->data_row_size = descr->dims[d].length;
+}
+
+static void
+gomp_dynamic_array_fill_rows_1 (struct gomp_array_descr_type *descr, void *da,
+				size_t d, void ***row_ptr, size_t *count)
+{
+  if (d < descr->ndims - 1)
+    {
+      size_t elsize = descr->dims[d].elem_size;
+      size_t n = descr->dims[d].length / elsize;
+      void *p = da + descr->dims[d].base;
+      for (size_t i = 0; i < n; i++)
+	{
+	  void *ptr = p + i * elsize;
+	  /* Deref if next dimension is not array.  */
+	  if (!descr->dims[d + 1].is_array)
+	    ptr = *((void **) ptr);
+	  gomp_dynamic_array_fill_rows_1 (descr, ptr, d + 1, row_ptr, count);
+	}
+    }
+  else
+    {
+      **row_ptr = da + descr->dims[d].base;
+      *row_ptr += 1;
+      *count += 1;
+    }
+}
+
+static size_t
+gomp_dynamic_array_fill_rows (struct gomp_array_descr_type *descr, void *rows[])
+{
+  size_t count = 0;
+  void **p = rows;
+  gomp_dynamic_array_fill_rows_1 (descr, descr->ptr, 0, &p, &count);
+  return count;
+}
+
+static void *
+gomp_dynamic_array_create_ptrblock (struct da_info *da,
+				    void *tgt_addr, void *tgt_data_rows[])
+{
+  struct gomp_array_descr_type *descr = da->descr;
+  void *ptrblock = gomp_malloc (da->ptrblock_size);
+  void **curr_dim_ptrblock = (void **) ptrblock;
+  size_t n = 1;
+
+  for (size_t d = 0; d < descr->ndims - 1; d++)
+    {
+      int curr_dim_len = descr->dims[d].length;
+      int next_dim_len = descr->dims[d + 1].length;
+      int curr_dim_num = curr_dim_len / sizeof (void *);
+
+      void *next_dim_ptrblock
+	= (void *)(curr_dim_ptrblock + n * curr_dim_num);
+
+      for (int b = 0; b < n; b++)
+        for (int i = 0; i < curr_dim_num; i++)
+	  {
+	    if (d < descr->ndims - 2)
+	      {
+		void *ptr = (next_dim_ptrblock
+			     + b * curr_dim_num * next_dim_len
+			     + i * next_dim_len);
+		void *tgt_ptr = tgt_addr + (ptr - ptrblock);
+		curr_dim_ptrblock[b * curr_dim_num + i] = tgt_ptr;
+	      }
+	    else
+	      {
+		curr_dim_ptrblock[b * curr_dim_num + i]
+		  = tgt_data_rows[b * curr_dim_num + i];
+	      }
+	    void *addr = &curr_dim_ptrblock[b * curr_dim_num + i];
+	    assert (ptrblock <= addr && addr < ptrblock + da->ptrblock_size);
+	  }
+
+      n *= curr_dim_num;
+      curr_dim_ptrblock = next_dim_ptrblock;
+    }
+  assert (n == da->data_row_num);
+  return ptrblock;
+}
+
+static inline __attribute__((always_inline)) struct target_mem_desc *
+gomp_map_vars_internal (struct gomp_device_descr *devicep,
+			struct goacc_asyncqueue *aq, size_t mapnum,
+			void **hostaddrs, void **devaddrs, size_t *sizes,
+			void *kinds, bool short_mapkind,
+			enum gomp_map_vars_kind pragma_kind)
 {
   size_t i, tgt_align, tgt_size, not_found_cnt = 0;
   bool has_firstprivate = false;
@@ -494,10 +1009,39 @@
   const int typemask = short_mapkind ? 0xff : 0x7;
   struct splay_tree_s *mem_map = &devicep->mem_map;
   struct splay_tree_key_s cur_node;
-  struct target_mem_desc *tgt
-    = gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
-  tgt->list_count = mapnum;
-  tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
+  struct target_mem_desc *tgt;
+
+  bool process_dynarrays = false;
+  size_t da_data_row_num = 0, row_start = 0;
+  size_t da_info_num = 0, da_index;
+  struct da_info *da_info = NULL;
+  struct target_var_desc *row_desc;
+  uintptr_t target_row_addr;
+  void **host_data_rows = NULL, **target_data_rows = NULL;
+  void *row;
+
+  if (mapnum > 0)
+    {
+      int kind = get_kind (short_mapkind, kinds, 0);
+      process_dynarrays = GOMP_MAP_DYNAMIC_ARRAY_P (kind & typemask);
+    }
+
+  if (process_dynarrays)
+    for (i = 0; i < mapnum; i++)
+      {
+	int kind = get_kind (short_mapkind, kinds, i);
+	if (GOMP_MAP_DYNAMIC_ARRAY_P (kind & typemask))
+	  {
+	    da_data_row_num += gomp_dynamic_array_count_rows (hostaddrs[i]);
+	    da_info_num += 1;
+	  }
+      }
+
+  tgt = gomp_malloc (sizeof (*tgt)
+		     + sizeof (tgt->list[0]) * (mapnum + da_data_row_num));
+  tgt->list_count = mapnum + da_data_row_num;
+  tgt->refcount = (pragma_kind == GOMP_MAP_VARS_ENTER_DATA
+		   || pragma_kind == GOMP_MAP_VARS_OPENACC_ENTER_DATA) ? 0 : 1;
   tgt->device_descr = devicep;
   struct gomp_coalesce_buf cbuf, *cbufp = NULL;
 
@@ -508,6 +1052,14 @@
       return tgt;
     }
 
+  if (da_info_num)
+    da_info = gomp_alloca (sizeof (struct da_info) * da_info_num);
+  if (da_data_row_num)
+    {
+      host_data_rows = gomp_malloc (sizeof (void *) * da_data_row_num);
+      target_data_rows = gomp_malloc (sizeof (void *) * da_data_row_num);
+    }
+
   tgt_align = sizeof (void *);
   tgt_size = 0;
   cbuf.chunks = NULL;
@@ -539,14 +1091,14 @@
       return NULL;
     }
 
-  for (i = 0; i < mapnum; i++)
+  for (i = 0, da_index = 0; i < mapnum; i++)
     {
       int kind = get_kind (short_mapkind, kinds, i);
       if (hostaddrs[i] == NULL
 	  || (kind & typemask) == GOMP_MAP_FIRSTPRIVATE_INT)
 	{
 	  tgt->list[i].key = NULL;
-	  tgt->list[i].offset = ~(uintptr_t) 0;
+	  tgt->list[i].offset = OFFSET_INLINED;
 	  continue;
 	}
       else if ((kind & typemask) == GOMP_MAP_USE_DEVICE_PTR)
@@ -556,6 +1108,13 @@
 	  splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
 	  if (n == NULL)
 	    {
+              if (pragma_kind == GOMP_MAP_VARS_OPENACC_IF_PRESENT)
+		{
+		  /* No error, continue using the host address.  */
+		  tgt->list[i].key = NULL;
+		  tgt->list[i].offset = OFFSET_INLINED;
+		  continue;
+		}
 	      gomp_mutex_unlock (&devicep->lock);
 	      gomp_fatal ("use_device_ptr pointer wasn't mapped");
 	    }
@@ -564,7 +1123,7 @@
 	    = (void *) (n->tgt->tgt_start + n->tgt_offset
 			+ cur_node.host_start);
 	  tgt->list[i].key = NULL;
-	  tgt->list[i].offset = ~(uintptr_t) 0;
+	  tgt->list[i].offset = OFFSET_INLINED;
 	  continue;
 	}
       else if ((kind & typemask) == GOMP_MAP_STRUCT)
@@ -575,7 +1134,7 @@
 	  cur_node.host_end = (uintptr_t) hostaddrs[last]
 			      + sizes[last];
 	  tgt->list[i].key = NULL;
-	  tgt->list[i].offset = ~(uintptr_t) 2;
+	  tgt->list[i].offset = OFFSET_STRUCT;
 	  splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
 	  if (n == NULL)
 	    {
@@ -589,8 +1148,9 @@
 	      for (i = first; i <= last; i++)
 		{
 		  tgt->list[i].key = NULL;
-		  if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
-					     & typemask))
+		  if (!aq
+		      && gomp_to_device_kind_p (get_kind (short_mapkind, kinds,
+							  i) & typemask))
 		    gomp_coalesce_buf_add (&cbuf,
 					   tgt_size - cur_node.host_end
 					   + (uintptr_t) hostaddrs[i],
@@ -600,7 +1160,7 @@
 	      continue;
 	    }
 	  for (i = first; i <= last; i++)
-	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+	    gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
 				      sizes, kinds, NULL);
 	  i--;
 	  continue;
@@ -608,12 +1168,38 @@
       else if ((kind & typemask) == GOMP_MAP_ALWAYS_POINTER)
 	{
 	  tgt->list[i].key = NULL;
-	  tgt->list[i].offset = ~(uintptr_t) 1;
+	  tgt->list[i].offset = OFFSET_POINTER;
 	  has_firstprivate = true;
 	  continue;
 	}
+      else if (GOMP_MAP_DYNAMIC_ARRAY_P (kind & typemask))
+	{
+	  /* Ignore dynamic arrays for now, we process them together
+	     later.  */
+	  tgt->list[i].key = NULL;
+	  tgt->list[i].offset = 0;
+	  not_found_cnt++;
+
+	  struct da_info *da = &da_info[da_index++];
+	  da->descr = (struct gomp_array_descr_type *) hostaddrs[i];
+	  da->map_index = i;
+	  continue;
+	}
+      else if ((kind & typemask) == GOMP_MAP_ATTACH)
+	{
+	  tgt->list[i].key = NULL;
+	  has_firstprivate = true;
+	  continue;
+	}
+      else if ((kind & typemask) == GOMP_MAP_NO_ALLOC)
+        {
+	  tgt->list[i].key = NULL;
+	  tgt->list[i].offset = 0;
+	  continue;
+	}
       cur_node.host_start = (uintptr_t) hostaddrs[i];
-      if (!GOMP_MAP_POINTER_P (kind & typemask))
+      if (!GOMP_MAP_POINTER_P (kind & typemask)
+          && (kind & typemask) != GOMP_MAP_ATTACH)
 	cur_node.host_end = cur_node.host_start + sizes[i];
       else
 	cur_node.host_end = cur_node.host_start + sizeof (void *);
@@ -625,8 +1211,9 @@
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	  gomp_coalesce_buf_add (&cbuf, tgt_size,
-				 cur_node.host_end - cur_node.host_start);
+	  if (!aq)
+	    gomp_coalesce_buf_add (&cbuf, tgt_size,
+				   cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  has_firstprivate = true;
 	  continue;
@@ -638,14 +1225,14 @@
 	  if (!n)
 	    {
 	      tgt->list[i].key = NULL;
-	      tgt->list[i].offset = ~(uintptr_t) 1;
+	      tgt->list[i].offset = OFFSET_POINTER;
 	      continue;
 	    }
 	}
       else
 	n = splay_tree_lookup (mem_map, &cur_node);
       if (n && n->refcount != REFCOUNT_LINK)
-	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
+	gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i],
 				kind & typemask, NULL);
       else
 	{
@@ -656,7 +1243,7 @@
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	  if (gomp_to_device_kind_p (kind & typemask))
+	  if (!aq && gomp_to_device_kind_p (kind & typemask))
 	    gomp_coalesce_buf_add (&cbuf, tgt_size,
 				   cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
@@ -680,6 +1267,59 @@
 	}
     }
 
+  /* For dynamic arrays. Each data row is one target item, separated from
+     the normal map clause items, hence we order them after mapnum.  */
+  if (process_dynarrays)
+    {
+      for (i = 0, da_index = 0, row_start = 0; i < mapnum; i++)
+	{
+	  int kind = get_kind (short_mapkind, kinds, i);
+	  if (!GOMP_MAP_DYNAMIC_ARRAY_P (kind & typemask))
+	    continue;
+
+	  struct da_info *da = &da_info[da_index++];
+	  struct gomp_array_descr_type *descr = da->descr;
+	  size_t nr;
+
+	  gomp_dynamic_array_compute_info (da);
+
+	  /* We have allocated space in host/target_data_rows to place all the
+	     row data block pointers, now we can start filling them in.  */
+	  nr = gomp_dynamic_array_fill_rows (descr, &host_data_rows[row_start]);
+	  assert (nr == da->data_row_num);
+
+	  size_t align = (size_t) 1 << (kind >> rshift);
+	  if (tgt_align < align)
+	    tgt_align = align;
+	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  tgt_size += da->ptrblock_size;
+
+	  for (size_t j = 0; j < da->data_row_num; j++)
+	    {
+	      row = host_data_rows[row_start + j];
+	      row_desc = &tgt->list[mapnum + row_start + j];
+
+	      cur_node.host_start = (uintptr_t) row;
+	      cur_node.host_end = cur_node.host_start + da->data_row_size;
+	      splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+	      if (n)
+		{
+		  assert (n->refcount != REFCOUNT_LINK);
+		  gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
+					  kind & typemask,
+					  /* TODO: cbuf? */ NULL);
+		}
+	      else
+		{
+		  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		  tgt_size += da->data_row_size;
+		  not_found_cnt++;
+		}
+	    }
+	  row_start += da->data_row_num;
+	}
+    }
+
   if (devaddrs)
     {
       if (mapnum != 1)
@@ -756,9 +1396,9 @@
 		tgt_size = (tgt_size + align - 1) & ~(align - 1);
 		tgt->list[i].offset = tgt_size;
 		len = sizes[i];
-		gomp_copy_host2dev (devicep,
+		gomp_copy_host2dev (devicep, aq,
 				    (void *) (tgt->tgt_start + tgt_size),
-				    (void *) hostaddrs[i], len, cbufp);
+				    (void *) hostaddrs[i], len, false, cbufp);
 		tgt_size += len;
 		continue;
 	      case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -790,7 +1430,7 @@
 		    continue;
 		  }
 		for (i = first; i <= last; i++)
-		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+		  gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
 					    sizes, kinds, cbufp);
 		i--;
 		continue;
@@ -810,19 +1450,100 @@
 		  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
 		if (cur_node.tgt_offset)
 		  cur_node.tgt_offset -= sizes[i];
-		gomp_copy_host2dev (devicep,
-				    (void *) (n->tgt->tgt_start
-					      + n->tgt_offset
+		gomp_copy_host2dev (devicep, aq,
+				    (void *) (n->tgt->tgt_start + n->tgt_offset
 					      + cur_node.host_start
 					      - n->host_start),
 				    (void *) &cur_node.tgt_offset,
-				    sizeof (void *), cbufp);
+				    sizeof (void *), true, cbufp);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
+	      case GOMP_MAP_ATTACH:
+		{
+		  cur_node.host_start = (uintptr_t) hostaddrs[i];
+		  cur_node.host_end = cur_node.host_start + sizeof (void *);
+		  splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+		  if (n != NULL)
+		    {
+		      tgt->list[i].key = n;
+		      tgt->list[i].offset = cur_node.host_start - n->host_start;
+		      tgt->list[i].length = n->host_end - n->host_start;
+		      tgt->list[i].copy_from = false;
+		      tgt->list[i].always_copy_from = false;
+		      tgt->list[i].do_detach
+		        = (pragma_kind != GOMP_MAP_VARS_OPENACC_ENTER_DATA);
+		      n->refcount++;
+		    }
+		  else
+		    {
+		      gomp_mutex_unlock (&devicep->lock);
+		      gomp_fatal ("outer struct not mapped for attach");
+		    }
+		  gomp_attach_pointer (devicep, aq, mem_map, n,
+				       (uintptr_t) hostaddrs[i], sizes[i],
+				       cbufp);
+		  continue;
+		}
+	      case GOMP_MAP_NO_ALLOC:
+		{
+		  cur_node.host_start = (uintptr_t) hostaddrs[i];
+		  cur_node.host_end = cur_node.host_start + sizes[i];
+		  splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+		  if (n != NULL)
+		    {
+		      tgt->list[i].key = n;
+		      tgt->list[i].offset = cur_node.host_start - n->host_start;
+		      tgt->list[i].length = n->host_end - n->host_start;
+		      tgt->list[i].copy_from = false;
+		      tgt->list[i].always_copy_from = false;
+		      tgt->list[i].do_detach = false;
+		      n->refcount++;
+		    }
+		  else
+		    {
+		      tgt->list[i].key = NULL;
+		      tgt->list[i].offset = OFFSET_INLINED;
+		      tgt->list[i].length = sizes[i];
+		      tgt->list[i].copy_from = false;
+		      tgt->list[i].always_copy_from = false;
+		      tgt->list[i].do_detach = false;
+		      if (i + 1 < mapnum)
+			{
+			  int kind2 = get_kind (short_mapkind, kinds, i + 1);
+			  switch (kind2 & typemask)
+			    {
+			    case GOMP_MAP_ATTACH:
+			    case GOMP_MAP_POINTER:
+			      /* The data is not present but we have an attach
+				 or pointer clause next.  Skip over it.  */
+			      i++;
+			      tgt->list[i].key = NULL;
+			      tgt->list[i].offset = OFFSET_INLINED;
+			      tgt->list[i].length = sizes[i];
+			      tgt->list[i].copy_from = false;
+			      tgt->list[i].always_copy_from = false;
+			      tgt->list[i].do_detach = false;
+			      break;
+			    default:
+			      break;
+			    }
+			}
+		    }
+		  continue;
+		}
 	      default:
 		break;
 	      }
+
+	    if (GOMP_MAP_DYNAMIC_ARRAY_P (kind & typemask))
+	      {
+		tgt->list[i].key = &array->key;
+		tgt->list[i].key->tgt = tgt;
+		array++;
+		continue;
+	      }
+
 	    splay_tree_key k = &array->key;
 	    k->host_start = (uintptr_t) hostaddrs[i];
 	    if (!GOMP_MAP_POINTER_P (kind & typemask))
@@ -830,28 +1551,31 @@
 	    else
 	      k->host_end = k->host_start + sizeof (void *);
 	    splay_tree_key n = splay_tree_lookup (mem_map, k);
+	    /* Need to account for the case where a struct field hasn't been
+	       mapped onto the accelerator yet.  */
 	    if (n && n->refcount != REFCOUNT_LINK)
-	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
+	      gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i],
 				      kind & typemask, cbufp);
 	    else
 	      {
-		k->link_key = NULL;
+		k->u.link_key = NULL;
 		if (n && n->refcount == REFCOUNT_LINK)
 		  {
 		    /* Replace target address of the pointer with target address
 		       of mapped object in the splay tree.  */
 		    splay_tree_remove (mem_map, n);
-		    k->link_key = n;
+		    k->u.link_key = n;
+		    k->virtual_refcount = VREFCOUNT_LINK_KEY;
 		  }
 		size_t align = (size_t) 1 << (kind >> rshift);
 		tgt->list[i].key = k;
 		k->tgt = tgt;
-		if (field_tgt_clear != ~(size_t) 0)
+		if (field_tgt_clear != FIELD_TGT_EMPTY)
 		  {
 		    k->tgt_offset = k->host_start - field_tgt_base
 				    + field_tgt_offset;
 		    if (i == field_tgt_clear)
-		      field_tgt_clear = ~(size_t) 0;
+		      field_tgt_clear = FIELD_TGT_EMPTY;
 		  }
 		else
 		  {
@@ -862,10 +1586,12 @@
 		tgt->list[i].copy_from = GOMP_MAP_COPY_FROM_P (kind & typemask);
 		tgt->list[i].always_copy_from
 		  = GOMP_MAP_ALWAYS_FROM_P (kind & typemask);
+		tgt->list[i].do_detach = false;
 		tgt->list[i].offset = 0;
 		tgt->list[i].length = k->host_end - k->host_start;
 		k->refcount = 1;
-		k->dynamic_refcount = 0;
+		k->virtual_refcount = 0;
+		k->u.attach_count = NULL;
 		tgt->refcount++;
 		array->left = NULL;
 		array->right = NULL;
@@ -884,22 +1610,25 @@
 		  case GOMP_MAP_FORCE_TOFROM:
 		  case GOMP_MAP_ALWAYS_TO:
 		  case GOMP_MAP_ALWAYS_TOFROM:
-		    gomp_copy_host2dev (devicep,
+		    gomp_copy_host2dev (devicep, aq,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start, cbufp);
+					k->host_end - k->host_start, false,
+					cbufp);
 		    break;
 		  case GOMP_MAP_POINTER:
-		    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
+		    gomp_map_pointer (tgt, aq,
+				      (uintptr_t) *(void **) k->host_start,
 				      k->tgt_offset, sizes[i], cbufp);
 		    break;
 		  case GOMP_MAP_TO_PSET:
-		    gomp_copy_host2dev (devicep,
+		    gomp_copy_host2dev (devicep, aq,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start, cbufp);
+					k->host_end - k->host_start, false,
+					cbufp);
 
 		    for (j = i + 1; j < mapnum; j++)
 		      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -915,9 +1644,10 @@
 			  tgt->list[j].key = k;
 			  tgt->list[j].copy_from = false;
 			  tgt->list[j].always_copy_from = false;
+			  tgt->list[j].do_detach = false;
 			  if (k->refcount != REFCOUNT_INFINITY)
 			    k->refcount++;
-			  gomp_map_pointer (tgt,
+			  gomp_map_pointer (tgt, aq,
 					    (uintptr_t) *(void **) hostaddrs[j],
 					    k->tgt_offset
 					    + ((uintptr_t) hostaddrs[j]
@@ -946,11 +1676,11 @@
 		    break;
 		  case GOMP_MAP_FORCE_DEVICEPTR:
 		    assert (k->host_end - k->host_start == sizeof (void *));
-		    gomp_copy_host2dev (devicep,
+		    gomp_copy_host2dev (devicep, aq,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					sizeof (void *), cbufp);
+					sizeof (void *), false, cbufp);
 		    break;
 		  default:
 		    gomp_mutex_unlock (&devicep->lock);
@@ -958,19 +1688,129 @@
 				kind);
 		  }
 
-		if (k->link_key)
+		if (k->virtual_refcount == VREFCOUNT_LINK_KEY && k->u.link_key)
 		  {
 		    /* Set link pointer on target to the device address of the
 		       mapped object.  */
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
 		    /* We intentionally do not use coalescing here, as it's not
 		       data allocated by the current call to this function.  */
-		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
-					&tgt_addr, sizeof (void *), NULL);
+		    gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
+					&tgt_addr, sizeof (void *), true, NULL);
 		  }
 		array++;
 	      }
 	  }
+
+      /* Processing of dynamic array rows.  */
+      if (process_dynarrays)
+        {
+	  for (i = 0, da_index = 0, row_start = 0; i < mapnum; i++)
+	    {
+	      int kind = get_kind (short_mapkind, kinds, i);
+	      if (!GOMP_MAP_DYNAMIC_ARRAY_P (kind & typemask))
+		continue;
+
+	      struct da_info *da = &da_info[da_index++];
+	      assert (da->descr == hostaddrs[i]);
+
+	      /* The map for the dynamic array itself is never copied from
+		 during unmapping, its the data rows that count. Set copy from
+		 flags are set to false here.  */
+	      tgt->list[i].copy_from = false;
+	      tgt->list[i].always_copy_from = false;
+	      tgt->list[i].do_detach = false;
+
+	      size_t align = (size_t) 1 << (kind >> rshift);
+	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
+
+	      /* For the map of the dynamic array itself, adjust so that the
+		 passed device address points to the beginning of the
+		 ptrblock.  */
+	      tgt->list[i].key->tgt_offset = tgt_size;
+
+	      void *target_ptrblock = (void*) tgt->tgt_start + tgt_size;
+	      tgt_size += da->ptrblock_size;
+
+	      /* Add splay key for each data row in current DA.  */
+	      for (size_t j = 0; j < da->data_row_num; j++)
+		{
+		  row = host_data_rows[row_start + j];
+		  row_desc = &tgt->list[mapnum + row_start + j];
+
+		  cur_node.host_start = (uintptr_t) row;
+		  cur_node.host_end = cur_node.host_start + da->data_row_size;
+		  splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+		  if (n)
+		    {
+		      assert (n->refcount != REFCOUNT_LINK);
+		      gomp_map_vars_existing (devicep, aq, n, &cur_node,
+					      row_desc, kind & typemask, cbufp);
+		      target_row_addr = n->tgt->tgt_start + n->tgt_offset;
+		    }
+		  else
+		    {
+		      tgt->refcount++;
+
+		      splay_tree_key k = &array->key;
+		      k->host_start = (uintptr_t) row;
+		      k->host_end = k->host_start + da->data_row_size;
+
+		      k->tgt = tgt;
+		      k->refcount = 1;
+		      k->virtual_refcount = 0;
+		      k->u.attach_count = NULL;
+		      tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		      target_row_addr = tgt->tgt_start + tgt_size;
+		      k->tgt_offset = tgt_size;
+		      tgt_size += da->data_row_size;
+
+		      row_desc->key = k;
+		      row_desc->copy_from
+			= GOMP_MAP_COPY_FROM_P (kind & typemask);
+		      row_desc->always_copy_from
+			= GOMP_MAP_COPY_FROM_P (kind & typemask);
+		      row_desc->do_detach = false;
+		      row_desc->offset = 0;
+		      row_desc->length = da->data_row_size;
+
+		      array->left = NULL;
+		      array->right = NULL;
+		      splay_tree_insert (mem_map, array);
+
+		      if (GOMP_MAP_COPY_TO_P (kind & typemask))
+			gomp_copy_host2dev (devicep, aq,
+					    (void *) tgt->tgt_start
+						     + k->tgt_offset,
+					    (void *) k->host_start,
+					    da->data_row_size, false, cbufp);
+		      array++;
+		    }
+		  target_data_rows[row_start + j] = (void *) target_row_addr;
+		}
+
+	      /* Now we have the target memory allocated, and target offsets of
+		 all row blocks assigned and calculated, we can construct the
+		 accelerator side ptrblock and copy it in.  */
+	      if (da->ptrblock_size)
+		{
+		  void *ptrblock = gomp_dynamic_array_create_ptrblock
+		    (da, target_ptrblock, target_data_rows + row_start);
+		  gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
+				      da->ptrblock_size, true, cbufp);
+		  free (ptrblock);
+		}
+
+	      row_start += da->data_row_num;
+	    }
+	  assert (row_start == da_data_row_num && da_index == da_info_num);
+        }
+    }
+
+  if (da_data_row_num)
+    {
+      free (host_data_rows);
+      free (target_data_rows);
     }
 
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
@@ -978,10 +1818,10 @@
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  gomp_copy_host2dev (devicep,
+	  gomp_copy_host2dev (devicep, aq,
 			      (void *) (tgt->tgt_start + i * sizeof (void *)),
 			      (void *) &cur_node.tgt_offset, sizeof (void *),
-			      cbufp);
+			      true, cbufp);
 	}
     }
 
@@ -989,11 +1829,12 @@
     {
       long c = 0;
       for (c = 0; c < cbuf.chunk_cnt; ++c)
-	gomp_copy_host2dev (devicep,
+	gomp_copy_host2dev (devicep, aq,
 			    (void *) (tgt->tgt_start + cbuf.chunks[c].start),
 			    (char *) cbuf.buf + (cbuf.chunks[c].start
 						 - cbuf.chunks[0].start),
-			    cbuf.chunks[c].end - cbuf.chunks[c].start, NULL);
+			    cbuf.chunks[c].end - cbuf.chunks[c].start, true,
+			    NULL);
       free (cbuf.buf);
       cbuf.buf = NULL;
       cbufp = NULL;
@@ -1002,8 +1843,20 @@
   /* If the variable from "omp target enter data" map-list was already mapped,
      tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
      gomp_exit_data.  */
-  if (pragma_kind == GOMP_MAP_VARS_ENTER_DATA && tgt->refcount == 0)
+  if ((pragma_kind == GOMP_MAP_VARS_ENTER_DATA
+       || pragma_kind == GOMP_MAP_VARS_OPENACC_ENTER_DATA)
+      && tgt->refcount == 0)
     {
+      /* If we're about to discard a target_mem_desc with no "structural"
+	 references (tgt->refcount == 0), any splay keys linked in the tgt's
+	 list must have their virtual refcount incremented to represent that
+	 "lost" reference in order to implement the semantics of the OpenACC
+	 "present increment" operation properly.  */
+      if (pragma_kind == GOMP_MAP_VARS_OPENACC_ENTER_DATA)
+	for (i = 0; i < tgt->list_count; i++)
+	  if (tgt->list[i].key)
+	    tgt->list[i].key->virtual_refcount++;
+
       free (tgt);
       tgt = NULL;
     }
@@ -1012,7 +1865,27 @@
   return tgt;
 }
 
-static void
+attribute_hidden struct target_mem_desc *
+gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
+	       void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
+	       bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+{
+  return gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs,
+				 sizes, kinds, short_mapkind, pragma_kind);
+}
+
+attribute_hidden struct target_mem_desc *
+gomp_map_vars_async (struct gomp_device_descr *devicep,
+		     struct goacc_asyncqueue *aq, size_t mapnum,
+		     void **hostaddrs, void **devaddrs, size_t *sizes,
+		     void *kinds, bool short_mapkind,
+		     enum gomp_map_vars_kind pragma_kind)
+{
+  return gomp_map_vars_internal (devicep, aq, mapnum, hostaddrs, devaddrs,
+				 sizes, kinds, short_mapkind, pragma_kind);
+}
+
+attribute_hidden void
 gomp_unmap_tgt (struct target_mem_desc *tgt)
 {
   /* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region.  */
@@ -1023,29 +1896,77 @@
   free (tgt);
 }
 
-attribute_hidden bool
-gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
+static bool
+gomp_unref_tgt (void *ptr)
+{
+  bool is_tgt_unmapped = false;
+
+  struct target_mem_desc *tgt = (struct target_mem_desc *) ptr;
+
+  if (tgt->refcount > 1)
+    tgt->refcount--;
+  else
+    {
+      gomp_unmap_tgt (tgt);
+      is_tgt_unmapped = true;
+    }
+
+  return is_tgt_unmapped;
+}
+
+static void
+gomp_unref_tgt_void (void *ptr)
+{
+  (void) gomp_unref_tgt (ptr);
+}
+
+static inline __attribute__((always_inline)) bool
+gomp_remove_var_internal (struct gomp_device_descr *devicep, splay_tree_key k,
+			  struct goacc_asyncqueue *aq)
 {
   bool is_tgt_unmapped = false;
   splay_tree_remove (&devicep->mem_map, k);
-  if (k->link_key)
-    splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key);
-  if (k->tgt->refcount > 1)
-    k->tgt->refcount--;
-  else
+  if (k->virtual_refcount == VREFCOUNT_LINK_KEY)
     {
-      is_tgt_unmapped = true;
-      gomp_unmap_tgt (k->tgt);
+      if (k->u.link_key)
+	splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->u.link_key);
     }
+  else if (k->u.attach_count)
+    free (k->u.attach_count);
+  if (aq)
+    devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt_void,
+						(void *) k->tgt);
+  else
+    is_tgt_unmapped = gomp_unref_tgt ((void *) k->tgt);
   return is_tgt_unmapped;
 }
 
+attribute_hidden bool
+gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
+{
+  return gomp_remove_var_internal (devicep, k, NULL);
+}
+
+/* Remove a variable asynchronously.  This actually removes the variable
+   mapping immediately, but retains the linked target_mem_desc until the
+   asynchronous operation has completed (as it may still refer to target
+   memory).  The device lock must be held before entry, and remains locked on
+   exit.  */
+
+attribute_hidden void
+gomp_remove_var_async (struct gomp_device_descr *devicep, splay_tree_key k,
+		       struct goacc_asyncqueue *aq)
+{
+  (void) gomp_remove_var_internal (devicep, k, aq);
+}
+
 /* Unmap variables described by TGT.  If DO_COPYFROM is true, copy relevant
    variables back from device to host: if it is false, it is assumed that this
    has been done already.  */
 
-attribute_hidden void
-gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
+static inline __attribute__((always_inline)) void
+gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
+			  struct goacc_asyncqueue *aq)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
 
@@ -1065,6 +1986,18 @@
     }
 
   size_t i;
+
+  /* We must perform detachments before any copies back to the host.  */
+  for (i = 0; i < tgt->list_count; i++)
+    {
+      splay_tree_key k = tgt->list[i].key;
+
+      if (k != NULL && tgt->list[i].do_detach)
+	gomp_detach_pointer (devicep, aq, k, tgt->list[i].key->host_start
+					     + tgt->list[i].offset,
+			     k->refcount == 1, NULL);
+    }
+
   for (i = 0; i < tgt->list_count; i++)
     {
       splay_tree_key k = tgt->list[i].key;
@@ -1072,7 +2005,15 @@
 	continue;
 
       bool do_unmap = false;
-      if (k->refcount > 1 && k->refcount != REFCOUNT_INFINITY)
+      if (k->tgt == tgt
+	  && k->virtual_refcount > 0
+	  && k->virtual_refcount != VREFCOUNT_LINK_KEY
+	  && k->refcount != REFCOUNT_INFINITY)
+	{
+	  k->virtual_refcount--;
+	  k->refcount--;
+	}
+      else if (k->refcount > 1 && k->refcount != REFCOUNT_INFINITY)
 	k->refcount--;
       else if (k->refcount == 1)
 	{
@@ -1082,7 +2023,7 @@
 
       if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
 	  || tgt->list[i].always_copy_from)
-	gomp_copy_dev2host (devicep,
+	gomp_copy_dev2host (devicep, aq,
 			    (void *) (k->host_start + tgt->list[i].offset),
 			    (void *) (k->tgt->tgt_start + k->tgt_offset
 				      + tgt->list[i].offset),
@@ -1091,14 +2032,28 @@
 	gomp_remove_var (devicep, k);
     }
 
-  if (tgt->refcount > 1)
-    tgt->refcount--;
+  if (aq)
+    devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt_void,
+						(void *) tgt);
   else
-    gomp_unmap_tgt (tgt);
+    gomp_unref_tgt ((void *) tgt);
 
   gomp_mutex_unlock (&devicep->lock);
 }
 
+attribute_hidden void
+gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
+{
+  gomp_unmap_vars_internal (tgt, do_copyfrom, NULL);
+}
+
+attribute_hidden void
+gomp_unmap_vars_async (struct target_mem_desc *tgt, bool do_copyfrom,
+		       struct goacc_asyncqueue *aq)
+{
+  gomp_unmap_vars_internal (tgt, do_copyfrom, aq);
+}
+
 static void
 gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
 	     size_t *sizes, void *kinds, bool short_mapkind)
@@ -1148,9 +2103,10 @@
 	    size_t size = cur_node.host_end - cur_node.host_start;
 
 	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
-	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
+	      gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
+				  false, NULL);
 	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
-	      gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
+	      gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
 	  }
       }
   gomp_mutex_unlock (&devicep->lock);
@@ -1184,7 +2140,7 @@
     = devicep->load_image_func (devicep->target_id, version,
 				target_data, &target_table);
 
-  if (num_target_entries != num_funcs + num_vars)
+  if (num_target_entries < num_funcs + num_vars)
     {
       gomp_mutex_unlock (&devicep->lock);
       if (is_register_lock)
@@ -1214,8 +2170,9 @@
       k->tgt = tgt;
       k->tgt_offset = target_table[i].start;
       k->refcount = REFCOUNT_INFINITY;
-      k->dynamic_refcount = 0;
-      k->link_key = NULL;
+      k->virtual_refcount = 0;
+      k->u.attach_count = NULL;
+      k->u.link_key = NULL;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -1247,8 +2204,9 @@
       k->tgt = tgt;
       k->tgt_offset = target_var->start;
       k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY;
-      k->dynamic_refcount = 0;
-      k->link_key = NULL;
+      k->virtual_refcount = 0;
+      k->u.attach_count = NULL;
+      k->u.link_key = NULL;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -1428,6 +2386,9 @@
 attribute_hidden void
 gomp_init_device (struct gomp_device_descr *devicep)
 {
+  gomp_debug (0, "%s (%s; %d; %d)\n", __FUNCTION__,
+	      devicep->name, (int) devicep->type, devicep->target_id);
+
   int i;
   if (!devicep->init_device_func (devicep->target_id))
     {
@@ -1445,9 +2406,24 @@
 				   false);
     }
 
+  /* Initialize OpenACC asynchronous queues.  */
+  goacc_init_asyncqueues (devicep);
+
   devicep->state = GOMP_DEVICE_INITIALIZED;
 }
 
+/* This function finalizes the target device, specified by DEVICEP.  DEVICEP
+   must be locked on entry, and remains locked on return.  */
+
+attribute_hidden bool
+gomp_fini_device (struct gomp_device_descr *devicep)
+{
+  bool ret = goacc_fini_asyncqueues (devicep);
+  ret &= devicep->fini_device_func (devicep->target_id);
+  devicep->state = GOMP_DEVICE_FINALIZED;
+  return ret;
+}
+
 attribute_hidden void
 gomp_unload_device (struct gomp_device_descr *devicep)
 {
@@ -1467,20 +2443,50 @@
     }
 }
 
-/* Free address mapping tables.  MM must be locked on entry, and remains locked
-   on return.  */
+/* Do we have offload data available for the given offload target type?
+   Instead of verifying that *all* offload data is available that could
+   possibly be required, we instead just look for *any*.  If we later find any
+   offload data missing, that's user error.  */
 
-attribute_hidden void
-gomp_free_memmap (struct splay_tree_s *mem_map)
+attribute_hidden bool
+gomp_offload_target_available_p (int type)
 {
-  while (mem_map->root)
-    {
-      struct target_mem_desc *tgt = mem_map->root->key.tgt;
+  gomp_debug (0, "%s (%d)\n", __FUNCTION__, type);
 
-      splay_tree_remove (mem_map, &mem_map->root->key);
-      free (tgt->array);
-      free (tgt);
+  bool available = false;
+
+  /* Has the offload target already been initialized?  */
+  for (int i = 0; !available && i < num_devices; i++)
+    {
+      struct gomp_device_descr *devicep = &devices[i];
+      gomp_mutex_lock (&devicep->lock);
+      if (devicep->type == type
+	  && devicep->state == GOMP_DEVICE_INITIALIZED)
+	available = true;
+      gomp_mutex_unlock (&devicep->lock);
     }
+
+  if (!available)
+    {
+      gomp_mutex_lock (&register_lock);
+
+      /* If there is no offload data available at all, we cannot later fail to
+	 find any of it for a specific offload target.  This is the case where
+	 there are no offloaded code regions in user code, but there can still
+	 be executable directives used, or runtime library calls made.  */
+      if (num_offload_images == 0)
+	available = true;
+
+      /* Can the offload target be initialized?  */
+      for (int i = 0; !available && i < num_offload_images; i++)
+	if (offload_images[i].type == type)
+	  available = true;
+
+      gomp_mutex_unlock (&register_lock);
+    }
+
+  gomp_debug (0, "  %s (%d): %d\n", __FUNCTION__, type, (int) available);
+  return available;
 }
 
 /* Host fallback for GOMP_target{,_ext} routines.  */
@@ -1956,7 +2962,7 @@
 
 	  if ((kind == GOMP_MAP_FROM && k->refcount == 0)
 	      || kind == GOMP_MAP_ALWAYS_FROM)
-	    gomp_copy_dev2host (devicep, (void *) cur_node.host_start,
+	    gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
 				(void *) (k->tgt->tgt_start + k->tgt_offset
 					  + cur_node.host_start
 					  - k->host_start),
@@ -1964,9 +2970,9 @@
 	  if (k->refcount == 0)
 	    {
 	      splay_tree_remove (&devicep->mem_map, k);
-	      if (k->link_key)
+	      if (k->virtual_refcount == VREFCOUNT_LINK_KEY && k->u.link_key)
 		splay_tree_insert (&devicep->mem_map,
-				   (splay_tree_node) k->link_key);
+				   (splay_tree_node) k->u.link_key);
 	      if (k->tgt->refcount > 1)
 		k->tgt->refcount--;
 	      else
@@ -2503,7 +3509,8 @@
       k->tgt = tgt;
       k->tgt_offset = (uintptr_t) device_ptr + device_offset;
       k->refcount = REFCOUNT_INFINITY;
-      k->dynamic_refcount = 0;
+      k->virtual_refcount = 0;
+      k->u.link_key = NULL;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -2591,6 +3598,8 @@
 gomp_load_plugin_for_device (struct gomp_device_descr *device,
 			     const char *plugin_name)
 {
+  gomp_debug (0, "%s (\"%s\")\n", __FUNCTION__, plugin_name);
+
   const char *err = NULL, *last_missing = NULL;
 
   void *plugin_handle = dlopen (plugin_name, RTLD_LAZY);
@@ -2620,6 +3629,7 @@
   DLSYM (get_caps);
   DLSYM (get_type);
   DLSYM (get_num_devices);
+  DLSYM (get_property);
   DLSYM (init_device);
   DLSYM (fini_device);
   DLSYM (load_image);
@@ -2639,20 +3649,22 @@
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
     {
       if (!DLSYM_OPT (openacc.exec, openacc_exec)
-	  || !DLSYM_OPT (openacc.register_async_cleanup,
-			 openacc_register_async_cleanup)
-	  || !DLSYM_OPT (openacc.async_test, openacc_async_test)
-	  || !DLSYM_OPT (openacc.async_test_all, openacc_async_test_all)
-	  || !DLSYM_OPT (openacc.async_wait, openacc_async_wait)
-	  || !DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async)
-	  || !DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all)
-	  || !DLSYM_OPT (openacc.async_wait_all_async,
-			 openacc_async_wait_all_async)
-	  || !DLSYM_OPT (openacc.async_set_async, openacc_async_set_async)
+	  || !DLSYM_OPT (openacc.exec_params, openacc_exec_params)
 	  || !DLSYM_OPT (openacc.create_thread_data,
 			 openacc_create_thread_data)
 	  || !DLSYM_OPT (openacc.destroy_thread_data,
-			 openacc_destroy_thread_data))
+			 openacc_destroy_thread_data)
+	  || !DLSYM_OPT (openacc.async.construct, openacc_async_construct)
+	  || !DLSYM_OPT (openacc.async.destruct, openacc_async_destruct)
+	  || !DLSYM_OPT (openacc.async.test, openacc_async_test)
+	  || !DLSYM_OPT (openacc.async.synchronize, openacc_async_synchronize)
+	  || !DLSYM_OPT (openacc.async.serialize, openacc_async_serialize)
+	  || !DLSYM_OPT (openacc.async.queue_callback,
+			 openacc_async_queue_callback)
+	  || !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
+	  || !DLSYM_OPT (openacc.async.exec_params, openacc_async_exec_params)
+	  || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
+	  || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev))
 	{
 	  /* Require all the OpenACC handlers if we have
 	     GOMP_OFFLOAD_CAP_OPENACC_200.  */
@@ -2703,16 +3715,199 @@
       struct gomp_device_descr *devicep = &devices[i];
       gomp_mutex_lock (&devicep->lock);
       if (devicep->state == GOMP_DEVICE_INITIALIZED)
-	{
-	  ret = devicep->fini_device_func (devicep->target_id);
-	  devicep->state = GOMP_DEVICE_FINALIZED;
-	}
+	ret = gomp_fini_device (devicep);
       gomp_mutex_unlock (&devicep->lock);
       if (!ret)
 	gomp_fatal ("device finalization failed");
     }
 }
 
+/* Helper, to translate from an offload target to the corresponding plugin name.  */
+/* TODO: this duplicates the logic/information that we already have in
+   'offload_targets' vs. 'offload_plugins' variables,
+   'libgomp/plugin/configfrag.ac'.  */
+
+static const char *
+offload_target_to_plugin_name (const char *offload_target)
+{
+  if (strstr (offload_target, "-intelmic") != NULL)
+    return "intelmic";
+  else if (strncmp (offload_target, "nvptx", 5) == 0)
+    return "nvptx";
+  else if (strncmp (offload_target, "hsa", 3) == 0)
+    return "hsa";
+  else if (strstr (offload_target, "gcn") != NULL)
+    return "gcn";
+  else
+    gomp_fatal ("Unknown offload target: %s", offload_target);
+}
+
+/* List of requested offload targets, separated by colon.  Defaults to the list
+   determined when configuring libgomp.  */
+static const char *gomp_offload_targets = OFFLOAD_TARGETS;
+static bool gomp_offload_targets_set = false;
+static bool gomp_offload_targets_malloced = false;
+
+/* This function frees gomp_offload_targets.  */
+
+static void
+free_gomp_offload_targets (void)
+{
+  free ((char *) gomp_offload_targets);
+}
+
+/* Override the list of requested offload targets.  This must be called
+   early, before gomp_target_init.  */
+
+void
+GOMP_set_offload_targets (const char *offload_targets)
+{
+  gomp_debug (0, "%s (\"%s\"): %s\n", __FUNCTION__,
+	      offload_targets, gomp_offload_targets);
+
+  /* TODO: multithreading, locking.  */
+  /* TODO: this should not (sometimes) keep a copy of the offload_target
+     pointer, so that the caller knows what to expect.  */
+  /* TODO: What actually is supposed to happen if some parts of a program are
+     compiled with, for example, "-foffload=disable" (that is, when called with
+     the empty string for offload_targets), and others for other actual
+     (possibly different) offload targets?  */
+  if (gomp_is_initialized == PTHREAD_ONCE_INIT)
+    {
+      /* If we have not yet initialized, we capture all the offload targets
+	 requested.  We do not worry that the set of requested offload targets
+	 vs. the set of available offload data will eventually match; any such
+	 inconsistencies would be user error.  (See also
+	 gomp_offload_target_available_p.)  */
+      if (!gomp_offload_targets_set)
+	gomp_offload_targets = offload_targets;
+      else if (gomp_offload_targets == offload_targets
+	       || strcmp (gomp_offload_targets, offload_targets) == 0)
+	/* Nothing to do if the same.  */;
+      else
+	{
+	  /* Merge offload_targets into gomp_offload_targets.  */
+	  /* TODO: this could be simpler if we had the data available in a
+	     different form.  */
+	  size_t gomp_offload_targets_len = strlen (gomp_offload_targets);
+	  /* Maximum length.  */
+	  size_t len = (gomp_offload_targets_len + /* ":" */ 1
+			+ strlen (offload_targets) + /* '\0' */ 1);
+	  char *gomp_offload_targets_new = gomp_malloc (len);
+	  memcpy (gomp_offload_targets_new,
+		  gomp_offload_targets, gomp_offload_targets_len);
+	  char *gomp_offload_targets_new_next
+	    = gomp_offload_targets_new + gomp_offload_targets_len;
+	  *gomp_offload_targets_new_next = '\0';
+	  const char *cur = offload_targets;
+	  while (*cur)
+	    {
+	      const char *cur_end = strchr (cur, ':');
+	      /* If no other offload target following...  */
+	      if (cur_end == NULL)
+		/* ..., point to the terminating NUL character.  */
+		cur_end = cur + strlen (cur);
+	      size_t cur_len = cur_end - cur;
+
+	      /* Do we already have this one listed?  */
+	      const char *haystack = gomp_offload_targets_new;
+	      while (haystack != NULL)
+		{
+		  if (strncmp (haystack, cur, cur_len) == 0)
+		    break;
+		  else
+		    {
+		      haystack = strchr (haystack, ':');
+		      if (haystack != NULL)
+			haystack += /* ':' */ 1;
+		    }
+		}
+	      if (haystack == NULL)
+		{
+		  /* Not yet listed; add it.  */
+		  if (gomp_offload_targets_new_next != gomp_offload_targets_new)
+		    *gomp_offload_targets_new_next++ = ':';
+		  assert (gomp_offload_targets_new_next + cur_len + /* '\0' */ 1
+			  <= gomp_offload_targets_new + len);
+		  memcpy (gomp_offload_targets_new_next, cur, cur_len);
+		  gomp_offload_targets_new_next += cur_len;
+		  *gomp_offload_targets_new_next = '\0';
+		}
+
+	      if (*cur_end == '\0')
+		break;
+	      cur = cur_end + /* : */ 1;
+	    }
+
+	  if (gomp_offload_targets_malloced)
+	    free ((char *) gomp_offload_targets);
+	  else
+	    {
+	      if (atexit (free_gomp_offload_targets) != 0)
+		gomp_fatal ("atexit failed");
+	    }
+
+	  gomp_offload_targets = gomp_offload_targets_new;
+	  gomp_offload_targets_malloced = true;
+	}
+    }
+  else
+    {
+      /* If we have already initialized (which can happen only if a shared
+	 library with another GOMP_set_offload_targets constructor call gets
+	 loaded dynamically), and the user is now requesting offload targets
+	 that were not requested previously, then we're out of luck: we can't
+	 load new plugins now.  Otherwise, we're all set.  */
+      if (gomp_offload_targets == offload_targets
+	  || strcmp (gomp_offload_targets, offload_targets) == 0)
+	/* All fine if the same.  */;
+      else
+	{
+	  /* Check offload_targets against gomp_offload_targets.  */
+	  /* TODO: this could be simpler if we had the data available in a
+	     different form.  */
+	  const char *cur = offload_targets;
+	  while (*cur)
+	    {
+	      const char *cur_end = strchr (cur, ':');
+	      /* If no other offload target following...  */
+	      if (cur_end == NULL)
+		/* ..., point to the terminating NUL character.  */
+		cur_end = cur + strlen (cur);
+	      size_t cur_len = cur_end - cur;
+
+	      /* Do we have this one listed?  */
+	      const char *haystack = gomp_offload_targets;
+	      while (haystack != NULL)
+		{
+		  if (strncmp (haystack, cur, cur_len) == 0)
+		    break;
+		  else
+		    {
+		      haystack = strchr (haystack, ':');
+		      if (haystack != NULL)
+			haystack += /* ':' */ 1;
+		    }
+		}
+	      if (haystack == NULL)
+		{
+		  /* Not listed.  */
+		  gomp_fatal ("Can't satisfy request for offload targets: %s; have loaded: %s",
+			      offload_targets, gomp_offload_targets);
+		}
+
+	      if (*cur_end == '\0')
+		break;
+	      cur = cur_end + /* : */ 1;
+	    }
+	}
+    }
+  gomp_offload_targets_set = true;
+
+  gomp_debug (0, "  %s (\"%s\"): %s\n", __FUNCTION__,
+	      offload_targets, gomp_offload_targets);
+}
+
 /* This function initializes the runtime for offloading.
    It parses the list of offload plugins, and tries to load these.
    On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP
@@ -2720,11 +3915,12 @@
    corresponding devices, first the GOMP_OFFLOAD_CAP_OPENMP_400 ones, follows
    by the others.  */
 
+static const char *gomp_plugin_prefix ="libgomp-plugin-";
+static const char *gomp_plugin_suffix = SONAME_SUFFIX (1);
+
 static void
 gomp_target_init (void)
 {
-  const char *prefix ="libgomp-plugin-";
-  const char *suffix = SONAME_SUFFIX (1);
   const char *cur, *next;
   char *plugin_name;
   int i, new_num_devices;
@@ -2732,52 +3928,58 @@
   num_devices = 0;
   devices = NULL;
 
-  cur = OFFLOAD_PLUGINS;
+  cur = gomp_offload_targets;
   if (*cur)
     do
       {
+	next = strchr (cur, ':');
+	/* If no other offload target following...  */
+	if (next == NULL)
+	  /* ..., point to the terminating NUL character.  */
+	  next = cur + strlen (cur);
+
+	size_t gomp_plugin_prefix_len = strlen (gomp_plugin_prefix);
+	size_t cur_len = next - cur;
+	size_t gomp_plugin_suffix_len = strlen (gomp_plugin_suffix);
+	plugin_name = gomp_malloc (gomp_plugin_prefix_len
+				   + cur_len
+				   + gomp_plugin_suffix_len
+				   + 1);
+	memcpy (plugin_name, gomp_plugin_prefix, gomp_plugin_prefix_len);
+	memcpy (plugin_name + gomp_plugin_prefix_len, cur, cur_len);
+	/* NUL-terminate the string here...  */
+	plugin_name[gomp_plugin_prefix_len + cur_len] = '\0';
+	/* ..., so that we can then use it to translate the offload target to
+	   the plugin name...  */
+	const char *cur_plugin_name
+	  = offload_target_to_plugin_name (plugin_name
+					   + gomp_plugin_prefix_len);
+	size_t cur_plugin_name_len = strlen (cur_plugin_name);
+	assert (cur_plugin_name_len <= cur_len);
+	/* ..., and then rewrite it.  */
+	memcpy (plugin_name + gomp_plugin_prefix_len,
+		cur_plugin_name, cur_plugin_name_len);
+	memcpy (plugin_name + gomp_plugin_prefix_len + cur_plugin_name_len,
+		gomp_plugin_suffix, gomp_plugin_suffix_len);
+	plugin_name[gomp_plugin_prefix_len
+		    + cur_plugin_name_len
+		    + gomp_plugin_suffix_len] = '\0';
+
 	struct gomp_device_descr current_device;
-	size_t prefix_len, suffix_len, cur_len;
-
-	next = strchr (cur, ',');
-
-	prefix_len = strlen (prefix);
-	cur_len = next ? next - cur : strlen (cur);
-	suffix_len = strlen (suffix);
-
-	plugin_name = (char *) malloc (prefix_len + cur_len + suffix_len + 1);
-	if (!plugin_name)
-	  {
-	    num_devices = 0;
-	    break;
-	  }
-
-	memcpy (plugin_name, prefix, prefix_len);
-	memcpy (plugin_name + prefix_len, cur, cur_len);
-	memcpy (plugin_name + prefix_len + cur_len, suffix, suffix_len + 1);
-
 	if (gomp_load_plugin_for_device (&current_device, plugin_name))
 	  {
 	    new_num_devices = current_device.get_num_devices_func ();
 	    if (new_num_devices >= 1)
 	      {
-		/* Augment DEVICES and NUM_DEVICES.  */
-
-		devices = realloc (devices, (num_devices + new_num_devices)
-				   * sizeof (struct gomp_device_descr));
-		if (!devices)
-		  {
-		    num_devices = 0;
-		    free (plugin_name);
-		    break;
-		  }
-
 		current_device.name = current_device.get_name_func ();
 		/* current_device.capabilities has already been set.  */
 		current_device.type = current_device.get_type_func ();
 		current_device.mem_map.root = NULL;
 		current_device.state = GOMP_DEVICE_UNINITIALIZED;
-		current_device.openacc.data_environ = NULL;
+		/* Augment DEVICES and NUM_DEVICES.  */
+		devices = gomp_realloc (devices,
+					((num_devices + new_num_devices)
+					 * sizeof (struct gomp_device_descr)));
 		for (i = 0; i < new_num_devices; i++)
 		  {
 		    current_device.target_id = i;
@@ -2791,18 +3993,12 @@
 	free (plugin_name);
 	cur = next + 1;
       }
-    while (next);
+    while (*next);
 
   /* In DEVICES, sort the GOMP_OFFLOAD_CAP_OPENMP_400 ones first, and set
      NUM_DEVICES_OPENMP.  */
   struct gomp_device_descr *devices_s
-    = malloc (num_devices * sizeof (struct gomp_device_descr));
-  if (!devices_s)
-    {
-      num_devices = 0;
-      free (devices);
-      devices = NULL;
-    }
+    = gomp_malloc (num_devices * sizeof (struct gomp_device_descr));
   num_devices_openmp = 0;
   for (i = 0; i < num_devices; i++)
     if (devices[i].capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
diff --git a/libgomp/team.c b/libgomp/team.c
index c422da3..b26caaa 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -239,6 +239,9 @@
   pthread_exit (NULL);
 #elif defined(__nvptx__)
   asm ("exit;");
+#elif defined(__AMDGCN__)
+  asm ("s_dcache_wb\n\t"
+       "s_endpgm");
 #else
 #error gomp_free_pool_helper must terminate the thread
 #endif
diff --git a/libgomp/testsuite/Makefile.in b/libgomp/testsuite/Makefile.in
index 80315b1..9c4d7b3 100644
--- a/libgomp/testsuite/Makefile.in
+++ b/libgomp/testsuite/Makefile.in
@@ -168,6 +168,8 @@
 INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
 LD = @LD@
 LDFLAGS = @LDFLAGS@
+LIBFFI = @LIBFFI@
+LIBFFIINCS = @LIBFFIINCS@
 LIBOBJS = @LIBOBJS@
 LIBS = @LIBS@
 LIBTOOL = @LIBTOOL@
@@ -205,6 +207,10 @@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
 PERL = @PERL@
+PLUGIN_GCN = @PLUGIN_GCN@
+PLUGIN_GCN_CPPFLAGS = @PLUGIN_GCN_CPPFLAGS@
+PLUGIN_GCN_LDFLAGS = @PLUGIN_GCN_LDFLAGS@
+PLUGIN_GCN_LIBS = @PLUGIN_GCN_LIBS@
 PLUGIN_HSA = @PLUGIN_HSA@
 PLUGIN_HSA_CPPFLAGS = @PLUGIN_HSA_CPPFLAGS@
 PLUGIN_HSA_LDFLAGS = @PLUGIN_HSA_LDFLAGS@
@@ -278,6 +284,7 @@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index 14d9b5f..9644176 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -120,7 +120,7 @@
     # Add liboffloadmic build directory in LD_LIBRARY_PATH to support
     # Intel MIC offloading testing.
     global offload_plugins
-    if { [string match "*,intelmic,*" ",$offload_plugins,"] } {
+    if { [string match "*:intelmic:*" ":$offload_plugins:"] } {
 	append always_ld_library_path ":${blddir}/../liboffloadmic/.libs"
 	append always_ld_library_path ":${blddir}/../liboffloadmic/plugin/.libs"
 	# libstdc++ is required by liboffloadmic
@@ -316,6 +316,9 @@
 	nvptx* {
 	    return "nvidia"
 	}
+	amdgcn* {
+	    return "gcn"
+	}
 	default {
 	    error "Unknown offload target: $offload_target"
 	}
@@ -373,22 +376,23 @@
     } "" ]
 }
 
-# Return 1 if at least one Nvidia GPU is accessible, and the OpenACC 'nvidia'
-# device type is selected.
+# Return 1 if at least one Nvidia GPU is accessible, and 'nvptx' offloading is
+# selected by 'global offload_target'.
 
 proc check_effective_target_openacc_nvidia_accel_selected { } {
     if { ![check_effective_target_openacc_nvidia_accel_present] } {
 	return 0;
     }
-    global openacc_device_type
-    return [string match "nvidia" $openacc_device_type]
+    global offload_target
+    return [string match "nvptx*" $offload_target]
 }
 
-# Return 1 if the OpenACC 'host' device type is selected.
+# Return 1 if explicit host-fallback execution is selected by 'global
+# offload_target'.
 
 proc check_effective_target_openacc_host_selected { } {
-    global openacc_device_type
-    return [string match "host" $openacc_device_type]
+    global offload_target
+    return [string match "disable" $offload_target]
 }
 
 # Return 1 if the selected OMP device is actually a HSA device
@@ -444,3 +448,28 @@
 	check_effective_target_hsa_offloading_selected_nocache
     }]
 }
+# Return 1 if at least one AMD GCN board is present.
+
+proc check_effective_target_openacc_amdgcn_accel_present { } {
+    return [check_runtime openacc_amdgcn_accel_present {
+	#include <openacc.h>
+	int main () {
+	return !(acc_get_num_devices (acc_device_gcn) > 0);
+	}
+    } "" ]
+}
+
+# Return 1 if at least one AMD GCN board is present, and the AMD GCN device
+# type is selected by default.
+
+proc check_effective_target_openacc_amdgcn_accel_selected { } {
+    if { ![check_effective_target_openacc_amdgcn_accel_present] } {
+	return 0;
+    }
+    global offload_target
+    if { [string match "amdgcn*" $offload_target] } {
+        return 1;
+    }
+    return 0;
+}
+
diff --git a/libgomp/testsuite/libgomp.c/c.exp b/libgomp/testsuite/libgomp.c/c.exp
index 31bdd57..463d953 100644
--- a/libgomp/testsuite/libgomp.c/c.exp
+++ b/libgomp/testsuite/libgomp.c/c.exp
@@ -23,10 +23,81 @@
 # Turn on OpenMP.
 lappend ALWAYS_CFLAGS "additional_flags=-fopenmp"
 
+# Generate new tests for each DO_TEST entry in TEST_LIST.
+proc generate_tests { test_list } {
+    global srcdir
+    global subdir
+
+    # Get corresponding source file.
+    set base_file [regsub "\.list" $test_list ""]
+    set base_file [regsub "$srcdir/$subdir/" $base_file ""]
+    set c_file $base_file.c
+
+    # Get dg directives from c file.
+    set dg_directives ""
+    set fp [open "$srcdir/$subdir/$c_file" r]
+    while {[gets $fp line] >= 0} {
+	if {[regexp -line -- "^/\\* \{ dg-" $line]} {
+	    if { "$dg_directives" == "" } {
+		set sep ""
+	    } else {
+		set sep "\n"
+	    }
+	    set dg_directives "$dg_directives$sep$line"
+	}
+    }
+    close $fp
+
+    # Get list of tests.
+    set fp [open "$test_list" r]
+    set file_data [read $fp]
+    close $fp
+    set file_data [regsub -all "DO_TEST" $file_data ""]
+    set file_data [regsub -all "\\(" $file_data ""]
+    set file_data [regsub -all "\\)" $file_data ""]
+    set file_data [regsub -all \[\n\] $file_data ""]
+    set file_data [string trimleft $file_data " "]
+    set tests [split $file_data]
+
+    # Create directory to generate files.
+    set test_dir [pwd]
+    set generated_dir $test_dir/generated/libgomp.c
+    file mkdir $generated_dir
+
+    # Generate tests.
+    set new_files []
+    set i 1
+    foreach test $tests {
+	set new_file "$generated_dir/$base_file-$test.c"
+
+	set fp [open "$new_file" w]
+	puts $fp "$dg_directives"
+	puts $fp "#define ONE_TEST $test"
+	puts $fp "#define TEST_NR $i"
+	puts $fp "#include \"$srcdir/$subdir/$c_file\""
+	close $fp
+
+	set i [expr $i + 1]
+	lappend new_files $new_file
+    }
+
+    return $new_files
+}
+
+# Generate tests for each .list file
+set test_lists [find $srcdir/$subdir *.list]
+set generated_tests []
+foreach test_list $test_lists {
+    set generated_tests [concat \
+			     $generated_tests \
+			     [generate_tests $test_list]]
+}
+
 # Gather a list of all tests.
 set tests [lsort [concat \
 		      [find $srcdir/$subdir *.c] \
-		      [find $srcdir/$subdir/../libgomp.c-c++-common *.c]]]
+		      [find $srcdir/$subdir/../libgomp.c-c++-common *.c] \
+		      $generated_tests]]
 
 set ld_library_path $always_ld_library_path
 append ld_library_path [gcc-set-multilib-library-path $GCC_UNDER_TEST]
diff --git a/libgomp/testsuite/libgomp.c/for-1.h b/libgomp/testsuite/libgomp.c/for-1.h
new file mode 100644
index 0000000..fa82c5b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-1.h
@@ -0,0 +1,25 @@
+#define S
+#define N(x) M(x, G, static)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(static, 32)
+#define N(x) M(x, G, static32)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(auto)
+#define N(x) M(x, G, auto)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(guided, 32)
+#define N(x) M(x, G, guided32)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(runtime)
+#define N(x) M(x, G, runtime)
+#include "for-2.h"
+#undef S
+#undef N
diff --git a/libgomp/testsuite/libgomp.c/for-2.h b/libgomp/testsuite/libgomp.c/for-2.h
new file mode 100644
index 0000000..6d8e349
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-2.h
@@ -0,0 +1,313 @@
+#ifndef VARS
+#define VARS
+int a[1500];
+float b[10][15][10];
+__attribute__((noreturn)) void
+noreturn (void)
+{
+  for (;;);
+}
+#endif
+#ifndef SC
+#define SC
+#endif
+#ifndef OMPTGT
+#define OMPTGT
+#endif
+#ifndef OMPTO
+#define OMPTO(v) do {} while (0)
+#endif
+#ifndef OMPFROM
+#define OMPFROM(v) do {} while (0)
+#endif
+
+__attribute__((noinline, noclone)) void
+N(f0) (void)
+{
+  int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 0; i < 1500; i++)
+    a[i] += 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f1) (void)
+{
+  OMPTGT
+#pragma omp F S
+  for (unsigned int i = __INT_MAX__; i < 3000U + __INT_MAX__; i += 2)
+    a[(i - __INT_MAX__) >> 1] -= 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f2) (void)
+{
+  unsigned long long i;
+  OMPTGT
+#pragma omp F S
+  for (i = __LONG_LONG_MAX__ + 4500ULL - 27;
+       i > __LONG_LONG_MAX__ - 27ULL; i -= 3)
+    a[(i + 26LL - __LONG_LONG_MAX__) / 3] -= 4;
+}
+
+__attribute__((noinline, noclone)) void
+N(f3) (long long n1, long long n2, long long s3)
+{
+  OMPTGT
+#pragma omp F S
+  for (long long i = n1 + 23; i > n2 - 25; i -= s3)
+    a[i + 48] += 7;
+}
+
+__attribute__((noinline, noclone)) void
+N(f4) (void)
+{
+  unsigned int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 30; i < 20; i += 2)
+    a[i] += 10;
+}
+
+__attribute__((noinline, noclone)) void
+N(f5) (int n11, int n12, int n21, int n22, int n31, int n32,
+       int s1, int s2, int s3)
+{
+  SC int v1, v2, v3;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = n11; v1 < n12; v1 += s1)
+    for (v2 = n21; v2 < n22; v2 += s2)
+      for (v3 = n31; v3 < n32; v3 += s3)
+	b[v1][v2][v3] += 2.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f6) (int n11, int n12, int n21, int n22, long long n31, long long n32,
+       int s1, int s2, long long int s3)
+{
+  SC int v1, v2;
+  SC long long v3;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = n11; v1 > n12; v1 += s1)
+    for (v2 = n21; v2 > n22; v2 += s2)
+      for (v3 = n31; v3 > n32; v3 += s3)
+	b[v1][v2 / 2][v3] -= 4.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f7) (void)
+{
+  SC unsigned int v1, v3;
+  SC unsigned long long v2;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = 0; v1 < 20; v1 += 2)
+    for (v2 = __LONG_LONG_MAX__ + 16ULL;
+	 v2 > __LONG_LONG_MAX__ - 29ULL; v2 -= 3)
+      for (v3 = 10; v3 > 0; v3--)
+	b[v1 >> 1][(v2 - __LONG_LONG_MAX__ + 64) / 3 - 12][v3 - 1] += 5.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f8) (void)
+{
+  SC long long v1, v2, v3;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = 0; v1 < 20; v1 += 2)
+    for (v2 = 30; v2 < 20; v2++)
+      for (v3 = 10; v3 < 0; v3--)
+	b[v1][v2][v3] += 5.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f9) (void)
+{
+  int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 20; i < 10; i++)
+    {
+      a[i] += 2;
+      noreturn ();
+      a[i] -= 4;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+N(f10) (void)
+{
+  SC int i;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (i = 0; i < 10; i++)
+    for (int j = 10; j < 8; j++)
+      for (long k = -10; k < 10; k++)
+	{
+	  b[i][j][k] += 4;
+	  noreturn ();
+	  b[i][j][k] -= 8;
+	}
+}
+
+__attribute__((noinline, noclone)) void
+N(f11) (int n)
+{
+  int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 20; i < n; i++)
+    {
+      a[i] += 8;
+      noreturn ();
+      a[i] -= 16;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+N(f12) (int n)
+{
+  SC int i;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (i = 0; i < 10; i++)
+    for (int j = n; j < 8; j++)
+      for (long k = -10; k < 10; k++)
+	{
+	  b[i][j][k] += 16;
+	  noreturn ();
+	  b[i][j][k] -= 32;
+	}
+}
+
+__attribute__((noinline, noclone)) void
+N(f13) (void)
+{
+  int *i;
+  OMPTGT
+#pragma omp F S
+  for (i = a; i < &a[1500]; i++)
+    i[0] += 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f14) (void)
+{
+  SC float *i;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (i = &b[0][0][0]; i < &b[0][0][10]; i++)
+    for (float *j = &b[0][15][0]; j > &b[0][0][0]; j -= 10)
+      for (float *k = &b[0][0][10]; k > &b[0][0][0]; --k)
+	b[i - &b[0][0][0]][(j - &b[0][0][0]) / 10 - 1][(k - &b[0][0][0]) - 1]
+	  -= 3.5;
+}
+
+__attribute__((noinline, noclone)) int
+N(test) (void)
+{
+  int i, j, k;
+  for (i = 0; i < 1500; i++)
+    a[i] = i - 25;
+  OMPTO (a);
+  N(f0) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 23)
+      return 1;
+  N(f1) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 25)
+      return 1;
+  N(f2) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 29)
+      return 1;
+  N(f3) (1500LL - 1 - 23 - 48, -1LL + 25 - 48, 1LL);
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  N(f3) (1500LL - 1 - 23 - 48, 1500LL - 1, 7LL);
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  N(f4) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	b[i][j][k] = i - 2.5 + 1.5 * j - 1.5 * k;
+  OMPTO (b);
+  N(f5) (0, 10, 0, 15, 0, 10, 1, 1, 1);
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f5) (0, 10, 30, 15, 0, 10, 4, 5, 6);
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f6) (9, -1, 29, 0, 9, -1, -1, -2, -1);
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i - 4.5 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f7) ();
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f8) ();
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f9) ();
+  N(f10) ();
+  N(f11) (10);
+  N(f12) (12);
+  OMPFROM (a);
+  OMPFROM (b);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f13) ();
+  N(f14) ();
+  OMPFROM (a);
+  OMPFROM (b);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 20)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i - 2.5 + 1.5 * j - 1.5 * k)
+	  return 1;
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-3.c b/libgomp/testsuite/libgomp.c/for-3.c
new file mode 100644
index 0000000..d040f11
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-3.c
@@ -0,0 +1,123 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#if TEST_ALL || TEST_NR == 1
+#define F distribute
+#define G d
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 2
+#define F distribute
+#define G d_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 3
+#define F distribute simd
+#define G ds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 4
+#define F distribute simd
+#define G ds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (5 <= TEST_NR && TEST_NR <= 9)
+#define F distribute parallel for
+#define G dpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (10 <= TEST_NR && TEST_NR <= 14)
+#define F distribute parallel for dist_schedule(static, 128)
+#define G dpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (15 <= TEST_NR && TEST_NR <= 19)
+#define F distribute parallel for simd
+#define G dpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (20 <= TEST_NR && TEST_NR <= 24)
+#define F distribute parallel for simd dist_schedule(static, 128)
+#define G dpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#pragma omp end declare target
+
+int
+main ()
+{
+  int err = 0;
+
+  #pragma omp target teams reduction(|:err)
+  {
+#define DO_TEST_1(test) \
+    do {	      \
+      err |= test (); \
+    } while (0)
+
+#ifdef ONE_TEST
+  DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1(test);
+#include "for-3.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+  }
+
+  if (err)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-3.list b/libgomp/testsuite/libgomp.c/for-3.list
new file mode 100644
index 0000000..6fb25a5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-3.list
@@ -0,0 +1,24 @@
+DO_TEST (test_d_normal)
+DO_TEST (test_d_ds128_normal)
+DO_TEST (test_ds_normal)
+DO_TEST (test_ds_ds128_normal)
+DO_TEST (test_dpf_static)
+DO_TEST (test_dpf_static32)
+DO_TEST (test_dpf_auto)
+DO_TEST (test_dpf_guided32)
+DO_TEST (test_dpf_runtime)
+DO_TEST (test_dpf_ds128_static)
+DO_TEST (test_dpf_ds128_static32)
+DO_TEST (test_dpf_ds128_auto)
+DO_TEST (test_dpf_ds128_guided32)
+DO_TEST (test_dpf_ds128_runtime)
+DO_TEST (test_dpfs_static)
+DO_TEST (test_dpfs_static32)
+DO_TEST (test_dpfs_auto)
+DO_TEST (test_dpfs_guided32)
+DO_TEST (test_dpfs_runtime)
+DO_TEST (test_dpfs_ds128_static)
+DO_TEST (test_dpfs_ds128_static32)
+DO_TEST (test_dpfs_ds128_auto)
+DO_TEST (test_dpfs_ds128_guided32)
+DO_TEST (test_dpfs_ds128_runtime)
diff --git a/libgomp/testsuite/libgomp.c/for-5.c b/libgomp/testsuite/libgomp.c/for-5.c
new file mode 100644
index 0000000..8f12579
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-5.c
@@ -0,0 +1,161 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#if TEST_ALL || (1 <= TEST_NR && TEST_NR <= 5)
+#define F target parallel for
+#define G tpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 6
+#define F target simd
+#define G t_simd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (7 <= TEST_NR && TEST_NR <= 11)
+#define F target parallel for simd
+#define G tpf_simd
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 12
+#define F target teams distribute
+#define G ttd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 13
+#define F target teams distribute
+#define G ttd_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 14
+#define F target teams distribute simd
+#define G ttds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 15
+#define F target teams distribute simd
+#define G ttds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (16 <= TEST_NR && TEST_NR <= 20)
+#define F target teams distribute parallel for
+#define G ttdpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (21 <= TEST_NR && TEST_NR <= 25)
+#define F target teams distribute parallel for dist_schedule(static, 128)
+#define G ttdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (26 <= TEST_NR && TEST_NR <= 30)
+#define F target teams distribute parallel for simd
+#define G ttdpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (31 <= TEST_NR && TEST_NR <= 35)
+#define F target teams distribute parallel for simd dist_schedule(static, 128)
+#define G ttdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+int
+main ()
+{
+#define DO_TEST_1(test)				\
+  do {						\
+    if (test ())				\
+      abort ();					\
+  } while (0)
+
+#ifdef ONE_TEST
+  DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1 (test);
+#include "for-5.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-5.list b/libgomp/testsuite/libgomp.c/for-5.list
new file mode 100644
index 0000000..48d0c3a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-5.list
@@ -0,0 +1,35 @@
+DO_TEST (test_tpf_static)
+DO_TEST (test_tpf_static32)
+DO_TEST (test_tpf_auto)
+DO_TEST (test_tpf_guided32)
+DO_TEST (test_tpf_runtime)
+DO_TEST (test_t_simd_normal)
+DO_TEST (test_tpf_simd_static)
+DO_TEST (test_tpf_simd_static32)
+DO_TEST (test_tpf_simd_auto)
+DO_TEST (test_tpf_simd_guided32)
+DO_TEST (test_tpf_simd_runtime)
+DO_TEST (test_ttd_normal)
+DO_TEST (test_ttd_ds128_normal)
+DO_TEST (test_ttds_normal)
+DO_TEST (test_ttds_ds128_normal)
+DO_TEST (test_ttdpf_static)
+DO_TEST (test_ttdpf_static32)
+DO_TEST (test_ttdpf_auto)
+DO_TEST (test_ttdpf_guided32)
+DO_TEST (test_ttdpf_runtime)
+DO_TEST (test_ttdpf_ds128_static)
+DO_TEST (test_ttdpf_ds128_static32)
+DO_TEST (test_ttdpf_ds128_auto)
+DO_TEST (test_ttdpf_ds128_guided32)
+DO_TEST (test_ttdpf_ds128_runtime)
+DO_TEST (test_ttdpfs_static)
+DO_TEST (test_ttdpfs_static32)
+DO_TEST (test_ttdpfs_auto)
+DO_TEST (test_ttdpfs_guided32)
+DO_TEST (test_ttdpfs_runtime)
+DO_TEST (test_ttdpfs_ds128_static)
+DO_TEST (test_ttdpfs_ds128_static32)
+DO_TEST (test_ttdpfs_ds128_auto)
+DO_TEST (test_ttdpfs_ds128_guided32)
+DO_TEST (test_ttdpfs_ds128_runtime)
diff --git a/libgomp/testsuite/libgomp.c/for-6.c b/libgomp/testsuite/libgomp.c/for-6.c
new file mode 100644
index 0000000..50a866a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-6.c
@@ -0,0 +1,135 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPTGT
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPTGT DO_PRAGMA (omp target)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#if TEST_ALL || TEST_NR == 1
+#define F teams distribute
+#define G td
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 2
+#define F teams distribute
+#define G td_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 3
+#define F teams distribute simd
+#define G tds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 4
+#define F teams distribute simd
+#define G tds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (5 <= TEST_NR && TEST_NR <= 9)
+#define F teams distribute parallel for
+#define G tdpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (10 <= TEST_NR && TEST_NR <= 14)
+#define F teams distribute parallel for dist_schedule(static, 128)
+#define G tdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (15 <= TEST_NR && TEST_NR <= 19)
+#define F teams distribute parallel for simd
+#define G tdpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (20 <= TEST_NR && TEST_NR <= 24)
+#define F teams distribute parallel for simd dist_schedule(static, 128)
+#define G tdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+int
+main ()
+{
+#define DO_TEST_1(test)				\
+  do {						\
+    if (test ())				\
+      abort ();					\
+  } while (0)
+
+#ifdef ONE_TEST
+  DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1 (test);
+#include "for-6.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-6.list b/libgomp/testsuite/libgomp.c/for-6.list
new file mode 100644
index 0000000..438ecff
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-6.list
@@ -0,0 +1,24 @@
+DO_TEST (test_td_normal)
+DO_TEST (test_td_ds128_normal)
+DO_TEST (test_tds_normal)
+DO_TEST (test_tds_ds128_normal)
+DO_TEST (test_tdpf_static)
+DO_TEST (test_tdpf_static32)
+DO_TEST (test_tdpf_auto)
+DO_TEST (test_tdpf_guided32)
+DO_TEST (test_tdpf_runtime)
+DO_TEST (test_tdpf_ds128_static)
+DO_TEST (test_tdpf_ds128_static32)
+DO_TEST (test_tdpf_ds128_auto)
+DO_TEST (test_tdpf_ds128_guided32)
+DO_TEST (test_tdpf_ds128_runtime)
+DO_TEST (test_tdpfs_static)
+DO_TEST (test_tdpfs_static32)
+DO_TEST (test_tdpfs_auto)
+DO_TEST (test_tdpfs_guided32)
+DO_TEST (test_tdpfs_runtime)
+DO_TEST (test_tdpfs_ds128_static)
+DO_TEST (test_tdpfs_ds128_static32)
+DO_TEST (test_tdpfs_ds128_auto)
+DO_TEST (test_tdpfs_ds128_guided32)
+DO_TEST (test_tdpfs_ds128_runtime)
diff --git a/libgomp/testsuite/libgomp.c/target-print-1.c b/libgomp/testsuite/libgomp.c/target-print-1.c
new file mode 100644
index 0000000..5857b87
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-print-1.c
@@ -0,0 +1,17 @@
+/* Ensure that printf on the offload device works.  */
+
+/* { dg-do run } */
+/* { dg-output "The answer is 42(\n|\r\n|\r)+" } */
+
+#include <stdio.h>
+
+int var = 42;
+
+int
+main ()
+{
+#pragma omp target
+    {
+      printf ("The answer is %d\n", var);
+    }
+}
diff --git a/libgomp/testsuite/libgomp.fortran/fortran.exp b/libgomp/testsuite/libgomp.fortran/fortran.exp
index d848ed4..f6b6c59 100644
--- a/libgomp/testsuite/libgomp.fortran/fortran.exp
+++ b/libgomp/testsuite/libgomp.fortran/fortran.exp
@@ -7,7 +7,7 @@
 
 set shlib_ext [get_shlib_extension]
 set lang_library_path	"../libgfortran/.libs"
-set lang_link_flags	"-lgfortran -foffload=-lgfortran"
+set lang_link_flags	"-lgfortran -foffload=-lgfortran -lquadmath"
 if [info exists lang_include_flags] then {
     unset lang_include_flags
 }
diff --git a/libgomp/testsuite/libgomp.fortran/target-print-1.f90 b/libgomp/testsuite/libgomp.fortran/target-print-1.f90
new file mode 100644
index 0000000..f4f00e2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-print-1.f90
@@ -0,0 +1,15 @@
+! Ensure that printf on the offload device works.
+
+! { dg-do run }
+! { dg-output "The answer is 42(\n|\r\n|\r)+" }
+! { dg-xfail-if "no write for nvidia" { openacc_nvidia_accel_selected } }
+
+program main
+  implicit none
+  integer :: var = 42
+
+!$omp target
+  write (0, '("The answer is ", I2)') var
+!$omp end target
+
+end program main
diff --git a/libgomp/testsuite/libgomp.fortran/use_device_ptr1.f90 b/libgomp/testsuite/libgomp.fortran/use_device_ptr1.f90
new file mode 100644
index 0000000..59eb446
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/use_device_ptr1.f90
@@ -0,0 +1,608 @@
+module offloading
+  use iso_c_binding
+  implicit none
+contains
+  subroutine copy3_array_data_int(from, to, N)
+    !$omp declare target
+    type(c_ptr), value :: from, to
+    integer, value :: N
+
+    real(c_double), pointer :: from_ptr(:)
+    real(c_double), pointer :: to_ptr(:)
+    integer :: i
+
+    call c_f_pointer(from, from_ptr, shape=[N])
+    call c_f_pointer(to, to_ptr, shape=[N])
+    !$omp parallel do
+    do i = 1, N
+      to_ptr(i) = 3 * from_ptr(i)
+    end do
+    !$omp end parallel do
+  end subroutine copy3_array_data_int
+
+  subroutine copy3_array_data(from, to, N)
+    type(c_ptr), value :: from, to
+    integer, value :: N
+    !$omp target is_device_ptr(from, to)
+    call copy3_array_data_int(from, to, N)
+    !$omp end target
+  end subroutine copy3_array_data
+
+  subroutine copy3_array1(from, to)
+    real(c_double), target :: from(:), to(:)
+    integer :: N
+    N = size(from)
+
+    !$omp target is_device_ptr(from, to)
+    call copy3_array_data_int(c_loc(from), c_loc(to), N)
+    !$omp end target
+  end subroutine copy3_array1
+
+! ICE - the following code gives (currently) an ICE
+! It is accepted by the frontend but it is invalid
+! OpenMP 5 as only "a dummy argument that does not have the
+! ALLOCATABLE, POINTER or VALUE attribute."
+!
+!  subroutine copy3_array2(from, to)
+!    real(c_double), pointer :: from(:), to(:)
+!    integer :: N
+!    N = size(from)
+!
+!    !$omp target is_device_ptr(from, to)
+!    call copy3_array_data_int(c_loc(from), c_loc(to), N)
+!    !$omp end target
+!  end subroutine copy3_array2
+
+  subroutine copy3_array3(from, to)
+    real(c_double), optional, target :: from(:), to(:)
+    integer :: N
+    N = size(from)
+
+    !$omp target is_device_ptr(from, to)
+    call copy3_array_data_int(c_loc(from), c_loc(to), N)
+    !$omp end target
+  end subroutine copy3_array3
+
+! ICE - the following code gives (currently) an ICE
+! It is accepted by the frontend but it is invalid
+! OpenMP 5 as only "a dummy argument that does not have the
+! ALLOCATABLE, POINTER or VALUE attribute."
+!
+!  subroutine copy3_array4(from, to)
+!    real(c_double), optional, pointer :: from(:), to(:)
+!    integer :: N
+!    N = size(from)
+!
+!    !$omp target is_device_ptr(from, to)
+!    call copy3_array_data_int(c_loc(from), c_loc(to), N)
+!    !$omp end target
+!  end subroutine copy3_array4
+end module offloading
+
+
+
+module offloading2
+  use iso_c_binding
+  use offloading
+  implicit none
+contains
+  ! Same as main program but uses dummy *nonoptional* arguments
+  subroutine use_device_ptr_sub(AA, BB, CC, DD, EE, FF, AptrA, BptrB, N)
+    real(c_double), pointer :: AA(:), BB(:)
+    real(c_double), allocatable, target :: CC(:), DD(:)
+    real(c_double), target :: EE(N), FF(N), dummy(1)
+    real(c_double), pointer :: AptrA(:), BptrB(:)
+    intent(inout) :: AA, BB, CC, DD, EE, FF
+    integer, value :: N
+
+    type(c_ptr) :: tgt_aptr, tgt_bptr, tgt_cptr, tgt_dptr, tgt_eptr, tgt_fptr
+
+    AA = 11.0_c_double
+    BB = 22.0_c_double
+    CC = 33.0_c_double
+    DD = 44.0_c_double
+    EE = 55.0_c_double
+    FF = 66.0_c_double
+
+    ! NOTE: OpenMP 5's use_device_addr is (at time of writing) not yet supported
+
+    ! pointer-type array to use_device_ptr
+    !$omp target data map(to:AA) map(from:BB)
+    !$omp target data map(alloc:dummy) use_device_ptr(AA,BB)
+    call copy3_array_data(c_loc(AA), c_loc(BB), N)
+    !$omp end target data
+    !$omp end target data
+
+    if (any(abs(AA - 11.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    ! allocatable array to use_device_ptr
+    !$omp target data map(to:CC) map(from:DD)
+    !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+    call copy3_array_data(c_loc(CC), c_loc(DD), N)
+    !$omp end target data
+    !$omp end target data
+
+    if (any(abs(CC - 33.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+    if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+    ! fixed-size decriptorless array to use_device_ptr
+    !$omp target data map(to:EE) map(from:FF)
+    !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+    call copy3_array_data(c_loc(EE), c_loc(FF), N)
+    !$omp end target data
+    !$omp end target data
+
+    if (any(abs(EE - 55.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+    if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+
+
+
+    AA = 111.0_c_double
+    BB = 222.0_c_double
+    CC = 333.0_c_double
+    DD = 444.0_c_double
+    EE = 555.0_c_double
+    FF = 666.0_c_double
+
+    ! pointer-type array to use_device_ptr
+    !$omp target data map(to:AA) map(from:BB)
+    !$omp target data map(alloc:dummy) use_device_ptr(AA,BB)
+    tgt_aptr = c_loc(AA)
+    tgt_bptr = c_loc(BB)
+    AptrA => AA
+    BptrB => BB
+    !$omp end target data
+
+    call copy3_array_data(tgt_aptr, tgt_bptr, N)
+    !$omp target update from(BB)
+    if (any(abs(AA - 111.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    AA = 1111.0_c_double
+    !$omp target update to(AA)
+    call copy3_array_data(tgt_aptr, tgt_bptr, N)
+    !$omp target update from(BB)
+    if (any(abs(AA - 1111.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    ! AprtA tests
+    AA = 7.0_c_double
+    !$omp target update to(AA)
+    call copy3_array_data(c_loc(AptrA), c_loc(BptrB), N)
+    !$omp target update from(BB)
+    if (any(abs(AA - 7.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    AA = 77.0_c_double
+    !$omp target update to(AA)
+    call copy3_array1(AptrA, BptrB)
+    !$omp target update from(BB)
+    if (any(abs(AA - 77.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+!    AA = 777.0_c_double
+!    !$omp target update to(AA)
+!    call copy3_array2(AptrA, BptrB)
+!    !$omp target update from(BB)
+!    if (any(abs(AA - 777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+!    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    AA = 7777.0_c_double
+    !$omp target update to(AA)
+    call copy3_array3(AptrA, BptrB)
+    !$omp target update from(BB)
+    if (any(abs(AA - 7777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+!    AA = 77777.0_c_double
+!    !$omp target update to(AA)
+!    call copy3_array4(AptrA, BptrB)
+!    !$omp target update from(BB)
+    !$omp end target data
+!
+!    if (any(abs(AA - 77777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+!    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+
+
+    ! allocatable array to use_device_ptr
+    !$omp target data map(to:CC) map(from:DD)
+    !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+    tgt_cptr = c_loc(CC)
+    tgt_dptr = c_loc(DD)
+    !$omp end target data
+
+    call copy3_array_data(tgt_cptr, tgt_dptr, N)
+    !$omp target update from(DD)
+    if (any(abs(CC - 333.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+    if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+    CC = 3333.0_c_double
+    !$omp target update to(CC)
+    call copy3_array_data(tgt_cptr, tgt_dptr, N)
+    !$omp target update from(DD)
+    !$omp end target data
+
+    if (any(abs(CC - 3333.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+    if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+
+
+    ! fixed-size decriptorless array to use_device_ptr
+    !$omp target data map(to:EE) map(from:FF)
+    !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+    tgt_eptr = c_loc(EE)
+    tgt_fptr = c_loc(FF)
+    !$omp end target data
+
+    call copy3_array_data(tgt_eptr, tgt_fptr, N)
+    !$omp target update from(FF)
+    if (any(abs(EE - 555.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+    if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+
+    EE = 5555.0_c_double
+    !$omp target update to(EE)
+    call copy3_array_data(tgt_eptr, tgt_fptr, N)
+    !$omp target update from(FF)
+    !$omp end target data
+
+    if (any(abs(EE - 5555.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+    if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+  end subroutine use_device_ptr_sub
+
+
+
+  ! Same as main program but uses dummy *optional* arguments
+  subroutine use_device_ptr_sub2(AA, BB, CC, DD, EE, FF, AptrA, BptrB, N)
+    real(c_double), optional, pointer :: AA(:), BB(:)
+    real(c_double), optional, allocatable, target :: CC(:), DD(:)
+    real(c_double), optional, target :: EE(N), FF(N)
+    real(c_double), pointer :: AptrA(:), BptrB(:)
+    intent(inout) :: AA, BB, CC, DD, EE, FF
+    real(c_double), target :: dummy(1)
+    integer, value :: N
+
+    type(c_ptr) :: tgt_aptr, tgt_bptr, tgt_cptr, tgt_dptr, tgt_eptr, tgt_fptr
+
+    AA = 11.0_c_double
+    BB = 22.0_c_double
+    CC = 33.0_c_double
+    DD = 44.0_c_double
+    EE = 55.0_c_double
+    FF = 66.0_c_double
+
+    ! NOTE: OpenMP 5's use_device_addr is (at time of writing) not yet supported
+
+    ! pointer-type array to use_device_ptr
+    !$omp target data map(to:AA) map(from:BB)
+    !$omp target data map(alloc:dummy) use_device_ptr(AA,BB)
+    call copy3_array_data(c_loc(AA), c_loc(BB), N)
+    !$omp end target data
+    !$omp end target data
+
+    if (any(abs(AA - 11.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    ! allocatable array to use_device_ptr
+    !$omp target data map(to:CC) map(from:DD)
+    !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+    call copy3_array_data(c_loc(CC), c_loc(DD), N)
+    !$omp end target data
+    !$omp end target data
+
+    if (any(abs(CC - 33.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+    if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+    ! fixed-size decriptorless array to use_device_ptr
+    !$omp target data map(to:EE) map(from:FF)
+    !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+    call copy3_array_data(c_loc(EE), c_loc(FF), N)
+    !$omp end target data
+    !$omp end target data
+
+    if (any(abs(EE - 55.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+    if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+
+
+
+    AA = 111.0_c_double
+    BB = 222.0_c_double
+    CC = 333.0_c_double
+    DD = 444.0_c_double
+    EE = 555.0_c_double
+    FF = 666.0_c_double
+
+    ! pointer-type array to use_device_ptr
+    !$omp target data map(to:AA) map(from:BB)
+    !$omp target data map(alloc:dummy) use_device_ptr(AA,BB)
+    tgt_aptr = c_loc(AA)
+    tgt_bptr = c_loc(BB)
+    AptrA => AA
+    BptrB => BB
+    !$omp end target data
+
+    call copy3_array_data(tgt_aptr, tgt_bptr, N)
+    !$omp target update from(BB)
+    if (any(abs(AA - 111.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    AA = 1111.0_c_double
+    !$omp target update to(AA)
+    call copy3_array_data(tgt_aptr, tgt_bptr, N)
+    !$omp target update from(BB)
+    if (any(abs(AA - 1111.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    ! AprtA tests
+    AA = 7.0_c_double
+    !$omp target update to(AA)
+    call copy3_array_data(c_loc(AptrA), c_loc(BptrB), N)
+    !$omp target update from(BB)
+    if (any(abs(AA - 7.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    AA = 77.0_c_double
+    !$omp target update to(AA)
+    call copy3_array1(AptrA, BptrB)
+    !$omp target update from(BB)
+    if (any(abs(AA - 77.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+!    AA = 777.0_c_double
+!    !$omp target update to(AA)
+!    call copy3_array2(AptrA, BptrB)
+!    !$omp target update from(BB)
+!    if (any(abs(AA - 777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+!    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+    AA = 7777.0_c_double
+    !$omp target update to(AA)
+    call copy3_array3(AptrA, BptrB)
+    !$omp target update from(BB)
+    if (any(abs(AA - 7777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+!    AA = 77777.0_c_double
+!    !$omp target update to(AA)
+!    call copy3_array4(AptrA, BptrB)
+!    !$omp target update from(BB)
+    !$omp end target data
+!
+!    if (any(abs(AA - 77777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+!    if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+
+
+    ! allocatable array to use_device_ptr
+    !$omp target data map(to:CC) map(from:DD)
+    !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+    tgt_cptr = c_loc(CC)
+    tgt_dptr = c_loc(DD)
+    !$omp end target data
+
+    call copy3_array_data(tgt_cptr, tgt_dptr, N)
+    !$omp target update from(DD)
+    if (any(abs(CC - 333.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+    if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+    CC = 3333.0_c_double
+    !$omp target update to(CC)
+    call copy3_array_data(tgt_cptr, tgt_dptr, N)
+    !$omp target update from(DD)
+    !$omp end target data
+
+    if (any(abs(CC - 3333.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+    if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+
+
+    ! fixed-size decriptorless array to use_device_ptr
+    !$omp target data map(to:EE) map(from:FF)
+    !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+    tgt_eptr = c_loc(EE)
+    tgt_fptr = c_loc(FF)
+    !$omp end target data
+
+    call copy3_array_data(tgt_eptr, tgt_fptr, N)
+    !$omp target update from(FF)
+    if (any(abs(EE - 555.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+    if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+
+    EE = 5555.0_c_double
+    !$omp target update to(EE)
+    call copy3_array_data(tgt_eptr, tgt_fptr, N)
+    !$omp end target data
+
+    if (any(abs(EE - 5555.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+    if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+  end subroutine use_device_ptr_sub2
+end module offloading2
+
+
+
+program omp_device_ptr
+  use iso_c_binding
+  use offloading
+  use offloading2
+  implicit none
+
+  integer, parameter :: N = 1000
+  real(c_double), pointer :: AA(:), BB(:), arg_AA(:), arg_BB(:), arg2_AA(:), arg2_BB(:)
+  real(c_double), allocatable, target :: CC(:), DD(:), arg_CC(:), arg_DD(:), arg2_CC(:), arg2_DD(:)
+  real(c_double), target :: EE(N), FF(N), dummy(1), arg_EE(N), arg_FF(N), arg2_EE(N), arg2_FF(N)
+
+  real(c_double), pointer :: AptrA(:), BptrB(:)
+  type(c_ptr) :: tgt_aptr, tgt_bptr, tgt_cptr, tgt_dptr, tgt_eptr, tgt_fptr
+
+  allocate(AA(N), BB(N), CC(N), DD(N))
+
+  AA = 11.0_c_double
+  BB = 22.0_c_double
+  CC = 33.0_c_double
+  DD = 44.0_c_double
+  EE = 55.0_c_double
+  FF = 66.0_c_double
+
+  ! NOTE: OpenMP 5's use_device_addr is (at time of writing) not yet supported
+
+  ! pointer-type array to use_device_ptr
+  !$omp target data map(to:AA) map(from:BB)
+  !$omp target data map(alloc:dummy) use_device_ptr(AA,BB)
+  call copy3_array_data(c_loc(AA), c_loc(BB), N)
+  !$omp end target data
+  !$omp end target data
+
+  if (any(abs(AA - 11.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+  ! allocatable array to use_device_ptr
+  !$omp target data map(to:CC) map(from:DD)
+  !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+  call copy3_array_data(c_loc(CC), c_loc(DD), N)
+  !$omp end target data
+  !$omp end target data
+
+  if (any(abs(CC - 33.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+  if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+  ! fixed-size decriptorless array to use_device_ptr
+  !$omp target data map(to:EE) map(from:FF)
+  !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+  call copy3_array_data(c_loc(EE), c_loc(FF), N)
+  !$omp end target data
+  !$omp end target data
+
+  if (any(abs(EE - 55.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+  if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+
+
+
+  AA = 111.0_c_double
+  BB = 222.0_c_double
+  CC = 333.0_c_double
+  DD = 444.0_c_double
+  EE = 555.0_c_double
+  FF = 666.0_c_double
+
+  ! pointer-type array to use_device_ptr
+  !$omp target data map(to:AA) map(from:BB)
+  !$omp target data map(alloc:dummy) use_device_ptr(AA,BB)
+  tgt_aptr = c_loc(AA)
+  tgt_bptr = c_loc(BB)
+  AptrA => AA
+  BptrB => BB
+  !$omp end target data
+
+  call copy3_array_data(tgt_aptr, tgt_bptr, N)
+  !$omp target update from(BB)
+  if (any(abs(AA - 111.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+  AA = 1111.0_c_double
+  !$omp target update to(AA)
+  call copy3_array_data(tgt_aptr, tgt_bptr, N)
+  !$omp target update from(BB)
+  if (any(abs(AA - 1111.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+  ! AprtA tests
+  AA = 7.0_c_double
+  !$omp target update to(AA)
+  call copy3_array_data(c_loc(AptrA), c_loc(BptrB), N)
+  !$omp target update from(BB)
+  if (any(abs(AA - 7.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+  AA = 77.0_c_double
+  !$omp target update to(AA)
+  call copy3_array1(AptrA, BptrB)
+  !$omp target update from(BB)
+  if (any(abs(AA - 77.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+!  AA = 777.0_c_double
+!  !$omp target update to(AA)
+!  call copy3_array2(AptrA, BptrB)
+!  !$omp target update from(BB)
+!  if (any(abs(AA - 777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+!  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+  AA = 7777.0_c_double
+  !$omp target update to(AA)
+  call copy3_array3(AptrA, BptrB)
+  !$omp target update from(BB)
+  if (any(abs(AA - 7777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+!  AA = 77777.0_c_double
+!  !$omp target update to(AA)
+!  call copy3_array4(AptrA, BptrB)
+!  !$omp target update from(BB)
+  !$omp end target data
+!
+!  if (any(abs(AA - 77777.0_c_double) > 10.0_c_double * epsilon(AA))) call abort()
+!  if (any(abs(3.0_c_double * AA - BB) > 10.0_c_double * epsilon(AA))) call abort()
+
+
+
+  ! allocatable array to use_device_ptr
+  !$omp target data map(to:CC) map(from:DD)
+  !$omp target data map(alloc:dummy) use_device_ptr(CC,DD)
+  tgt_cptr = c_loc(CC)
+  tgt_dptr = c_loc(DD)
+  !$omp end target data
+
+  call copy3_array_data(tgt_cptr, tgt_dptr, N)
+  !$omp target update from(DD)
+  if (any(abs(CC - 333.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+  if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+  CC = 3333.0_c_double
+  !$omp target update to(CC)
+  call copy3_array_data(tgt_cptr, tgt_dptr, N)
+  !$omp target update from(DD)
+  !$omp end target data
+
+  if (any(abs(CC - 3333.0_c_double) > 10.0_c_double * epsilon(CC))) call abort()
+  if (any(abs(3.0_c_double * CC - DD) > 10.0_c_double * epsilon(CC))) call abort()
+
+
+
+  ! fixed-size decriptorless array to use_device_ptr
+  !$omp target data map(to:EE) map(from:FF)
+  !$omp target data map(alloc:dummy) use_device_ptr(EE,FF)
+  tgt_eptr = c_loc(EE)
+  tgt_fptr = c_loc(FF)
+  !$omp end target data
+
+  call copy3_array_data(tgt_eptr, tgt_fptr, N)
+  !$omp target update from(FF)
+  if (any(abs(EE - 555.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+  if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+
+  EE = 5555.0_c_double
+  !$omp target update to(EE)
+  call copy3_array_data(tgt_eptr, tgt_fptr, N)
+  !$omp target update from(FF)
+  !$omp end target data
+
+  if (any(abs(EE - 5555.0_c_double) > 10.0_c_double * epsilon(EE))) call abort()
+  if (any(abs(3.0_c_double * EE - FF) > 10.0_c_double * epsilon(EE))) call abort()
+
+
+
+  deallocate(AA, BB)  ! Free pointers only
+
+  AptrA => null()
+  BptrB => null()
+  allocate(arg_AA(N), arg_BB(N), arg_CC(N), arg_DD(N))
+  call use_device_ptr_sub(arg_AA, arg_BB, arg_CC, arg_DD, arg_EE, arg_FF, AptrA, BptrB, N)
+  deallocate(arg_AA, arg_BB)
+
+  AptrA => null()
+  BptrB => null()
+  allocate(arg2_AA(N), arg2_BB(N), arg2_CC(N), arg2_DD(N))
+  call use_device_ptr_sub2(arg2_AA, arg2_BB, arg2_CC, arg2_DD, arg2_EE, arg2_FF, AptrA, BptrB, N)
+  deallocate(arg2_AA, arg2_BB)
+end program omp_device_ptr
diff --git a/libgomp/testsuite/libgomp.oacc-c++/c++.exp b/libgomp/testsuite/libgomp.oacc-c++/c++.exp
index dcefa79..86aacff 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/c++.exp
+++ b/libgomp/testsuite/libgomp.oacc-c++/c++.exp
@@ -78,11 +78,13 @@
     }
 
     # Test with all available offload targets, and with offloading disabled.
-    foreach offload_target [concat [split $offload_targets ","] "disable"] {
-	global openacc_device_type
-	set openacc_device_type [offload_target_to_openacc_device_type $offload_target]
-	set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=1"
+    set SAVE_ALWAYS_CFLAGS "$ALWAYS_CFLAGS"
+    global offload_target
+    foreach offload_target [concat [split $offload_targets ":"] "disable"] {
+	set ALWAYS_CFLAGS "$SAVE_ALWAYS_CFLAGS"
 
+	set openacc_device_type [offload_target_to_openacc_device_type $offload_target]
+	set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=\"$offload_target\""
 	switch $openacc_device_type {
 	    "" {
 		unsupported "$subdir $offload_target offloading"
@@ -90,6 +92,9 @@
 	    }
 	    host {
 		set acc_mem_shared 1
+
+		# Special case: pass the empty string instead of "disable".
+		set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=\"\""
 	    }
 	    nvidia {
 		if { ![check_effective_target_openacc_nvidia_accel_present] } {
@@ -106,6 +111,9 @@
 
 		set acc_mem_shared 0
 	    }
+	    gcn {
+		set acc_mem_shared 0
+	    }
 	    default {
 		error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
 	    }
@@ -116,8 +124,6 @@
 	# handling in test cases, by default only build for the offload target
 	# that we're actually going to test.
 	set tagopt "$tagopt -foffload=$offload_target"
-	# Force usage of the corresponding OpenACC device type.
-	setenv ACC_DEVICE_TYPE $openacc_device_type
 
 	# To get better test coverage for device-specific code that is only
 	# ever used in offloading configurations, we'd like more thorough
diff --git a/libgomp/testsuite/libgomp.oacc-c++/deep-copy-12.C b/libgomp/testsuite/libgomp.oacc-c++/deep-copy-12.C
new file mode 100644
index 0000000..771876a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/deep-copy-12.C
@@ -0,0 +1,72 @@
+#include <stdlib.h>
+
+/* Test attach/detach with dereferences of reference to pointer to struct.  */
+
+typedef struct {
+  int *a;
+  int *b;
+  int *c;
+} mystruct;
+
+int main(int argc, char* argv[])
+{
+  const int N = 1024;
+  mystruct *m = (mystruct *) malloc (sizeof (*m));
+  mystruct *&mref = m;
+  int i;
+
+  mref->a = (int *) malloc (N * sizeof (int));
+  m->b = (int *) malloc (N * sizeof (int));
+  m->c = (int *) malloc (N * sizeof (int));
+
+  for (i = 0; i < N; i++)
+    {
+      mref->a[i] = 0;
+      m->b[i] = 0;
+      m->c[i] = 0;
+    }
+
+#pragma acc enter data copyin(m[0:1])
+
+  for (int i = 0; i < 99; i++)
+    {
+      int j;
+#pragma acc parallel loop copy(mref->a[0:N])
+      for (j = 0; j < N; j++)
+        mref->a[j]++;
+#pragma acc parallel loop copy(mref->b[0:N], m->c[5:N-10])
+      for (j = 0; j < N; j++)
+	{
+	  mref->b[j]++;
+	  if (j > 5 && j < N - 5)
+	    m->c[j]++;
+	}
+    }
+
+#pragma acc exit data copyout(m[0:1])
+
+  for (i = 0; i < N; i++)
+    {
+      if (m->a[i] != 99)
+        abort ();
+      if (m->b[i] != 99)
+        abort ();
+      if (i > 5 && i < N-5)
+	{
+	  if (m->c[i] != 99)
+	    abort ();
+	}
+      else
+	{
+	  if (m->c[i] != 0)
+	    abort ();
+	}
+    }
+
+  free (m->a);
+  free (m->b);
+  free (m->c);
+  free (m);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c++/deep-copy-13.C b/libgomp/testsuite/libgomp.oacc-c++/deep-copy-13.C
new file mode 100644
index 0000000..98cf450
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/deep-copy-13.C
@@ -0,0 +1,72 @@
+#include <stdlib.h>
+
+/* Test array slice with reference to pointer.  */
+
+typedef struct {
+  int *a;
+  int *b;
+  int *c;
+} mystruct;
+
+int main(int argc, char* argv[])
+{
+  const int N = 1024;
+  mystruct *m = (mystruct *) malloc (sizeof (*m));
+  int i;
+
+  m->a = (int *) malloc (N * sizeof (int));
+  m->b = (int *) malloc (N * sizeof (int));
+  m->c = (int *) malloc (N * sizeof (int));
+
+  for (i = 0; i < N; i++)
+    {
+      m->a[i] = 0;
+      m->b[i] = 0;
+      m->c[i] = 0;
+    }
+
+#pragma acc enter data copyin(m[0:1])
+
+  for (int i = 0; i < 99; i++)
+    {
+      int j;
+      int *&ptr = m->a;
+#pragma acc parallel loop copy(ptr[0:N])
+      for (j = 0; j < N; j++)
+        ptr[j]++;
+#pragma acc parallel loop copy(m->b[0:N], m->c[5:N-10])
+      for (j = 0; j < N; j++)
+	{
+	  m->b[j]++;
+	  if (j > 5 && j < N - 5)
+	    m->c[j]++;
+	}
+    }
+
+#pragma acc exit data copyout(m[0:1])
+
+  for (i = 0; i < N; i++)
+    {
+      if (m->a[i] != 99)
+        abort ();
+      if (m->b[i] != 99)
+        abort ();
+      if (i > 5 && i < N-5)
+	{
+	  if (m->c[i] != 99)
+	    abort ();
+	}
+      else
+	{
+	  if (m->c[i] != 0)
+	    abort ();
+	}
+    }
+
+  free (m->a);
+  free (m->b);
+  free (m->c);
+  free (m);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c++/firstprivate-int.C b/libgomp/testsuite/libgomp.oacc-c++/firstprivate-int.C
new file mode 100644
index 0000000..86b8722
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/firstprivate-int.C
@@ -0,0 +1,83 @@
+/* Verify the GOMP_MAP_FIRSTPRIVATE_INT optimization on various types.
+   This test is similer to the test in libgomp.oacc-c-c++-common, but
+   it focuses on reference types. */
+
+#include <assert.h>
+#include <stdint.h>
+#include <complex.h>
+
+void test_ref (int8_t &i8i, int8_t &i8o, int16_t &i16i, int16_t &i16o,
+	       int32_t &i32i, int32_t &i32o, int64_t &i64i, int64_t &i64o,
+	       uint8_t &u8i, uint8_t &u8o, uint16_t &u16i, uint16_t &u16o,
+	       uint32_t &u32i, uint32_t &u32o, uint64_t &u64i, uint64_t &u64o,
+	       float &r32i, float &r32o, double &r64i, double &r64o,
+	       int _Complex &cii, int _Complex &cio,
+	       float _Complex &cfi, float _Complex &cfo,
+	       double _Complex &cdi, double _Complex &cdo)
+{
+#pragma acc parallel firstprivate (i8i,i16i,i32i,i64i,u8i,u16i,u32i,u64i) \
+  firstprivate(r32i,r64i,cii,cfi,cdi) copyout(i8o,i16o,i32o,i64o) \
+  copyout(u8o,u16o,u32o,u64o,r32o,r64o,cio,cfo,cdo) num_gangs(1)
+  {
+    i8o = i8i;
+    i16o = i16i;
+    i32o = i32i;
+    i64o = i64i;
+
+    u8o = u8i;
+    u16o = u16i;
+    u32o = u32i;
+    u64o = u64i;
+
+    r32o = r32i;
+    r64o = r64i;
+
+    cio = cii;
+    cfo = cfi;
+    cdo = cdi;
+  }
+}
+
+int
+main ()
+{
+  int8_t  i8i  = -1, i8o;
+  int16_t i16i = -2, i16o;
+  int32_t i32i = -3, i32o;
+  int64_t i64i = -4, i64o;
+
+  uint8_t  u8i  = 1,  u8o;
+  uint16_t u16i = 2, u16o;
+  uint32_t u32i = 3, u32o;
+  uint64_t u64i = 4, u64o;
+
+  float  r32i = .5, r32o;
+  double r64i = .25, r64o;
+
+  int _Complex    cii = 2, cio;
+  float _Complex  cfi = 4, cfo;
+  double _Complex cdi = 8, cdo;
+
+  test_ref (i8i, i8o, i16i, i16o, i32i, i32o, i64i, i64o, u8i, u8o, u16i,
+	    u16o, u32i, u32o, u64i, u64o, r32i, r32o, r64i, r64o, cii, cio,
+	    cfi, cfo, cdi, cdo);
+
+  assert (i8o == i8i);
+  assert (i16o == i16i);
+  assert (i32o == i32i);
+  assert (i64o == i64i);
+
+  assert (u8o == u8i);
+  assert (u16o == u16i);
+  assert (u32o == u32i);
+  assert (u64o == u64i);
+
+  assert (r32o == r32i);
+  assert (r64o == r64i);
+
+  assert (cio == cii);
+  assert (cfo == cfi);
+  assert (cdo == cdi);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-2.C b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-2.C
new file mode 100644
index 0000000..3884f16
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-2.C
@@ -0,0 +1,64 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+void workers (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+  {
+    int i, j;
+#pragma acc loop gang
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop worker
+	for (j = 0; j < 256; j++)
+	  {
+	    int tmpvar;
+	    int &tmpref = tmpvar;
+	    tmpref = (i * 256 + j) * 99;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 99)
+      abort ();
+}
+
+void vectors (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+  {
+    int i, j;
+#pragma acc loop gang worker
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop vector
+	for (j = 0; j < 256; j++)
+	  {
+	    int tmpvar;
+	    int &tmpref = tmpvar;
+	    tmpref = (i * 256 + j) * 101;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 101)
+      abort ();
+}
+
+int main (int argc, char *argv[])
+{
+  workers ();
+  vectors ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
new file mode 100644
index 0000000..c1a10cb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c++/privatized-ref-3.C
@@ -0,0 +1,64 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+void workers (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+  {
+    int i, j;
+    int tmpvar;
+    int &tmpref = tmpvar;
+#pragma acc loop gang
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop worker private(tmpref)
+	for (j = 0; j < 256; j++)
+	  {
+	    tmpref = (i * 256 + j) * 99;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 99)
+      abort ();
+}
+
+void vectors (void)
+{
+  double res[65536];
+  int i;
+
+#pragma acc parallel copyout(res) num_gangs(64) num_workers(64)
+  {
+    int i, j;
+    int tmpvar;
+    int &tmpref = tmpvar;
+#pragma acc loop gang worker
+    for (i = 0; i < 256; i++)
+      {
+#pragma acc loop vector private(tmpref)
+	for (j = 0; j < 256; j++)
+	  {
+	    tmpref = (i * 256 + j) * 101;
+	    res[i * 256 + j] = tmpref;
+	  }
+      }
+  }
+
+  for (i = 0; i < 65536; i++)
+    if (res[i] != i * 101)
+      abort ();
+}
+
+int main (int argc, char *argv[])
+{
+  workers ();
+  vectors ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc-get-property.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc-get-property.c
new file mode 100644
index 0000000..5bf1b84
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc-get-property.c
@@ -0,0 +1,37 @@
+/* Test the `acc_get_property' and '`acc_get_property_string' library
+   functions. */
+/* { dg-do run } */
+
+#include <openacc.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+int main ()
+{
+  const char *s;
+  size_t v;
+  int r;
+
+  /* Verify that the vendor is a proper non-empty string.  */
+  s = acc_get_property_string (0, acc_device_default, acc_property_vendor);
+  r = !s || !strlen (s);
+  if (s)
+    printf ("OpenACC vendor: %s\n", s);
+
+  /* For the rest just check that they do not crash.  */
+  v = acc_get_property (0, acc_device_default, acc_property_memory);
+  if (v)
+    printf ("OpenACC total memory: %zd\n", v);
+  v = acc_get_property (0, acc_device_default, acc_property_free_memory);
+  if (v)
+    printf ("OpenACC free memory: %zd\n", v);
+  s = acc_get_property_string (0, acc_device_default, acc_property_name);
+  if (s)
+    printf ("OpenACC name: %s\n", s);
+  s = acc_get_property_string (0, acc_device_default, acc_property_driver);
+  if (s)
+    printf ("OpenACC driver: %s\n", s);
+
+  return r;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
index bfcb67d..758b1fc 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
@@ -14,7 +14,7 @@
 
   int expect = 1;
   
-#if  ACC_DEVICE_TYPE_host
+#ifdef ACC_DEVICE_TYPE_host
   expect = 0;
 #endif
   
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
index 8112745..0270d06 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
@@ -37,7 +37,7 @@
   }
 
 
-#if !ACC_DEVICE_TYPE_host
+#ifndef ACC_DEVICE_TYPE_host
 
   /* Offloaded.  */
 
@@ -49,7 +49,7 @@
       abort ();
     if (!acc_on_device (acc_device_not_host))
       abort ();
-#if ACC_DEVICE_TYPE_nvidia
+#ifdef ACC_DEVICE_TYPE_nvidia
     if (!acc_on_device (acc_device_nvidia))
       abort ();
 #else
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
new file mode 100644
index 0000000..a9a8c74
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
@@ -0,0 +1,351 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+	  || state == 10
+	  || state == 30
+	  || state == 41
+	  || state == 51
+	  || state == 91
+	  || state == 101
+	  || state == 151);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+	  || state == 11
+	  || state == 40
+	  || state == 50
+	  || state == 90
+	  || state == 100
+	  || state == 150);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 14
+	  || state == 21
+	  || state == 32
+	  || state == 42
+	  || state == 80
+	  || state == 103
+	  || state == 152);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 13
+	  || state == 43
+	  || state == 102
+	  || state == 154);
+  STATE_OP (state, ++);
+}
+
+static void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 12
+	  || state == 20
+	  || state == 31
+	  || state == 44
+	  || state == 81
+	  || state == 104
+	  || state == 153);
+  STATE_OP (state, ++);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 2);
+  }
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 12);
+  }
+  assert (state == 15);
+
+  STATE_OP (state, = 20);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 20);
+  }
+  assert (state == 20);
+
+  STATE_OP (state, = 30);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 31);
+  }
+  assert (state == 33);
+
+  STATE_OP (state, = 40);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 42);
+  }
+  assert (state == 45);
+
+  STATE_OP (state, = 50);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 52);
+  }
+  assert (state == 52);
+
+  STATE_OP (state, = 60);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 60);
+  }
+  assert (state == 60);
+
+  STATE_OP (state, = 70);
+  unreg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 70);
+  }
+  assert (state == 70);
+
+  STATE_OP (state, = 80);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 80);
+  }
+  assert (state == 82);
+
+  STATE_OP (state, = 90);
+  reg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 92);
+  }
+  assert (state == 92);
+
+  STATE_OP (state, = 100);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 102);
+  }
+  assert (state == 105);
+
+  STATE_OP (state, = 110);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 110);
+  }
+  assert (state == 110);
+
+  STATE_OP (state, = 120);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 120);
+  }
+  assert (state == 120);
+
+  STATE_OP (state, = 130);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 130);
+  }
+  assert (state == 130);
+
+  STATE_OP (state, = 140);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 140);
+  }
+  assert (state == 140);
+
+  STATE_OP (state, = 150);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 152);
+  }
+  assert (state == 155);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
new file mode 100644
index 0000000..1af53cb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -0,0 +1,319 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+	  || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 1)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 2)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 10
+	  || state == 110);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 11
+	  || state == 111);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  assert (state == 0);
+
+  acc_init (acc_device_host);
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = 12;
+
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 11);
+  }
+  assert (state == 12);
+
+  STATE_OP (state, = 90);
+  acc_shutdown (acc_device_host);
+  assert (state == 90);
+
+
+  STATE_OP (state, = 100);
+  acc_init (acc_device_default);
+  assert (state == 102);
+
+  STATE_OP (state, = 110);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = 12;
+
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 111);
+  }
+  assert (state == 112);
+
+  STATE_OP (state, = 190);
+  acc_shutdown (acc_device_default);
+  assert (state == 190);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
new file mode 100644
index 0000000..9c1cfbe
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
@@ -0,0 +1,240 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
+   'libgomp.texi'.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+volatile // TODO PR90488
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int num_gangs, num_workers, vector_length;
+static int async;
+
+
+static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 0);
+  STATE_OP (state, = 1);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    const char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  if (num_gangs < 1)
+    assert (event_info->launch_event.num_gangs >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_gangs == num_gangs);
+#else
+      /* No parallelized OpenACC 'kernels' constructs.  Unparallelized OpenACC
+	 'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels.  */
+      assert (event_info->launch_event.num_gangs == 1);
+#endif
+    }
+  if (num_workers < 1)
+    assert (event_info->launch_event.num_workers >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_workers == num_workers);
+#else
+      /* See 'num_gangs' above.  */
+      assert (event_info->launch_event.num_workers == 1);
+#endif
+    }
+  if (vector_length < 1)
+    assert (event_info->launch_event.vector_length >= 1);
+  else if (acc_device_type == acc_device_nvidia) /* ... is special.  */
+    assert (event_info->launch_event.vector_length == 32);
+  else if (acc_device_type == acc_device_gcn) /* ...and so is this.  */
+    assert (event_info->launch_event.vector_length == 64);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.vector_length == vector_length);
+#else
+      /* See 'num_gangs' above.  */
+      assert (event_info->launch_event.vector_length == 1);
+#endif
+    }
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  assert (state == 0);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  assert (state == 0);
+
+  STATE_OP (state, = 0);
+  /* Implicit async.  */
+  async = acc_async_noval;
+  /* Parallelism dimensions: compiler/runtime decides.  */
+  num_gangs = num_workers = vector_length = 0;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No 'acc_ev_enqueue_launch_start'.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  STATE_OP (state, = 0);
+  /* Explicit async: without argument.  */
+  async = acc_async_noval;
+  /* Parallelism dimensions: literal.  */
+  num_gangs = 30;
+  num_workers = 3;
+  vector_length = 5;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  async \
+  num_gangs (30) num_workers (3) vector_length (5)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No 'acc_ev_enqueue_launch_start'.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  STATE_OP (state, = 0);
+  /* Explicit async: variable.  */
+  async = 123;
+  /* Parallelism dimensions: variable.  */
+  num_gangs = 22;
+  num_workers = 5;
+  vector_length = 7;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  async (async) \
+  num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No 'acc_ev_enqueue_launch_start'.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
new file mode 100644
index 0000000..0cb03691
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -0,0 +1,689 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit 'copyin' clauses, to work around "'firstprivate'
+   optimizations", which will cause the value at the point of call to be used
+   (*before* any potential modifications done in callbacks), as opposed to its
+   address being taken, which then later gets dereferenced (*after* any
+   modifications done in callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
+   libgomp.texi.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+/* Do we expect to see 'acc_ev_exit_data_start' and 'acc_ev_exit_data_end'
+   after a compute construct with an 'async' clause?  */
+#define ASYNC_EXIT_DATA 1
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+
+#define STATE_OP(state, op) \
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+#else
+  assert (state == 0
+	  || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+#else
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+#endif
+}
+
+static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 2
+	  || state == 102);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_device_init_start);
+#else
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (event_info->other_event.tool_info == tool_info->nested);
+#else
+  assert (event_info->other_event.tool_info == tool_info);
+#endif
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+#else
+  free (tool_info);
+  tool_info = NULL;
+#endif
+}
+
+static void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 3
+	  || state == 103);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_enter_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+static void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 4
+	  || state == 104);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_enter_data_start);
+
+  assert (prof_info->event_type == acc_ev_enter_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+static void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 5
+#if ASYNC_EXIT_DATA
+	  || state == 105
+#endif
+	  );
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_exit_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+static void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 6
+#if ASYNC_EXIT_DATA
+	  || state == 106
+#endif
+	  );
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_exit_data_start);
+
+  assert (prof_info->event_type == acc_ev_exit_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+static void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 0
+	  || state == 100);
+  if (state == 100)
+    {
+      /* Compensate for the missing 'acc_ev_device_init_start' and
+	 'acc_ev_device_init_end'.  */
+      state += 2;
+    }
+#else
+  if (state == 100)
+    {
+      /* Compensate for the missing 'acc_ev_device_init_start' and
+	 'acc_ev_device_init_end'.  */
+      state += 2;
+    }
+  assert (state == 2
+	  || state == 102);
+#endif
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing 'acc_ev_enter_data_start'.  */
+      state += 1;
+    }
+}
+
+static void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing 'acc_ev_enter_data_end'.  */
+      state += 1;
+      /* Compensate for the missing 'acc_ev_exit_data_start' and
+	 'acc_ev_exit_data_end'.  */
+      state += 2;
+    }
+#if !ASYNC_EXIT_DATA
+  else if (acc_async != acc_async_sync)
+    {
+      /* Compensate for the missing 'acc_ev_exit_data_start' and
+	 'acc_ev_exit_data_end'.  */
+      state += 2;
+    }
+#endif
+  assert (state == 7
+	  || state == 107);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    const char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  assert (event_info->launch_event.num_gangs >= 1);
+  assert (event_info->launch_event.num_workers >= 1);
+  assert (event_info->launch_event.vector_length >= 1);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+}
+
+static void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.kernel_name != NULL);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else if (acc_device_type == acc_device_gcn)
+    assert (api_info->device_api == acc_device_api_other);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg);
+  reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg);
+  reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg);
+  reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg);
+  assert (state == 0);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = acc_async_sync;
+  assert (state == 0);
+
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      asm volatile ("" : : : "memory"); // TODO PR90488
+
+      state_init = state;
+    }
+    assert (state_init == 4);
+  }
+  assert (state == 8);
+
+  STATE_OP (state, = 100);
+
+  acc_async = 12;
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      asm volatile ("" : : : "memory"); // TODO PR90488
+
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 104);
+  }
+  assert (state == 108);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
new file mode 100644
index 0000000..a723ad9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
@@ -0,0 +1,224 @@
+/* Test the 'valid_bytes' magic.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int ev_count_data;
+
+static void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+
+  ++ev_count_data;
+}
+
+static int ev_count_launch;
+
+static void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+
+  ++ev_count_launch;
+}
+
+static int ev_count_other;
+
+static void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+
+  ++ev_count_other;
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
+  reg_ (acc_ev_create, cb_data_event, acc_reg);
+  reg_ (acc_ev_delete, cb_data_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_data_event, acc_reg);
+  reg_ (acc_ev_free, cb_data_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
+}
+
+
+/* Basic struct.  */
+typedef struct A
+{
+  int a;
+  int b;
+#define VALID_BYTES_A \
+  _ACC_PROF_VALID_BYTES_STRUCT (A, b, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (int))
+} A;
+
+/* Add a 'char' field.  */
+typedef struct B
+{
+  int a;
+  int b;
+  char c;
+#define VALID_BYTES_B \
+  _ACC_PROF_VALID_BYTES_STRUCT (B, c, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} B;
+
+/* Add another 'char' field.  */
+typedef struct C
+{
+  int a;
+  int b;
+  char c, d;
+#define VALID_BYTES_C \
+  _ACC_PROF_VALID_BYTES_STRUCT (C, d, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} C;
+
+/* Add two 'void *' fields.  */
+typedef struct D
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+#define VALID_BYTES_D \
+  _ACC_PROF_VALID_BYTES_STRUCT (D, f, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} D;
+
+/* Add another three 'char' fields.  */
+typedef struct E
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+  char g, h, i;
+#define VALID_BYTES_E \
+  _ACC_PROF_VALID_BYTES_STRUCT (E, i, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} E;
+
+
+int main()
+{
+  A A1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
+  assert (VALID_BYTES_A <= sizeof A1);
+  DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
+  assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
+
+  B B1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
+  assert (VALID_BYTES_B <= sizeof B1);
+  DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
+  assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
+
+  assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
+
+  C C1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
+  assert (VALID_BYTES_C <= sizeof C1);
+  DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
+  assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
+
+  assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
+
+  D D1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
+  assert (VALID_BYTES_D <= sizeof D1);
+  DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
+  assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
+
+  assert (VALID_BYTES_D > VALID_BYTES_C);
+
+  E E1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
+  assert (VALID_BYTES_E <= sizeof E1);
+  DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
+  assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
+
+  assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
+
+  ev_count_data = 0;
+  ev_count_launch = 0;
+  ev_count_other = 0;
+
+  /* Trigger tests done in 'cb_*' functions.  */
+  int host;
+#pragma acc parallel copyout (host)
+  {
+    asm volatile ("" : : : "memory"); // TODO PR90488
+
+    host = acc_on_device (acc_device_host);
+  }
+
+  DEBUG_printf ("ev_count_data = %d\n", ev_count_data);
+  if (host)
+    assert (ev_count_data == 0);
+  else
+    {
+      /* We don't know exactly how many data events to expect, but we at least
+	 expect some.  */
+      assert (ev_count_data > 0);
+    }
+
+  DEBUG_printf ("ev_count_launch = %d\n", ev_count_launch);
+  if (host)
+    assert (ev_count_data == 0);
+  else
+    {
+      /* We expect two launch events, 'acc_ev_enqueue_launch_start',
+	 'acc_ev_enqueue_launch_end'.  */
+      assert (ev_count_launch == 2);
+    }
+
+  DEBUG_printf ("ev_count_other = %d\n", ev_count_other);
+  /* We don't know exactly how many other events to expect, but we at least
+     expect 'acc_ev_device_init_start', 'acc_ev_device_init_end',
+     'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'.  */
+  assert (ev_count_other >= 4);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
new file mode 100644
index 0000000..0f9e956
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
@@ -0,0 +1,74 @@
+/* Test the 'version' field of 'acc_prof_info'.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int ev_count;
+
+
+static void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->version == 201711);
+
+  ++ev_count;
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
+  reg_ (acc_ev_create, cb_any_event, acc_reg);
+  reg_ (acc_ev_delete, cb_any_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_any_event, acc_reg);
+  reg_ (acc_ev_free, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
+}
+
+
+int main()
+{
+  ev_count = 0;
+
+  /* Trigger tests done in 'cb_*' functions.  */
+#pragma acc parallel
+  {
+    asm volatile ("" : : : "memory"); // TODO PR90488
+  }
+
+  DEBUG_printf ("ev_count = %d\n", ev_count);
+  /* We don't know exactly how many events to expect, but we at least expect
+     'acc_ev_device_init_start', 'acc_ev_device_init_end',
+     'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'.  */
+  assert (ev_count >= 4);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
index 544b19f..4f9e53d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
@@ -1,3 +1,5 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+
 /* Test mapping of async values to specific underlying queues.  */
 
 #undef NDEBUG
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-nop-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-nop-1.c
index 4ab6736..840052f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-nop-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-nop-1.c
@@ -26,6 +26,8 @@
   acc_device_t d;
 #if defined ACC_DEVICE_TYPE_nvidia
   d = acc_device_nvidia;
+#elif defined ACC_DEVICE_TYPE_gcn
+  d = acc_device_gcn;
 #elif defined ACC_DEVICE_TYPE_host
   d = acc_device_host;
 #else
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
index 842f2de..6a24a2d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
@@ -37,11 +37,9 @@
       imin = idata[i] < imin ? idata[i] : imin;
     }
 
-  if (imax != 1234 || imin != 0)
+  if (imax != 1234 || imin < 0 || imin > 1)
     abort ();
 
-  return 0;
-
   igot = 0;
   iexp = 32;
 
@@ -443,17 +441,16 @@
     }
   }
 
+  int ones = 0, zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (idata[i] != 1)
-	  abort ();
-      }
-    else
-      {
-	if (idata[i] != 0)
-	  abort ();
-      }
+    if (idata[i] == 1)
+      ones++;
+    else if (idata[i] == 0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (iexp != igot)
     abort ();
@@ -491,17 +488,16 @@
       }
   }
 
+  ones = zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (idata[i] != 0)
-	  abort ();
-      }
-    else
-      {
-	if (idata[i] != 1)
-	  abort ();
-      }
+    if (idata[i] == 1)
+      ones++;
+    else if (idata[i] == 0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (iexp != igot)
     abort ();
@@ -579,7 +575,7 @@
   if (lexp != lgot)
     abort ();
 
-  lgot = 2LL;
+  lgot = 2LL << N;
   lexp = 2LL;
 
 #pragma acc data copy (lgot, ldata[0:N])
@@ -587,7 +583,7 @@
 #pragma acc parallel loop
     for (i = 0; i < N; i++)
       {
-        long long expr = 1LL << N;
+        long long expr = 2LL;
 
 #pragma acc atomic capture
         { lgot = lgot / expr; ldata[i] = lgot; }
@@ -1450,17 +1446,16 @@
       }
   }
 
+  ones = zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (fdata[i] != 1.0)
-	  abort ();
-      }
-    else
-      {
-	if (fdata[i] != 0.0)
-	  abort ();
-      }
+    if (fdata[i] == 1.0)
+      ones++;
+    else if (fdata[i] == 0.0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (fexp != fgot)
     abort ();
@@ -1498,17 +1493,16 @@
       }
   }
 
+  ones = zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (fdata[i] != 0.0)
-	  abort ();
-      }
-    else
-      {
-	if (fdata[i] != 1.0)
-	  abort ();
-      }
+    if (fdata[i] == 1.0)
+      ones++;
+    else if (fdata[i] == 0.0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (fexp != fgot)
     abort ();
@@ -1569,7 +1563,7 @@
     abort ();
 
   fgot = 8192.0*8192.0*64.0;
-  fexp = 1.0;
+  fexp = fgot;
 
 #pragma acc data copy (fgot, fdata[0:N])
   {
@@ -1586,15 +1580,15 @@
   if (fexp != fgot)
     abort ();
 
-  fgot = 4.0;
-  fexp = 4.0;
+  fgot = 2.0 * (1LL << N);
+  fexp = 2.0;
 
 #pragma acc data copy (fgot, fdata[0:N])
   {
 #pragma acc parallel loop
     for (i = 0; i < N; i++)
       {
-        long long expr = 1LL << N;
+        long long expr = 2LL;
 
 #pragma acc atomic capture
         { fgot = fgot / expr; fdata[i] = fgot; }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-directives-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-directives-1.c
index dad6d13..c6abc1d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-directives-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/combined-directives-1.c
@@ -1,6 +1,11 @@
 /* This test exercises combined directives.  */
 
+/* This test falls back to host execution because struct alias
+   analysis is deactivated on OpenACC parallel regions.  Consequently,
+   parloops can no longer disambiguate arrays a and b.  */
+
 /* { dg-do run } */
+/* { dg-xfail-if "n/a" { openacc_nvidia_accel_selected } { "-O2" } { "" } } */
 
 #include <stdlib.h>
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/context-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/context-2.c
index 6a52f74..6bdcfe7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/context-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/context-2.c
@@ -182,13 +182,13 @@
         exit (EXIT_FAILURE);
     }
 
+    acc_delete (&h_X[0], N * sizeof (float));
+    acc_delete (&h_Y1[0], N * sizeof (float));
+
     free (h_X);
     free (h_Y1);
     free (h_Y2);
 
-    acc_free (d_X);
-    acc_free (d_Y);
-
     context_check (pctx);
 
     s = cublasDestroy (h);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/context-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/context-4.c
index 71365e8..b403a5c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/context-4.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/context-4.c
@@ -176,13 +176,13 @@
         exit (EXIT_FAILURE);
     }
 
+    acc_delete (&h_X[0], N * sizeof (float));
+    acc_delete (&h_Y1[0], N * sizeof (float));
+
     free (h_X);
     free (h_Y1);
     free (h_Y2);
 
-    acc_free (d_X);
-    acc_free (d_Y);
-
     context_check (pctx);
 
     s = cublasDestroy (h);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/da-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-1.c
new file mode 100644
index 0000000..c1c205d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-1.c
@@ -0,0 +1,103 @@
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <stdlib.h>
+#include <assert.h>
+
+#define n 100
+#define m 100
+
+int b[n][m];
+
+void
+test1 (void)
+{
+  int i, j, *a[100];
+
+  /* Array of pointers form test.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = (int *)malloc (sizeof (int) * m);
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    {
+      for (j = 0; j < m; j++)
+	assert (a[i][j] == b[i][j]);
+      /* Clean up.  */
+      free (a[i]);
+    }
+}
+
+void
+test2 (void)
+{
+  int i, j, **a = (int **) malloc (sizeof (int *) * n);
+
+  /* Separately allocated blocks.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = (int *)malloc (sizeof (int) * m);
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    {
+      for (j = 0; j < m; j++)
+	assert (a[i][j] == b[i][j]);
+      /* Clean up.  */
+      free (a[i]);
+    }
+  free (a);
+}
+
+void
+test3 (void)
+{
+  int i, j, **a = (int **) malloc (sizeof (int *) * n);
+  a[0] = (int *) malloc (sizeof (int) * n * m);
+
+  /* Rows allocated in one contiguous block.  */
+  for (i = 0; i < n; i++)
+    {
+      a[i] = *a + i * m;
+      for (j = 0; j < m; j++)
+	b[i][j] = j - i;
+    }
+
+  #pragma acc parallel loop copyout(a[0:n][0:m]) copyin(b)
+  for (i = 0; i < n; i++)
+    #pragma acc loop
+    for (j = 0; j < m; j++)
+      a[i][j] = b[i][j];
+
+  for (i = 0; i < n; i++)
+    for (j = 0; j < m; j++)
+      assert (a[i][j] == b[i][j]);
+
+  free (a[0]);
+  free (a);
+}
+
+int
+main (void)
+{
+  test1 ();
+  test2 ();
+  test3 ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/da-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-2.c
new file mode 100644
index 0000000..6ee7855
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-2.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <assert.h>
+#include "da-utils.h"
+
+int
+main (void)
+{
+  int n = 10;
+  int ***a = (int ***) create_da (sizeof (int), n, 3);
+  int ***b = (int ***) create_da (sizeof (int), n, 3);
+  int ***c = (int ***) create_da (sizeof (int), n, 3);
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	{
+	  a[i][j][k] = i + j * k + k;
+	  b[i][j][k] = j + k * i + i * j;
+	  c[i][j][k] = a[i][j][k];
+	}
+
+  #pragma acc parallel copy (a[0:n][0:n][0:n]) copyin (b[0:n][0:n][0:n])
+  {
+    for (int i = 0; i < n; i++)
+      for (int j = 0; j < n; j++)
+	for (int k = 0; k < n; k++)
+	  a[i][j][k] += b[k][j][i] + i + j + k;
+  }
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	assert (a[i][j][k] == c[i][j][k] + b[k][j][i] + i + j + k);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/da-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-3.c
new file mode 100644
index 0000000..877c6df
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-3.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <assert.h>
+#include "da-utils.h"
+
+int main (void)
+{
+  int n = 20, x = 5, y = 12;
+  int *****a = (int *****) create_da (sizeof (int), n, 5);
+
+  int sum1 = 0, sum2 = 0, sum3 = 0;
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	for (int l = 0; l < n; l++)
+	  for (int m = 0; m < n; m++)
+	    {
+	      a[i][j][k][l][m] = 1;
+	      sum1++;
+	    }
+
+  #pragma acc parallel copy (a[x:y][x:y][x:y][x:y][x:y]) copy(sum2)
+  {
+    for (int i = x; i < x + y; i++)
+      for (int j = x; j < x + y; j++)
+	for (int k = x; k < x + y; k++)
+	  for (int l = x; l < x + y; l++)
+	    for (int m = x; m < x + y; m++)
+	      {
+		a[i][j][k][l][m] = 0;
+		sum2++;
+	      }
+  }
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	for (int l = 0; l < n; l++)
+	  for (int m = 0; m < n; m++)
+	    sum3 += a[i][j][k][l][m];
+
+  assert (sum1 == sum2 + sum3);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/da-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-4.c
new file mode 100644
index 0000000..2059c5f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-4.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <assert.h>
+#include "da-utils.h"
+
+int main (void)
+{
+  int n = 128;
+  double ***a = (double ***) create_da (sizeof (double), n, 3);
+  double ***b = (double ***) create_da (sizeof (double), n, 3);
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	a[i][j][k] = i + j + k + i * j * k;
+
+  /* This test exercises async copyout of dynamic array rows.  */
+  #pragma acc parallel copyin(a[0:n][0:n][0:n]) copyout(b[0:n][0:n][0:n]) async(5)
+  {
+    #pragma acc loop gang
+    for (int i = 0; i < n; i++)
+      #pragma acc loop vector
+      for (int j = 0; j < n; j++)
+	for (int k = 0; k < n; k++)
+	  b[i][j][k] = a[i][j][k] * 2.0;
+  }
+
+  #pragma acc wait (5)
+
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < n; j++)
+      for (int k = 0; k < n; k++)
+	assert (b[i][j][k] == a[i][j][k] * 2.0);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/da-utils.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-utils.h
new file mode 100644
index 0000000..2f87795
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/da-utils.h
@@ -0,0 +1,44 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+
+/* Allocate and create a pointer based NDIMS-dimensional array,
+   each dimension DIMLEN long, with ELSIZE sized data elements.  */
+void *
+create_da (size_t elsize, int dimlen, int ndims)
+{
+  size_t blk_size = 0;
+  size_t n = 1;
+
+  for (int i = 0; i < ndims - 1; i++)
+    {
+      n *= dimlen;
+      blk_size += sizeof (void *) * n;
+    }
+  size_t data_rows_num = n;
+  size_t data_rows_offset = blk_size;
+  blk_size += elsize * n * dimlen;
+
+  void *blk = (void *) malloc (blk_size);
+  memset (blk, 0, blk_size);
+  void **curr_dim = (void **) blk;
+  n = 1;
+
+  for (int d = 0; d < ndims - 1; d++)
+    {
+      uintptr_t next_dim = (uintptr_t) (curr_dim + n * dimlen);
+      size_t next_dimlen = dimlen * (d < ndims - 2 ? sizeof (void *) : elsize);
+
+      for (int b = 0; b < n; b++)
+        for (int i = 0; i < dimlen; i++)
+	  if (d < ndims - 1)
+	    curr_dim[b * dimlen + i]
+	      = (void*) (next_dim + b * dimlen * next_dimlen + i * next_dimlen);
+
+      n *= dimlen;
+      curr_dim = (void**) next_dim;
+    }
+  assert (n == data_rows_num);
+  return blk;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
index e9d1eda..98d0c97 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
@@ -155,11 +155,16 @@
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
+  acc_wait_async (14, 10);
   acc_copyout_async (a, nbytes, 10);
+  acc_wait_async (14, 11);
   acc_copyout_async (b, nbytes, 11);
+  acc_wait_async (14, 12);
   acc_copyout_async (c, nbytes, 12);
+  acc_wait_async (14, 13);
   acc_copyout_async (d, nbytes, 13);
   acc_copyout_async (e, nbytes, 14);
+  acc_wait_async (14, 15);
   acc_delete_async (&N, sizeof (int), 15);
   acc_wait_all ();
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index 2fc4a59..6f330ee 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -149,11 +149,16 @@
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
+#pragma acc wait (14) async (10)
 #pragma acc exit data copyout (a[0:N]) async (10)
+#pragma acc wait (14) async (11)
 #pragma acc exit data copyout (b[0:N]) async (11)
+#pragma acc wait (14) async (12)
 #pragma acc exit data copyout (c[0:N]) async (12)
+#pragma acc wait (14) async (13)
 #pragma acc exit data copyout (d[0:N]) async (13)
 #pragma acc exit data copyout (e[0:N]) async (14)
+#pragma acc wait (14) async (15)
 #pragma acc exit data delete (N) async (15)
 #pragma acc wait
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c
new file mode 100644
index 0000000..d8d7067
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-1.c
@@ -0,0 +1,24 @@
+#include <stdlib.h>
+#include <assert.h>
+
+struct dc
+{
+  int a;
+  int *b;
+};
+
+int
+main ()
+{
+  int n = 100, i;
+  struct dc v = { .a = 3, .b = (int *) malloc (sizeof (int) * n) };
+
+#pragma acc parallel loop copy(v.a, v.b[:n])
+  for (i = 0; i < n; i++)
+    v.b[i] = v.a;
+
+  for (i = 0; i < 10; i++)
+    assert (v.b[i] == v.a);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
new file mode 100644
index 0000000..2ccb5f5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
@@ -0,0 +1,55 @@
+#include <stdlib.h>
+
+#define ITERATIONS 1023
+
+/* Test asynchronous attach and detach operation.  */
+
+typedef struct {
+  int *a;
+  int *b;
+} mystruct;
+
+int
+main (int argc, char* argv[])
+{
+  const int N = 1024;
+  mystruct m;
+  int i;
+
+  m.a = (int *) malloc (N * sizeof (int));
+  m.b = (int *) malloc (N * sizeof (int));
+
+  for (i = 0; i < N; i++)
+    {
+      m.a[i] = 0;
+      m.b[i] = 0;
+    }
+
+#pragma acc enter data copyin(m)
+
+  for (int i = 0; i < ITERATIONS; i++)
+    {
+      int j;
+#pragma acc parallel loop copy(m.a[0:N]) async(0)
+      for (j = 0; j < N; j++)
+        m.a[j]++;
+#pragma acc parallel loop copy(m.b[0:N]) async(1)
+      for (j = 0; j < N; j++)
+        m.b[j]++;
+    }
+
+#pragma acc exit data copyout(m) wait(0, 1)
+
+  for (i = 0; i < N; i++)
+    {
+      if (m.a[i] != ITERATIONS)
+	abort ();
+      if (m.b[i] != ITERATIONS)
+	abort ();
+    }
+
+  free (m.a);
+  free (m.b);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-11.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-11.c
new file mode 100644
index 0000000..ed8ddcc
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-11.c
@@ -0,0 +1,72 @@
+#include <stdlib.h>
+
+/* Test multiple struct dereferences on one directive, and slices starting at
+   non-zero.  */
+
+typedef struct {
+  int *a;
+  int *b;
+  int *c;
+} mystruct;
+
+int main(int argc, char* argv[])
+{
+  const int N = 1024;
+  mystruct *m = (mystruct *) malloc (sizeof (*m));
+  int i;
+
+  m->a = (int *) malloc (N * sizeof (int));
+  m->b = (int *) malloc (N * sizeof (int));
+  m->c = (int *) malloc (N * sizeof (int));
+
+  for (i = 0; i < N; i++)
+    {
+      m->a[i] = 0;
+      m->b[i] = 0;
+      m->c[i] = 0;
+    }
+
+#pragma acc enter data copyin(m[0:1])
+
+  for (int i = 0; i < 99; i++)
+    {
+      int j;
+#pragma acc parallel loop copy(m->a[0:N])
+      for (j = 0; j < N; j++)
+        m->a[j]++;
+#pragma acc parallel loop copy(m->b[0:N], m->c[5:N-10])
+      for (j = 0; j < N; j++)
+	{
+	  m->b[j]++;
+	  if (j > 5 && j < N - 5)
+	    m->c[j]++;
+	}
+    }
+
+#pragma acc exit data copyout(m[0:1])
+
+  for (i = 0; i < N; i++)
+    {
+      if (m->a[i] != 99)
+        abort ();
+      if (m->b[i] != 99)
+        abort ();
+      if (i > 5 && i < N-5)
+	{
+	  if (m->c[i] != 99)
+	    abort ();
+	}
+      else
+	{
+	  if (m->c[i] != 0)
+	    abort ();
+	}
+    }
+
+  free (m->a);
+  free (m->b);
+  free (m->c);
+  free (m);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-14.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-14.c
new file mode 100644
index 0000000..04c70ae
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-14.c
@@ -0,0 +1,63 @@
+#include <openacc.h>
+#include <stdlib.h>
+
+/* Test attach/detach operation with chained dereferences.  */
+
+typedef struct mystruct {
+  int *a;
+  struct mystruct *next;
+} mystruct;
+
+int
+main (int argc, char* argv[])
+{
+  const int N = 1024;
+  mystruct *m = (mystruct *) malloc (sizeof (*m));
+  int i;
+
+  m->a = (int *) malloc (N * sizeof (int));
+  m->next = (mystruct *) malloc (sizeof (*m));
+  m->next->a = (int *) malloc (N * sizeof (int));
+  m->next->next = NULL;
+
+  for (i = 0; i < N; i++)
+    {
+      m->a[i] = 0;
+      m->next->a[i] = 0;
+    }
+
+#pragma acc enter data copyin(m[0:1])
+  acc_copyin (m->next, sizeof (*m));
+
+  for (int i = 0; i < 99; i++)
+    {
+      int j;
+      acc_copyin (m->next->a, N * sizeof (int));
+      acc_attach ((void **) &m->next);
+      /* This will attach only the innermost pointer, i.e. "a[0:N]".  That's
+	 why we have to attach the "m->next" pointer manually above.  */
+#pragma acc parallel loop copy(m->next->a[0:N])
+      for (j = 0; j < N; j++)
+        m->next->a[j]++;
+      acc_detach ((void **) &m->next);
+      acc_copyout (m->next->a, N * sizeof (int));
+    }
+
+  acc_copyout (m->next, sizeof (*m));
+#pragma acc exit data copyout(m[0:1])
+
+  for (i = 0; i < N; i++)
+    {
+      if (m->a[i] != 0)
+        abort ();
+      if (m->next->a[i] != 99)
+        abort ();
+    }
+
+  free (m->next->a);
+  free (m->next);
+  free (m->a);
+  free (m);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-2.c
new file mode 100644
index 0000000..7e26e9a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-2.c
@@ -0,0 +1,29 @@
+#include <assert.h>
+#include <stdlib.h>
+
+int
+main(int argc, char* argv[])
+{
+  struct foo {
+    int *a, *b, c, d, *e;
+  } s;
+
+  s.a = (int *) malloc (16 * sizeof (int));
+  s.b = (int *) malloc (16 * sizeof (int));
+  s.e = (int *) malloc (16 * sizeof (int));
+
+  #pragma acc data copy(s)
+  {
+    #pragma acc data copy(s.a[0:10])
+    {
+      #pragma acc parallel loop attach(s.a)
+      for (int i = 0; i < 10; i++)
+	s.a[i] = i;
+    }
+  }
+
+  for (int i = 0; i < 10; i++)
+    assert (s.a[i] == i);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-3.c
new file mode 100644
index 0000000..cec764b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-3.c
@@ -0,0 +1,34 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <openacc.h>
+
+int
+main ()
+{
+  int n = 100, i;
+  int *a = (int *) malloc (sizeof (int) * n);
+  int *b;
+
+  for (i = 0; i < n; i++)
+    a[i] = i+1;
+
+#pragma acc enter data copyin(a[:n]) create(b)
+
+  b = a;
+  acc_attach ((void **)&b);
+
+#pragma acc parallel loop present (b[:n])
+  for (i = 0; i < n; i++)
+    b[i] = i+1;
+
+  acc_detach ((void **)&b);
+
+#pragma acc exit data copyout(a[:n], b)
+
+  for (i = 0; i < 10; i++)
+    assert (a[i] == b[i]);
+
+  free (a);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-4.c
new file mode 100644
index 0000000..8874ca0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-4.c
@@ -0,0 +1,87 @@
+#include <assert.h>
+#include <stdlib.h>
+
+#define LIST_LENGTH 10
+
+struct node
+{
+  struct node *next;
+  int val;
+};
+
+int
+sum_nodes (struct node *head)
+{
+  int i = 0, sum = 0;
+
+#pragma acc parallel reduction(+:sum) present(head[:1])
+  {
+    for (; head != NULL; head = head->next)
+      sum += head->val;
+  }
+
+  return sum;
+}
+
+void
+insert (struct node *head, int val)
+{
+  struct node *n = (struct node *) malloc (sizeof (struct node));
+
+  if (head->next)
+    {
+#pragma acc exit data detach(head->next)
+    }
+
+  n->val = val;
+  n->next = head->next;
+  head->next = n;
+
+#pragma acc enter data copyin(n[:1])
+#pragma acc enter data attach(head->next)
+  if (n->next)
+    {
+#pragma acc enter data attach(n->next)
+    }
+}
+
+void
+destroy (struct node *head)
+{
+  while (head->next != NULL)
+    {
+#pragma acc exit data detach(head->next)
+      struct node * n = head->next;
+      head->next = n->next;
+      if (n->next)
+	{
+#pragma acc exit data detach(n->next)
+	}
+#pragma acc exit data delete (n[:1])
+      if (head->next)
+	{
+#pragma acc enter data attach(head->next)
+	}
+      free (n);
+    }
+}
+
+int
+main ()
+{
+  struct node list = { .next = NULL, .val = 0 };
+  int i;
+
+#pragma acc enter data copyin(list)
+
+  for (i = 0; i < LIST_LENGTH; i++)
+    insert (&list, i + 1);
+
+  assert (sum_nodes (&list) == (LIST_LENGTH * LIST_LENGTH + LIST_LENGTH) / 2);
+
+  destroy (&list);
+
+#pragma acc exit data delete(list)
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-5.c
new file mode 100644
index 0000000..89cafbb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-5.c
@@ -0,0 +1,81 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <openacc.h>
+
+struct node
+{
+  struct node *next;
+  int val;
+};
+
+int
+sum_nodes (struct node *head)
+{
+  int i = 0, sum = 0;
+
+#pragma acc parallel reduction(+:sum) present(head[:1])
+  {
+    for (; head != NULL; head = head->next)
+      sum += head->val;
+  }
+
+  return sum;
+}
+
+void
+insert (struct node *head, int val)
+{
+  struct node *n = (struct node *) malloc (sizeof (struct node));
+
+  if (head->next)
+    acc_detach ((void **) &head->next);
+
+  n->val = val;
+  n->next = head->next;
+  head->next = n;
+
+  acc_copyin (n, sizeof (struct node));
+  acc_attach((void **) &head->next);
+
+  if (n->next)
+    acc_attach ((void **) &n->next);
+}
+
+void
+destroy (struct node *head)
+{
+  while (head->next != NULL)
+    {
+      acc_detach ((void **) &head->next);
+      struct node * n = head->next;
+      head->next = n->next;
+      if (n->next)
+	acc_detach ((void **) &n->next);
+
+      acc_delete (n, sizeof (struct node));
+      if (head->next)
+	acc_attach((void **) &head->next);
+
+      free (n);
+    }
+}
+
+int
+main ()
+{
+  struct node list = { .next = NULL, .val = 0 };
+  int i;
+
+  acc_copyin (&list, sizeof (struct node));
+
+  for (i = 0; i < 10; i++)
+    insert (&list, 2);
+
+  assert (sum_nodes (&list) == 10 * 2);
+
+  destroy (&list);
+
+  acc_delete (&list, sizeof (struct node));
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-6.c
new file mode 100644
index 0000000..81c1c5e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-6.c
@@ -0,0 +1,59 @@
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <openacc.h>
+
+struct dc
+{
+  int a;
+  int **b;
+};
+
+int
+main ()
+{
+  int n = 100, i, j, k;
+  struct dc v = { .a = 3 };
+
+  v.b = (int **) malloc (sizeof (int *) * n);
+  for (i = 0; i < n; i++)
+    v.b[i] = (int *) malloc (sizeof (int) * n);
+
+  for (k = 0; k < 16; k++)
+    {
+#pragma acc data copy(v)
+      {
+#pragma acc data copy(v.b[:n])
+	{
+	  for (i = 0; i < n; i++)
+	    {
+	      acc_copyin (v.b[i], sizeof (int) * n);
+	      acc_attach ((void **) &v.b[i]);
+	    }
+
+#pragma acc parallel loop
+	  for (i = 0; i < n; i++)
+	    for (j = 0; j < n; j++)
+	      v.b[i][j] = v.a + i + j;
+
+	  for (i = 0; i < n; i++)
+	    {
+	      acc_detach ((void **) &v.b[i]);
+	      acc_copyout (v.b[i], sizeof (int) * n);
+	    }
+	}
+      }
+
+      for (i = 0; i < n; i++)
+	for (j = 0; j < n; j++)
+	  assert (v.b[i][j] == v.a + i + j);
+
+      assert (!acc_is_present (&v, sizeof (v)));
+      assert (!acc_is_present (v.b, sizeof (int *) * n));
+      for (i = 0; i < n; i++)
+        assert (!acc_is_present (v.b[i], sizeof (int) * n));
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c
new file mode 100644
index 0000000..a59047a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-7.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <openacc.h>
+
+struct dc
+{
+  int a;
+  int *b;
+};
+
+int
+main ()
+{
+  int n = 100, i, j, k;
+  struct dc v = { .a = 3 };
+
+  v.b = (int *) malloc (sizeof (int) * n);
+
+  for (k = 0; k < 16; k++)
+    {
+      /* Here, we do not explicitly copy the enclosing structure, but work
+	 with fields directly.  Make sure attachment counters and reference
+	 counters work properly in that case.  */
+#pragma acc enter data copyin(v.a, v.b[0:n])
+#pragma acc enter data pcopyin(v.b[0:n])
+#pragma acc enter data pcopyin(v.b[0:n])
+
+#pragma acc parallel loop present(v.a, v.b)
+      for (i = 0; i < n; i++)
+	v.b[i] = v.a + i;
+
+#pragma acc exit data copyout(v.b[:n]) finalize
+#pragma acc exit data delete(v.a)
+
+      for (i = 0; i < n; i++)
+	assert (v.b[i] == v.a + i);
+
+      assert (!acc_is_present (&v, sizeof (v)));
+      assert (!acc_is_present (v.b, sizeof (int *) * n));
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c
new file mode 100644
index 0000000..0ca5990
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-8.c
@@ -0,0 +1,54 @@
+/* { dg-do run { target { ! openacc_host_selected } } } */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <openacc.h>
+
+struct dc
+{
+  int a;
+  int *b;
+  int *c;
+  int *d;
+};
+
+int
+main ()
+{
+  int n = 100, i, j, k;
+  struct dc v = { .a = 3 };
+
+  v.b = (int *) malloc (sizeof (int) * n);
+  v.c = (int *) malloc (sizeof (int) * n);
+  v.d = (int *) malloc (sizeof (int) * n);
+
+#pragma acc enter data copyin(v)
+
+  for (k = 0; k < 16; k++)
+    {
+#pragma acc enter data copyin(v.a, v.b[:n], v.c[:n], v.d[:n])
+
+#pragma acc parallel loop
+      for (i = 0; i < n; i++)
+	v.b[i] = v.a + i;
+
+#pragma acc exit data copyout(v.b[:n])
+#pragma acc exit data copyout(v.c[:n])
+#pragma acc exit data copyout(v.d[:n])
+#pragma acc exit data copyout(v.a)
+
+      for (i = 0; i < n; i++)
+	assert (v.b[i] == v.a + i);
+
+      assert (acc_is_present (&v, sizeof (v)));
+      assert (!acc_is_present (v.b, sizeof (int *) * n));
+      assert (!acc_is_present (v.c, sizeof (int *) * n));
+      assert (!acc_is_present (v.d, sizeof (int *) * n));
+    }
+
+#pragma acc exit data copyout(v)
+
+  assert (!acc_is_present (&v, sizeof (v)));
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-9.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-9.c
new file mode 100644
index 0000000..28535c9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-9.c
@@ -0,0 +1,53 @@
+#include <stdlib.h>
+
+typedef struct {
+  int *a;
+  int *b;
+} mystruct;
+
+int
+main (int argc, char* argv[])
+{
+  const int N = 1024;
+  mystruct *m = (mystruct *) malloc (sizeof (*m));
+  int i;
+
+  m->a = (int *) malloc (N * sizeof (int));
+  m->b = (int *) malloc (N * sizeof (int));
+
+  for (i = 0; i < N; i++)
+    {
+      m->a[i] = 0;
+      m->b[i] = 0;
+    }
+
+#pragma acc enter data copyin(m[0:1])
+
+  for (int i = 0; i < 99; i++)
+    {
+      int j;
+      int *ptr = m->a;
+#pragma acc parallel loop copy(m->a[0:N])
+      for (j = 0; j < N; j++)
+        m->a[j]++;
+#pragma acc parallel loop copy(m->b[0:N]) 
+      for (j = 0; j < N; j++)
+	m->b[j]++;
+    }
+
+#pragma acc exit data copyout(m[0:1])
+
+  for (i = 0; i < N; i++)
+    {
+      if (m->a[i] != 99)
+        abort ();
+      if (m->b[i] != 99)
+        abort ();
+    }
+
+  free (m->a);
+  free (m->b);
+  free (m);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c
index 689a443..14bc3af 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-1.c
@@ -117,6 +117,8 @@
     arr[i] = 3;
 
 #pragma acc parallel firstprivate(x) copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 119 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 119 } */
   {
 #pragma acc loop gang
     for (i = 0; i < 32; i++)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c
new file mode 100644
index 0000000..6d14599
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/firstprivate-int.c
@@ -0,0 +1,67 @@
+/* Verify the GOMP_MAP_FIRSTPRIVATE_INT optimization on various types.  */
+
+#include <assert.h>
+#include <stdint.h>
+#include <complex.h>
+
+int
+main ()
+{
+  int8_t  i8i  = -1, i8o;
+  int16_t i16i = -2, i16o;
+  int32_t i32i = -3, i32o;
+  int64_t i64i = -4, i64o;
+
+  uint8_t  u8i  = 1,  u8o;
+  uint16_t u16i = 2, u16o;
+  uint32_t u32i = 3, u32o;
+  uint64_t u64i = 4, u64o;
+
+  float  r32i = .5, r32o;
+  double r64i = .25, r64o;
+
+  int _Complex    cii = 2, cio;
+  float _Complex  cfi = 4, cfo;
+  double _Complex cdi = 8, cdo;
+
+#pragma acc parallel firstprivate (i8i,i16i,i32i,i64i,u8i,u16i,u32i,u64i) \
+  firstprivate(r32i,r64i,cii,cfi,cdi) copyout(i8o,i16o,i32o,i64o) \
+  copyout(u8o,u16o,u32o,u64o,r32o,r64o,cio,cfo,cdo) num_gangs(1)
+  {
+    i8o = i8i;
+    i16o = i16i;
+    i32o = i32i;
+    i64o = i64i;
+
+    u8o = u8i;
+    u16o = u16i;
+    u32o = u32i;
+    u64o = u64i;
+
+    r32o = r32i;
+    r64o = r64i;
+
+    cio = cii;
+    cfo = cfi;
+    cdo = cdi;
+  }
+
+  assert (i8o == i8i);
+  assert (i16o == i16i);
+  assert (i32o == i32i);
+  assert (i64o == i64i);
+
+  assert (u8o == u8i);
+  assert (u16o == u16i);
+  assert (u32o == u32i);
+  assert (u64o == u64i);
+
+  assert (r32o == r32i);
+  assert (r64o == r64i);
+
+  assert (cio == cii);
+  assert (cfo == cfi);
+  assert (cdo == cdi);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/function-not-offloaded.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/function-not-offloaded.c
index fdf4eb0..517004a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/function-not-offloaded.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/function-not-offloaded.c
@@ -1,11 +1,11 @@
 /* { dg-do link } */
-/* { dg-excess-errors "lto1, mkoffload and lto-wrapper fatal errors" { target openacc_nvidia_accel_selected } } */
+/* { dg-excess-errors "lto1, mkoffload and lto-wrapper fatal errors" { target { openacc_nvidia_accel_selected || openacc_amdgcn_accel_selected } } } */
 
 int var;
 #pragma acc declare create (var)
 
 void __attribute__((noinline, noclone))
-foo () /* { dg-error "function 'foo' has been referenced in offloaded code but hasn't been marked to be included in the offloaded code" "" { target openacc_nvidia_accel_selected } } */
+foo () /* { dg-error "function 'foo' has been referenced in offloaded code but hasn't been marked to be included in the offloaded code" "" { target { openacc_nvidia_accel_selected || openacc_amdgcn_accel_selected } } } */
 {
   var++;
 }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c
new file mode 100644
index 0000000..f378346
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c
@@ -0,0 +1,38 @@
+#include <assert.h>
+
+int main (void)
+{
+  int ret;
+
+  #pragma acc parallel num_gangs(1) num_workers(32) copyout(ret)
+  {
+    int w = 0;
+
+    #pragma acc loop worker
+    for (int i = 0; i < 32; i++)
+      {
+        #pragma acc atomic update
+	w++;
+      }
+
+    ret = (w == 32);
+  }
+  assert (ret);
+
+  #pragma acc parallel num_gangs(1) vector_length(32) copyout(ret)
+  {
+    int v = 0;
+
+    #pragma acc loop vector
+    for (int i = 0; i < 32; i++)
+      {
+        #pragma acc atomic update
+	v++;
+      }
+
+    ret = (v == 32);
+  }
+  assert (ret);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-6.c
new file mode 100644
index 0000000..c5744fe
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-6.c
@@ -0,0 +1,53 @@
+/* Test if, if_present clauses on host_data construct.  */
+
+#include <assert.h>
+#include <stdint.h>
+
+void
+foo (float *p, intptr_t host_p, int shared_mem_p, int cond)
+{
+  assert (p == (float *) host_p);
+
+#pragma acc data copyin(host_p)
+  {
+#pragma acc host_data use_device(p) if_present
+    /* p not mapped yet, so it will be equal to the host pointer.  */
+    assert (p == (float *) host_p);
+
+#pragma acc data copy(p[0:100])
+    {
+      /* Not inside a host_data construct, so p is still the host pointer.  */
+      assert (p == (float *) host_p);
+
+      if (!shared_mem_p)
+      {
+#pragma acc host_data use_device(p)
+        /* The device address is different from the host address.  */
+        assert (p != (float *) host_p);
+
+#pragma acc host_data use_device(p) if_present
+        /* p is present now, so this is the same as above.  */
+        assert (p != (float *) host_p);
+      }
+
+#pragma acc host_data use_device(p) if(cond)
+      /* p is the device pointer iff cond is true and device memory is
+         separate from host memory.  */
+      assert ((p != (float *) host_p) == (cond && !shared_mem_p));
+    }
+  }
+}
+
+int
+main (void)
+{
+  float arr[100];
+  int shared_mem_p = 0;
+#if ACC_MEM_SHARED
+  shared_mem_p = 1;
+#endif
+  foo (arr, (intptr_t) arr, shared_mem_p, 0);
+  foo (arr, (intptr_t) arr, shared_mem_p, 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c
index e8d65df..e40d5ab 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-fipa-pta" } */
 
 #include <stdlib.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c
index dd8ca87..e2cf3d7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-fipa-pta" } */
 
 #include <stdlib.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c
index 50e7dc1..5f89d48 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-fipa-pta" } */
 
 #include <stdlib.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c
new file mode 100644
index 0000000..45d786d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c
@@ -0,0 +1,29 @@
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+
+#undef NDEBUG
+#include <assert.h>
+
+int main()
+{
+  int a = 0;
+#define N 123
+  int b[N] = { 0 };
+
+#pragma acc kernels
+  {
+    int c = 234; /* { dg-warning "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+
+#pragma acc loop independent gang /* { dg-warning "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-warning "optimized: parallelized loop nest in OpenACC .kernels. construct" "" { target *-*-* } 16 } */
+    for (int i = 0; i < N; ++i)
+      b[i] = c;
+
+    a = c; /* { dg-warning "optimized: beginning .gang-single. region in OpenACC .kernels. construct" } */
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert (b[i] == 234);
+  assert (a == 234);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-empty.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-empty.c
index a68a7cd..8ee0da7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-empty.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-empty.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 int
 main (void)
 {
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-for-index-reuse-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-for-index-reuse-1.c
new file mode 100644
index 0000000..dafe412
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-for-index-reuse-1.c
@@ -0,0 +1,36 @@
+/* { dg-xfail-run-if "unhandled case" { *-*-* } } */
+
+/* Test reuse of loop index variables in kernels region.  */
+
+#include <assert.h>
+
+#define SIZE 16384
+
+int
+main (int argc, char* argv[])
+{
+  float arr[SIZE], arr_o[SIZE];
+  int i, o;
+
+  for (i = 0; i < SIZE; i++)
+    arr[i] = i;
+
+  i = 15;
+  #pragma acc kernels
+  {
+    i *= 30;
+    o = i;
+
+    #pragma acc loop independent
+    for (i = 0; i < SIZE; i++)
+      arr_o[i] = arr[i] * 2;
+  }
+
+  assert (i == SIZE);
+  assert (o == 450);
+
+  for (i = 0; i < SIZE; i++)
+    assert (arr_o[i] == arr[i] * 2);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c
index b840888..4aeeed1 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c
index 31114ac..9cbace1 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-2.c
index d36592f..cae8439 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-2.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 32
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-3.c
index e622971..d53e393 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-3.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 32
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-4.c
index c731278..7435c85 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-4.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-4.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 32
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-5.c
index 67dcce2..32f47fb 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-5.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-5.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 32
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-6.c
index b8b5dde..a310de7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-6.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq-6.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 32
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq.c
index 9d9308a..e00177e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-and-seq.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 32
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-collapse.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-collapse.c
index 997d6c7..15e2dc1 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-collapse.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-collapse.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 100
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c
index 607c350..337ad91 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-2.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c
index 8b9dd5f..214dd7e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit-2.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c
index 5d5da6f..7d097da 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-enter-exit.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c
index c111c8f..661cb28 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data-update.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c
index 947bcda..2f4f699 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-data.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-g.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-g.c
index 88258be..e5a556b 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-g.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-g.c
@@ -1,3 +1,5 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
 /* { dg-additional-options "-g" } */
 
 #include "kernels-loop.c"
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-mod-not-zero.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-mod-not-zero.c
index 147ebb5..eeb318e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-mod-not-zero.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-mod-not-zero.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N ((1024 * 512) + 1)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-n.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-n.c
index 9a3eaca..eeccc1d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-n.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-n.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N ((1024 * 512) + 1)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c
index 28c725a..c59c47e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 1000
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop.c
index 355123c..36eabb9 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c
index ebcc6e1..fe30318 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-parallel-loop-data-enter-exit.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N (1024 * 512)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c
index 95f1b77..3d02d53 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c
@@ -1,6 +1,9 @@
 /* Verify that a simple, explicit acc loop reduction works inside
  a kernels region.  */
 
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define N 100
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c
index 8647a94..e67340c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction.c
@@ -1,3 +1,6 @@
+/* { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+
 #include <stdlib.h>
 
 #define n 10000
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-43.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-43.c
deleted file mode 100644
index 5db2912..0000000
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-43.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Exercise acc_update_device with a NULL data address on nvidia targets.  */
-
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <openacc.h>
-
-int
-main (int argc, char **argv)
-{
-  const int N = 256;
-  int i;
-  unsigned char *h;
-  void *d;
-
-  h = (unsigned char *) malloc (N);
-
-  for (i = 0; i < N; i++)
-    {
-      h[i] = i;
-    }
-
-  d = acc_copyin (h, N);
-  if (!d)
-    abort ();
-
-  for (i = 0; i < N; i++)
-    {
-      h[i] = 0xab;
-    }
-
-  fprintf (stderr, "CheCKpOInT\n");
-  acc_update_device (0, N);
-
-  acc_copyout (h, N);
-
-  for (i = 0; i < N; i++)
-    {
-      if (h[i] != 0xab)
-	abort ();
-    }
-
-  free (h);
-
-  return 0;
-}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "\\\[\[^\n\r]*,256\\\] is not mapped" } */
-/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-47.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-47.c
deleted file mode 100644
index c214042..0000000
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-47.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Exercise acc_update_self with a NULL data mapping on nvidia targets.  */
-
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <openacc.h>
-
-int
-main (int argc, char **argv)
-{
-  const int N = 256;
-  int i;
-  unsigned char *h;
-  void *d;
-
-  h = (unsigned char *) malloc (N);
-
-  for (i = 0; i < N; i++)
-    {
-      h[i] = i;
-    }
-
-  d = acc_copyin (h, N);
-  if (!d)
-    abort ();
-
-  memset (&h[0], 0, N);
-
-  fprintf (stderr, "CheCKpOInT\n");
-  acc_update_self (0, N);
-
-  for (i = 0; i < N; i++)
-    {
-      if (h[i] != i)
-	abort ();
-    }
-
-  acc_delete (h, N);
-
-  free (h);
-
-  return 0;
-}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "\\\[\[^\n\r]*,256\\\] is not mapped" } */
-/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
index c10beba..455ca5b 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
@@ -9,46 +9,14 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuModuleLoad (&module, "./subr.ptx");
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuModuleLoad failed: %d\n", r);
@@ -62,20 +30,6 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -90,7 +44,7 @@
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -118,11 +72,6 @@
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
index 912b266..ee06898 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
@@ -1,6 +1,7 @@
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,47 +11,17 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 3;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t diff;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +36,6 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -96,9 +53,29 @@
 	  abort ();
     }
 
+  gettimeofday (&tv1, NULL);
+
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[0], NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
+
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxLaunch failed: %d\n", r);
+      abort ();
+    }
+
+  gettimeofday (&tv2, NULL);
+
+  diff = tv2.tv_sec - tv1.tv_sec;
+
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,7 +89,7 @@
 	}
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  sleep ((diff + 1) * N);
 
   for (i = 0; i < N; i++)
     {
@@ -123,10 +100,6 @@
 	}
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
index e383ba0..920ff5f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
@@ -10,45 +10,13 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -63,20 +31,6 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
     {
@@ -87,7 +41,7 @@
   if (!acc_set_cuda_stream (0, stream))
     abort ();
     
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -100,7 +54,12 @@
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize () failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -108,11 +67,6 @@
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
index 43a8b7e..4fa9d5a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
@@ -1,6 +1,7 @@
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -10,47 +11,15 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 6;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +34,6 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -98,13 +53,12 @@
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
     }
 
   if (acc_async_test_all () != 0)
@@ -113,7 +67,12 @@
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -121,11 +80,6 @@
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
index 0efcf0d..e25d894 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
@@ -5,50 +5,20 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -63,19 +33,25 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -91,11 +67,9 @@
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -103,38 +77,31 @@
     }
 
   acc_wait (0);
-  /* Test unseen async-argument.  */
-  acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait (0);
-  /* Test unseen async-argument.  */
-  acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
index 1942211..53e285f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
@@ -6,52 +6,22 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,18 +36,25 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -93,16 +70,11 @@
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,27 +84,18 @@
       acc_wait (0);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
index 11d9d62..787dcb8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
@@ -6,52 +6,22 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,18 +36,25 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -98,16 +75,11 @@
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -117,27 +89,19 @@
       acc_wait (i);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
index 4f58fb2..0bed15f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
@@ -6,50 +6,20 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -64,19 +34,25 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -92,11 +68,9 @@
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -105,33 +79,30 @@
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > (t1 + (t1 * 0.10)))
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index b2e2687..16eb446 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -6,54 +6,22 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  devnum = 2;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -68,18 +36,25 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
@@ -105,16 +80,11 @@
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -156,7 +126,7 @@
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
   if (acc_async_test (0) != 1)
     abort ();
@@ -164,25 +134,16 @@
   if (acc_async_test (1) != 1)
     abort ();
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
index d5f18f0..77de9ba 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
@@ -6,52 +6,22 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams, stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,18 +36,25 @@
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 500.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+	abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+	abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -98,11 +75,6 @@
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (N);
   if (stream != NULL)
     abort ();
@@ -117,11 +89,11 @@
   if (!acc_set_cuda_stream (N, stream))
     abort ();
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -129,6 +101,10 @@
 	}
     }
 
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
   acc_wait_all_async (N);
 
   for (i = 0; i <= N; i++)
@@ -145,15 +121,13 @@
 	abort ();
     }
 
-  atime = stop_timer (0);
-
-  if (atime < dtime)
+  if ((t1 * N) < t2)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   stream = (CUstream) acc_get_cuda_stream (N + 1);
   if (stream != NULL)
@@ -173,35 +147,33 @@
 
   acc_wait (N + 1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all_async (N);
 
   acc_wait (N);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 3\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
index be30a7f..ecf7488 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
@@ -10,46 +10,18 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay2;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 32;
   int i;
   CUstream *streams;
-  unsigned long **a, **d_a, *tid, ticks;
+  unsigned long **a, **d_a, *tid;
   int nbytes;
-  void *kargs[3];
-  int clkrate;
-  int devnum, nprocs;
+  void *kargs[2];
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,10 +38,6 @@
 
   nbytes = sizeof (int);
 
-  ticks = (unsigned long) (200.0 * clkrate);
-
-  N = nprocs;
-
   streams = (CUstream *) malloc (N * sizeof (void *));
 
   a = (unsigned long **) malloc (N * sizeof (unsigned long *));
@@ -103,8 +71,7 @@
   for (i = 0; i < N; i++)
     {
       kargs[0] = (void *) &d_a[i];
-      kargs[1] = (void *) &ticks;
-      kargs[2] = (void *) &tid[i];
+      kargs[1] = (void *) &tid[i];
 
       r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
       if (r != CUDA_SUCCESS)
@@ -112,8 +79,6 @@
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
-      ticks = (unsigned long) (50.0 * clkrate);
     }
 
   acc_wait_all_async (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-84.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-84.c
index d793c74..e2d86de 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-84.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-84.c
@@ -38,6 +38,12 @@
       if (streams[i] != NULL)
 	abort ();
 
+      /* The no-value async may be an alias for a numbered async stream.
+	 Skip over setting it below else the above NULL check will fail for
+	 the aliased stream.  */
+      if (i == acc_async_noval)
+	continue;
+
       r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
       if (r != CUDA_SUCCESS)
 	{
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-85.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-85.c
index 141c83b..2298241 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-85.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-85.c
@@ -38,6 +38,12 @@
       if (streams[i] != NULL)
 	abort ();
 
+      /* The no-value async may be an alias for a numbered async stream.
+         Skip over setting it below else the above NULL check will fail for
+         the aliased stream.  */
+      if (i == acc_async_noval)
+	continue;
+
       r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
       if (r != CUDA_SUCCESS)
 	{
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c
new file mode 100644
index 0000000..07d2541
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c
@@ -0,0 +1,19 @@
+/* { dg-do run { target { ! openacc_nvidia_accel_selected } } } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+int
+main (void)
+{
+  fprintf (stderr, "CheCKpOInT\n");
+  acc_init (acc_device_nvidia);
+
+  acc_shutdown (acc_device_nvidia);
+
+  return 0;
+}
+
+/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
+/* { dg-output "device type nvidia not supported" } */
+/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c
index 5449723..baa3ac8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-94.c
@@ -22,10 +22,10 @@
 
   acc_copyin_async (h, N, async);
 
-  memset (h, 0, N);
-
   acc_wait (async);
 
+  memset (h, 0, N);
+
   acc_copyout_async (h, N, async + 1);
 
   acc_wait (async + 1);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
index 34bc57e..05946c9 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@@ -1,3 +1,6 @@
+/* AMD GCN does not use 32-lane vectors.
+   { dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
+
 /* { dg-additional-options "-fopenacc-dim=32" } */
 
 #include <stdio.h>
@@ -101,7 +104,7 @@
   {
 #pragma acc loop gang
     for (int jx = 0; jx < 1; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int ix = 0; ix < size; ix++)
 	ary[ix] = place ();
   }
@@ -117,7 +120,7 @@
   {
 #pragma acc loop worker
     for (int jx = 0; jx < size  / 64; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int ix = 0; ix < 64; ix++)
 	ary[ix + jx * 64] = place ();
   }
@@ -133,7 +136,7 @@
   {
 #pragma acc loop gang
     for (int kx = 0; kx < 1; kx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int jx = 0; jx <  size  / 64; jx++)
 #pragma acc loop vector
 	for (int ix = 0; ix < 64; ix++)
@@ -147,9 +150,9 @@
 {
   clear (ary, size);
   
-#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
+#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } } */
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int jx = 0; jx <  size  / 64; jx++)
 #pragma acc loop worker
       for (int ix = 0; ix < 64; ix++)
@@ -165,11 +168,11 @@
   
 #pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int kx = 0; kx < size / (32 * 32); kx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int jx = 0; jx <  32; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
 	for (int ix = 0; ix < 32; ix++)
 	  ary[ix + jx * 32 + kx * 32 * 32] = place ();
   }
@@ -183,9 +186,9 @@
   
 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int jx = 0; jx <  size  / 64; jx++)
-#pragma acc loop auto
+#pragma acc loop auto independent
       for (int ix = 0; ix < 64; ix++)
 	ary[ix + jx * 64] = place ();
   }
@@ -199,7 +202,7 @@
   
 #pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size)
   {
-#pragma acc loop auto
+#pragma acc loop auto independent
     for (int jx = 0; jx <  size; jx++)
       ary[jx] = place ();
   }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c
new file mode 100644
index 0000000..6c479e4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-default-compile.c
@@ -0,0 +1,13 @@
+/* { dg-additional-options "-fopenacc-dim=16:16" } */
+/* This code uses nvptx inline assembly guarded with acc_on_device, which is
+   not optimized away at -O0, and then confuses the target assembler.
+   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
+/* { dg-set-target-env-var "GOMP_OPENACC_DIM" "8:8" } */
+
+#include "loop-default.h"
+
+int main ()
+{
+  /* Environment should be ignored.  */
+  return test_1 (16, 16, 32);
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
index dd8107c..5cd0e31 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
@@ -80,13 +80,18 @@
 	exit = 1;
       }
   
+#ifndef ACC_DEVICE_TYPE_gcn
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
   for (ix = 0; ix < vp; ix++)
     if (vectors[ix] != vectors[0])
       {
-	printf ("vector %d not used %d times\n", ix, vectors[0]);
-	exit = 1;
+	printf ("vector %d not used %d times\n", ix, vectors[0]); exit
+	= 1;
       }
-  
+#endif
+
   return exit;
 }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
index 98f02e9..5831327 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-w" } */
 #include <stdio.h>
 #include <openacc.h>
 #include <gomp-constants.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
index 4152a4e..82e8aae 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-w" } */
 #include <stdio.h>
 #include <openacc.h>
 #include <gomp-constants.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
index 766e578..5c84301 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@@ -9,11 +9,13 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int gangsize, workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+	    copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
   {
 #pragma acc loop gang worker vector
     for (unsigned ix = 0; ix < N; ix++)
@@ -32,6 +34,10 @@
 	else
 	  ary[ix] = ix;
       }
+
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -39,11 +45,12 @@
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * workersize * vectorsize);
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
new file mode 100644
index 0000000..a4f81a3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <openacc.h>
+#include <alloca.h>
+#include <string.h>
+#include <gomp-constants.h>
+#include <stdlib.h>
+
+#if 0
+#define DEBUG(DIM, IDX, VAL) \
+  fprintf (stderr, "%sdist[%d] = %d\n", (DIM), (IDX), (VAL))
+#else
+#define DEBUG(DIM, IDX, VAL)
+#endif
+
+#define N (32*32*32)
+
+int
+check (const char *dim, int *dist, int dimsize)
+{
+  int ix;
+  int exit = 0;
+
+  for (ix = 0; ix < dimsize; ix++)
+    {
+      DEBUG(dim, ix, dist[ix]);
+      if (dist[ix] < (N) / (dimsize + 0.5)
+	  || dist[ix] > (N) / (dimsize - 0.5))
+	{
+	  fprintf (stderr, "did not distribute to %ss (%d not between %d "
+		   "and %d)\n", dim, dist[ix], (int) ((N) / (dimsize + 0.5)),
+		   (int) ((N) / (dimsize - 0.5)));
+	  exit |= 1;
+	}
+    }
+
+  return exit;
+}
+
+int main ()
+{
+  int ary[N];
+  int ix;
+  int exit = 0;
+  int gangsize = 0, workersize = 0, vectorsize = 0;
+  int *gangdist, *workerdist, *vectordist;
+
+  for (ix = 0; ix < N;ix++)
+    ary[ix] = -1;
+
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+	    copy(ary) copyout(gangsize, workersize, vectorsize)
+  {
+#pragma acc loop gang worker vector
+    for (unsigned ix = 0; ix < N; ix++)
+      {
+	int g, w, v;
+
+	g = __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
+	w = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
+	v = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
+
+	ary[ix] = (g << 16) | (w << 8) | v;
+      }
+
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
+  }
+
+  gangdist = (int *) alloca (gangsize * sizeof (int));
+  workerdist = (int *) alloca (workersize * sizeof (int));
+  vectordist = (int *) alloca (vectorsize * sizeof (int));
+  memset (gangdist, 0, gangsize * sizeof (int));
+  memset (workerdist, 0, workersize * sizeof (int));
+  memset (vectordist, 0, vectorsize * sizeof (int));
+
+  /* Test that work is shared approximately equally amongst each active
+     gang/worker/vector.  */
+  for (ix = 0; ix < N; ix++)
+    {
+      int g = (ary[ix] >> 16) & 255;
+      int w = (ary[ix] >> 8) & 255;
+      int v = ary[ix] & 255;
+
+      gangdist[g]++;
+      workerdist[w]++;
+      vectordist[v]++;
+    }
+
+  exit = check ("gang", gangdist, gangsize);
+  exit |= check ("worker", workerdist, workersize);
+  exit |= check ("vector", vectordist, vectorsize);
+
+  return exit;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c
index 7107502..2f3a44f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-g-1.c
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-w" } */
 #include <stdio.h>
 #include <openacc.h>
 #include <gomp-constants.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
index 0bec6e1..9c4a85f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@@ -8,8 +8,10 @@
   int ix;
   int ondev = 0;
   int t = 0, h = 0;
-  
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ondev)
+  int gangsize, workersize, vectorsize;
+
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+	copy(ondev) copyout(gangsize, workersize, vectorsize)
   {
 #pragma acc loop gang worker vector reduction(+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -28,18 +30,22 @@
 	  }
 	t += val;
       }
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
     {
       int val = ix;
-      if(ondev)
+      if (ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * vectorsize * workersize);
+	  int w = ix / vectorsize % workersize;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
index da4921d..1173c1f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
@@ -9,8 +9,9 @@
   int ix;
   int ondev = 0;
   int t = 0,  h = 0;
+  int vectorsize;
 
-#pragma acc parallel vector_length(32) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ondev) copyout(vectorsize)
   {
 #pragma acc loop vector reduction (+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -29,6 +30,7 @@
 	  }
 	t += val;
       }
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -38,7 +40,7 @@
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
index 15e2bc2..84c2296 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
@@ -9,8 +9,9 @@
   int ix;
   int ondev = 0;
   int q = 0,  h = 0;
+  int vectorsize;
 
-#pragma acc parallel vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel vector_length(32) copy(q) copy(ondev) copyout(vectorsize)
   {
     int t = q;
     
@@ -32,6 +33,7 @@
 	t += val;
       }
     q = t;
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -41,7 +43,7 @@
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
index 6bbd04f..7344fa8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-w" } */
 #include <stdio.h>
 #include <openacc.h>
 #include <gomp-constants.h>
@@ -8,8 +9,10 @@
   int ix;
   int ondev = 0;
   int t = 0,  h = 0;
+  int workersize;
 
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+	    copyout(workersize)
   {
 #pragma acc loop worker reduction(+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -28,6 +31,7 @@
 	  }
 	t += val;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -36,7 +40,7 @@
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  val = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
index c63a5d4..d99877a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
@@ -1,3 +1,4 @@
+/* { dg-additional-options "-w" } */
 #include <stdio.h>
 #include <openacc.h>
 #include <gomp-constants.h>
@@ -8,8 +9,10 @@
   int ix;
   int ondev = 0;
   int q = 0,  h = 0;
+  int workersize;
 
-#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev) \
+	    copyout(workersize)
   {
     int t = q;
     
@@ -31,6 +34,7 @@
 	t += val;
       }
     q = t;
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -39,7 +43,7 @@
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  val = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
index 71d3969..c360ad1 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -8,8 +8,10 @@
   int ix;
   int ondev = 0;
   int t = 0, h = 0;
+  int workersize, vectorsize;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+	    copyout(workersize, vectorsize)
   {
 #pragma acc loop worker vector reduction (+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -28,6 +30,8 @@
 	  }
 	t += val;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -36,8 +40,8 @@
       if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
index 6010cd2..8c858f3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
@@ -9,11 +9,13 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+	    copyout(vectorsize)
   {
 #pragma acc loop vector
     for (unsigned ix = 0; ix < N; ix++)
@@ -31,6 +33,7 @@
 	else
 	  ary[ix] = ix;
       }
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -40,7 +43,7 @@
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
index fa6fb91..8731c80 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
@@ -9,11 +9,14 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 17 } */
   {
 #pragma acc loop worker
     for (unsigned ix = 0; ix < N; ix++)
@@ -31,6 +34,7 @@
 	else
 	  ary[ix] = ix;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -39,7 +43,7 @@
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  expected = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-warn-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-warn-1.c
new file mode 100644
index 0000000..20a022f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-warn-1.c
@@ -0,0 +1,37 @@
+
+/* Check warnings about suboptimal partitioning choices.  */
+
+int main ()
+{
+  int ary[10];
+
+#pragma acc parallel copy(ary) num_gangs (1) /* { dg-warning "is not gang partitioned" } */
+  {
+    #pragma acc loop gang
+    for (int  i = 0; i < 10; i++)
+      ary[i] = i;
+  }
+
+#pragma acc parallel copy(ary) num_workers (1) /* { dg-warning "is not worker partitioned" } */
+  {
+    #pragma acc loop worker
+    for (int  i = 0; i < 10; i++)
+      ary[i] = i;
+  }
+
+#pragma acc parallel copy(ary) num_gangs (8) /* { dg-warning "is gang partitioned" } */
+  {
+    #pragma acc loop worker
+    for (int  i = 0; i < 10; i++)
+      ary[i] = i;
+  }
+
+#pragma acc parallel copy(ary) num_workers (8) /* { dg-warning "is worker partitioned" } */
+  {
+    #pragma acc loop gang
+    for (int  i = 0; i < 10; i++)
+      ary[i] = i;
+  }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
index cd4cc99..fd4e4cf 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@@ -9,11 +9,13 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize, vectorsize)
   {
 #pragma acc loop worker vector
     for (unsigned ix = 0; ix < N; ix++)
@@ -31,6 +33,8 @@
 	else
 	  ary[ix] = ix;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -39,8 +43,8 @@
       if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c
index 4474c12..f62daf0 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c
@@ -287,6 +287,7 @@
   int n = 0;
   #pragma acc parallel copy(n) \
 		       num_gangs(1) num_workers(1) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 288 } */
   {
     n++;
   }
@@ -310,6 +311,7 @@
 
       #pragma acc parallel copy(arr) \
 			   num_gangs(gangs) num_workers(1) vector_length(32)
+      /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 312 } */
       {
 	int j;
 	#pragma acc loop gang
@@ -339,6 +341,7 @@
 
       #pragma acc parallel copy(arr) \
 			   num_gangs(gangs) num_workers(1) vector_length(32)
+      /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 342 } */
       {
 	int j;
 	#pragma acc loop gang
@@ -371,6 +374,7 @@
 
       #pragma acc parallel copy(arr) \
 			   num_gangs(gangs) num_workers(1) vector_length(32)
+      /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 375 } */
       {
 	int j;
 	#pragma acc loop gang
@@ -404,6 +408,7 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(1024) num_workers(1) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 409 } */
   {
     int j;
 
@@ -442,6 +447,7 @@
 
   #pragma acc parallel copyout(fizz, buzz, fizzbuzz) \
 		       num_gangs(NUM_GANGS) num_workers(1) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 448 } */
   {
     int j;
     
@@ -488,6 +494,7 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(8) num_workers(8) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 495 } */
   {
     int j;
     #pragma acc loop gang
@@ -613,6 +620,7 @@
 
   #pragma acc parallel copy(n, arr) \
 		       num_gangs(8) num_workers(16) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 621 } */
   {
     int j;
     #pragma acc loop gang
@@ -665,6 +673,7 @@
 
 	#pragma acc parallel copyin(arr_a) copyout(arr_b) \
 			     num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
+	/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 674 } */
 	{
 	  int j;
 	  #pragma acc loop gang
@@ -882,6 +891,8 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(8) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 892 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 892 } */
   {
     int j;
     #pragma acc loop gang
@@ -905,6 +916,8 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(8) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 917 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 917 } */
   {
     int j;
     #pragma acc loop gang
@@ -931,6 +944,8 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(8) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 945 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 945 } */
   {
     int j;
     #pragma acc loop gang
@@ -957,6 +972,8 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(8) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 973 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 973 } */
   {
     int j;
     #pragma acc loop gang
@@ -988,6 +1005,7 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(8) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 1006 } */
   {
     int j;
     #pragma acc loop gang
@@ -1020,6 +1038,7 @@
 
   #pragma acc parallel copy(arr) \
 		       num_gangs(8) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 1039 } */
   {
     int j;
     #pragma acc loop gang
@@ -1070,6 +1089,8 @@
 
   #pragma acc parallel copy(n, arr) copyout(ondev) \
 	  num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32)
+  /* { dg-warning "region is gang partitioned but does not contain gang partitioned code" "gang" { target *-*-* } 1090 } */
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 1090 } */
   {
     int j;
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-1.c
new file mode 100644
index 0000000..c7a1bd9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-1.c
@@ -0,0 +1,40 @@
+/* Test no_create clause when data is present on the device.  */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <openacc.h>
+
+#define N 128
+
+int
+main (int argc, char *argv[])
+{
+  int *arr = (int *) malloc (N * sizeof (*arr));
+  int *devptr;
+
+  acc_copyin (arr, N * sizeof (*arr));
+
+  #pragma acc parallel no_create(arr[0:N]) copyout(devptr)
+  {
+    devptr = &arr[2];
+  }
+
+#if !ACC_MEM_SHARED
+  if (acc_hostptr (devptr) != (void *) &arr[2])
+    __builtin_abort ();
+#endif
+
+  acc_delete (arr, N * sizeof (*arr));
+
+#if ACC_MEM_SHARED
+  if (&arr[2] != devptr)
+    __builtin_abort ();
+#else
+  if (&arr[2] == devptr)
+    __builtin_abort ();
+#endif
+
+  free (arr);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-2.c
new file mode 100644
index 0000000..2964a40
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-2.c
@@ -0,0 +1,28 @@
+/* Test no_create clause when data is not present on the device.  */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N 128
+
+int
+main (int argc, char *argv[])
+{
+  int *arr = (int *) malloc (N * sizeof (*arr));
+  int *devptr;
+
+  #pragma acc data no_create(arr[0:N])
+  {
+    #pragma acc parallel copyout(devptr)
+    {
+      devptr = &arr[2];
+    }
+  }
+
+  if (devptr != &arr[2])
+    __builtin_abort ();
+
+  free (arr);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-3.c
new file mode 100644
index 0000000..618af6e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-3.c
@@ -0,0 +1,38 @@
+/* Test no_create clause with attach/detach when data is not present on the
+   device.  */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <openacc.h>
+
+#define N 128
+
+typedef struct {
+  int x;
+  int *y;
+} mystruct;
+
+int
+main (int argc, char *argv[])
+{
+  int *devptr;
+  mystruct s;
+
+  s.x = 5;
+  s.y = (int *) malloc (N * sizeof (int));
+
+  #pragma acc data copyin(s)
+  {
+    #pragma acc parallel no_create(s.y[0:N]) copyout(devptr)
+    {
+      devptr = &s.y[2];
+    }
+  }
+
+  if (devptr != &s.y[2])
+    __builtin_abort ();
+
+  free (s.y);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-4.c
new file mode 100644
index 0000000..75ab616
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/nocreate-4.c
@@ -0,0 +1,42 @@
+/* Test no_create clause with attach/detach when data is present on the
+   device.  */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <openacc.h>
+
+#define N 128
+
+typedef struct {
+  int x;
+  int *y;
+} mystruct;
+
+int
+main (int argc, char *argv[])
+{
+  int *devptr;
+  mystruct s;
+
+  s.x = 5;
+  s.y = (int *) malloc (N * sizeof (int));
+
+  #pragma acc data copyin(s)
+  {
+    #pragma acc enter data copyin(s.y[0:N])
+
+    #pragma acc parallel no_create(s.y[0:N]) copyout(devptr)
+    {
+      devptr = &s.y[2];
+    }
+  }
+
+  if (devptr != acc_deviceptr (&s.y[2]))
+    __builtin_abort ();
+
+  #pragma acc exit data delete(s.y[0:N])
+
+  free (s.y);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-1.c
index 5e82e1d..91fe772 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-1.c
@@ -15,7 +15,7 @@
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     reduction(+:res) copy(res)
   {
-    #pragma acc loop gang
+    #pragma acc loop gang reduction(+:res)
     for (j = 0; j < 32; j++)
       {
 	#pragma acc loop worker reduction(+:res)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-2.c
index a339f32..8a7bc72 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-2.c
@@ -14,7 +14,7 @@
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
 		       reduction(^:res)
   {
-    #pragma acc loop gang
+    #pragma acc loop gang reduction(^:res)
     for (j = 0; j < 32; j++)
       {
 	#pragma acc loop worker vector reduction(^:res)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-3.c
index 6369d7f..eba5c6a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-3.c
@@ -16,7 +16,7 @@
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     reduction(+:res) copy(res)
   {
-    #pragma acc loop gang
+    #pragma acc loop gang reduction(+:res)
     for (j = 0; j < 32; j++)
       {
 	#pragma acc loop worker vector reduction(+:res)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c
index 140c322..12b823f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c
@@ -16,7 +16,7 @@
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     reduction(+:res) reduction(max:mres) copy(res, mres)
   {
-    #pragma acc loop gang
+    #pragma acc loop gang reduction(+:res) reduction(max:mres)
     for (j = 0; j < 32; j++)
       {
 	#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c
new file mode 100644
index 0000000..856ef0e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c
@@ -0,0 +1,29 @@
+/* Check a parallel reduction which is are explicitly initialized by
+   the user.  */
+
+#include <assert.h>
+
+int
+main ()
+{
+  int n = 10;
+  float accel = 1.0, host = 1.0;
+  int i;
+
+#pragma acc parallel copyin(n) reduction(*:accel)
+  {
+    accel = 1.0;
+#pragma acc loop gang reduction(*:accel)
+    for( i = 1; i <= n; i++)
+      {
+	accel *= 2.0;
+      }
+  }
+
+  for (i = 1; i <= n; i++)
+    host *= 2.0;
+
+  assert (accel == host);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index 7e699f4..2c14f9c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -12,7 +12,8 @@
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
   else
     __builtin_abort ();
@@ -23,7 +24,8 @@
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
   else
     __builtin_abort ();
@@ -34,7 +36,8 @@
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
   else
     __builtin_abort ();
@@ -96,7 +99,7 @@
     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
     gangs_min = workers_min = vectors_min = INT_MAX;
     gangs_max = workers_max = vectors_max = INT_MIN;
-#pragma acc parallel copy (gangs_actual) \
+#pragma acc parallel copy (gangs_actual) /* { dg-warning "region contains gang partitioned code but is not gang partitioned" } */ \
   num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
     {
       /* We're actually executing with num_gangs (1).  */
@@ -125,7 +128,7 @@
     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
     gangs_min = workers_min = vectors_min = INT_MAX;
     gangs_max = workers_max = vectors_max = INT_MIN;
-#pragma acc parallel copy (workers_actual) \
+#pragma acc parallel copy (workers_actual) /* { dg-warning "region contains worker partitioned code but is not worker partitioned" } */ \
   num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
     {
       /* We're actually executing with num_workers (1).  */
@@ -154,7 +157,8 @@
     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
     gangs_min = workers_min = vectors_min = INT_MAX;
     gangs_max = workers_max = vectors_max = INT_MIN;
-#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "region contains vector partitioned code but is not vector partitioned" } */ \
+  /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } 160 } */ \
   vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
     {
       /* We're actually executing with vector_length (1), just the GCC nvptx
@@ -176,9 +180,8 @@
 	if (vectors_actual != 32)
 	  __builtin_abort ();
       }
-    else
-      if (vectors_actual != 1)
-	__builtin_abort ();
+    else if (vectors_actual != 1)
+      __builtin_abort ();
     if (gangs_min != 0 || gangs_max != 0
 	|| workers_min != 0 || workers_max != 0
 	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
@@ -198,7 +201,7 @@
     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
     gangs_min = workers_min = vectors_min = INT_MAX;
     gangs_max = workers_max = vectors_max = INT_MIN;
-#pragma acc parallel copy (gangs_actual) \
+#pragma acc parallel copy (gangs_actual) /* { dg-warning "region is gang partitioned but does not contain gang partitioned code" } */ \
   reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
   num_gangs (gangs)
     {
@@ -324,6 +327,10 @@
 	  /* We're actually executing with num_workers (32).  */
 	  /* workers_actual = 32; */
 	}
+      else if (acc_on_device (acc_device_gcn))
+	{
+	  workers_actual = 4;
+	}
       else
 	__builtin_abort ();
 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@@ -403,6 +410,13 @@
 	  /* The GCC nvptx back end enforces vector_length (32).  */
 	  vectors_actual = 32;
 	}
+      else if (acc_on_device (acc_device_gcn))
+	{
+	  /* Because of the way vectors are implemented for GCN, a vector loop
+	     containing a seq routine call will not vectorize calls to that
+	     routine.  Hence, we'll only get one "vector".  */
+	  vectors_actual = 1;
+	}
       else
 	__builtin_abort ();
 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@@ -429,6 +443,9 @@
        in the following case.  So, limit ourselves here.  */
     if (acc_get_device_type () == acc_device_nvidia)
       gangs = 3;
+    /* Similar appears to be true for GCN.  */
+    if (acc_get_device_type () == acc_device_gcn)
+      gangs = 3;
     int gangs_actual = gangs;
 #define WORKERS 3
     int workers_actual = WORKERS;
@@ -455,6 +472,11 @@
 	  /* The GCC nvptx back end enforces vector_length (32).  */
 	  vectors_actual = 32;
 	}
+      else if (acc_on_device (acc_device_gcn))
+	{
+	  /* See above comments about GCN vectors_actual.  */
+	  vectors_actual = 1;
+	}
       else
 	__builtin_abort ();
 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828-2.c
new file mode 100644
index 0000000..357114c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828-2.c
@@ -0,0 +1,34 @@
+/* Subarray declared on data construct, accessed through pointer.  */
+
+#include <assert.h>
+
+void
+s1 (int *arr, int c)
+{
+#pragma acc data copy(arr[5:c-10])
+  {
+#pragma acc parallel loop
+    for (int i = 5; i < c - 5; i++)
+      arr[i] = i;
+  }
+}
+
+int
+main (int argc, char* argv[])
+{
+  const int c = 100;
+  int arr[c];
+
+  for (int i = 0; i < c; i++)
+    arr[i] = 0;
+
+  s1 (arr, c);
+
+  for (int i = 0; i < c; i++)
+    if (i >= 5 && i < c - 5)
+      assert (arr[i] == i);
+    else
+      assert (arr[i] == 0);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c
new file mode 100644
index 0000000..4b6dbd7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr70828.c
@@ -0,0 +1,27 @@
+/* Subarray declared on enclosing data construct.  */
+
+#include <assert.h>
+
+int
+main ()
+{
+  int a[100], i;
+
+  for (i = 0; i < 100; i++)
+    a[i] = 0;
+
+#pragma acc data copy(a[10:80])
+  {
+    #pragma acc parallel loop
+    for (i = 10; i < 90; i++)
+      a[i] = i;
+  }
+
+  for (i = 0; i < 100; i++)
+    if (i >= 10 && i < 90)
+      assert (a[i] == i);
+    else
+      assert (a[i] == 0);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables-2.c
new file mode 100644
index 0000000..51fb394
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables-2.c
@@ -0,0 +1,217 @@
+#include <assert.h>
+
+/* Worker propagation: plain scalar variables.  */
+
+void
+worker_bcast_1 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32)
+  {
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+	int j;
+	int x = (i ^ 3) * 3;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x * j;
+
+	x = (i | 5) * 5;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;
+	assert (arr[idx] == idx + (i ^ 3) * 3 * j + (i | 5) * 5 * j);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int
+select_var (int s, int x, int y)
+{
+  if (s)
+    return x;
+  else
+    return y;
+}
+
+/* Worker propagation: scalars through function calls.  */
+
+void
+worker_bcast_2 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+	int x, y, z;
+
+	x = i * 5;
+	y = i * 7;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += select_var (j & 1, x, y) * j;
+
+	#pragma acc loop worker vector
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += select_var (j & 1, y, x) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;
+	int answer = idx + ((j & 1) ? i * 5 : i * 7) * j
+		     + ((j & 1) ? i * 7 : i * 5) * j;
+        assert (arr[idx] == answer);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int
+select_addr (int s, int *x, int *y)
+{
+  if (s)
+    return *x;
+  else
+    return *y;
+}
+
+/* Worker propagation: addresses of locals through function calls.  */
+
+void
+worker_bcast_3 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+        int x, y, z;
+
+	x = i * 5;
+	y = i * 7;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += select_addr (j & 1, &x, &y) * j;
+
+	#pragma acc loop worker vector
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += select_addr (j & 1, &y, &x) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;
+	int answer = idx + ((j & 1) ? i * 5 : i * 7) * j
+		     + ((j & 1) ? i * 7 : i * 5) * j;
+	assert (arr[idx] == answer);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int *
+select_ptr (int s, int *x, int *y)
+{
+  if (s)
+    return x;
+  else
+    return y;
+}
+
+/* Worker propagation: writes through pointers.  */
+
+void
+worker_bcast_4 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+	int x, y, z;
+	int *p, *q, *r;
+
+	p = &x;
+	q = &y;
+
+	x = i * 5;
+	y = i * 7;
+	r = select_ptr (i & 1, p, q);
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += (x + y + 2 * (*p) + (*q)) * j;
+
+	/* This write can affect either x or y: both should be broadcast into
+	   the next loop.  */
+	(*r) += 20;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += (x + y + 2 * (*p) + (*q)) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;
+	int x = i * 5, y = i * 7;
+	int answer = idx + (3 * x + 2 * y) * j
+		     + ((i & 1) ? (3 * (x + 20) + 2 * y)
+				: (3 * x + 2 * (y + 20))) * j;
+	assert (arr[idx] == answer);
+      }
+}
+
+
+int main ()
+{
+  worker_bcast_1 ();
+  worker_bcast_2 ();
+  worker_bcast_3 ();
+  worker_bcast_4 ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables.c
index 53f03d1..f0c3447 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables.c
@@ -22,6 +22,8 @@
     arr[i] = 3;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 24 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 24 } */
   {
     int x;
 
@@ -295,6 +297,8 @@
     arr[i] = i;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 299 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 299 } */
   {
     #pragma acc loop gang private(x)
     for (i = 0; i < 32; i++)
@@ -320,6 +324,7 @@
     arr[i] = i;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 326 } */
   {
     #pragma acc loop gang private(x)
     for (i = 0; i < 32; i++)
@@ -348,6 +353,7 @@
     arr[i] = i;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 355 } */
   {
     #pragma acc loop gang private(x)
     for (i = 0; i < 32; i++)
@@ -376,6 +382,7 @@
     arr[i] = i;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 384 } */
   {
     #pragma acc loop gang private(x)
     for (i = 0; i < 32; i++)
@@ -408,6 +415,7 @@
     arr[i] = i;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 417 } */
   {
     #pragma acc loop gang private(x)
     for (i = 0; i < 32; i++)
@@ -438,6 +446,7 @@
     arr[i] = i;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 448 } */
   {
     #pragma acc loop gang private(pt)
     for (i = 0; i < 32; i++)
@@ -559,6 +568,7 @@
     arr[i] = i;
 
   #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 570 } */
   {
     int j;
 
@@ -875,6 +885,8 @@
     arr[i] = 3;
 
   #pragma acc parallel private(x) copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 887 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 887 } */
   {
     #pragma acc loop gang(static:1)
     for (i = 0; i < 32; i++)
@@ -904,6 +916,7 @@
     arr[i] = i;
 
   #pragma acc parallel private(x) copy(arr) num_gangs(32) num_workers(2) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 918 } */
   {
     #pragma acc loop gang
     for (i = 0; i < 32; i++)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c
new file mode 100644
index 0000000..206e66f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c
@@ -0,0 +1,41 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+#define N 100
+  int n = N;
+  int i, j, tmp;
+  int input[N*N], output[N], houtput[N];
+
+  for (i = 0; i < n * n; i++)
+    input[i] = i;
+
+  for (i = 0; i < n; i++)
+    {
+      tmp = 0;
+      for (j = 0; j < n; j++)
+	tmp += input[i * n + j];
+      houtput[i] = tmp;
+    }
+
+  #pragma acc parallel loop gang
+  for (i = 0; i < n; i++)
+    {
+      tmp = 0;
+
+      #pragma acc loop worker reduction(+:tmp)
+      for (j = 0; j < n; j++)
+	tmp += input[i * n + j];
+
+      output[i] = tmp;
+    }
+
+  /* Test if every worker-level reduction had correct private result.  */
+  for (i = 0; i < n; i++)
+    if (houtput[i] != output[i])
+      abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c
new file mode 100644
index 0000000..0c317dc
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c
@@ -0,0 +1,23 @@
+#include <assert.h>
+
+int
+main ()
+{
+  const int n = 1000;
+  int i, j, temp, a[n];
+
+#pragma acc parallel loop
+  for (i = 0; i < n; i++)
+    {
+      temp = i;
+#pragma acc loop reduction (+:temp)
+      for (j = 0; j < n; j++)
+	temp ++;
+      a[i] = temp;
+    }
+
+  for (i = 0; i < n; i++)
+    assert (a[i] == i+n);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c
index c4940b8..68ae919 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-7.c
@@ -14,6 +14,8 @@
     arr[i] = i;
 
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 16 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 16 } */
   {
     #pragma acc loop gang reduction(+:res)
     for (i = 0; i < 1024; i++)
@@ -28,6 +30,8 @@
   res = hres = 1;
 
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 32 } */
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 32 } */
   {
     #pragma acc loop gang reduction(*:res)
     for (i = 0; i < 12; i++)
@@ -52,6 +56,7 @@
     arr[i] = i;
 
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 58 } */
   {
     #pragma acc loop gang vector reduction(+:res)
     for (i = 0; i < 1024; i++)
@@ -76,6 +81,7 @@
     arr[i] = i;
 
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 83 } */
   {
     #pragma acc loop gang worker reduction(+:res)
     for (i = 0; i < 1024; i++)
@@ -239,6 +245,7 @@
 
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
 		       private(res) copyout(out)
+  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 246 } */
   {
     #pragma acc loop gang
     for (j = 0; j < 32; j++)
@@ -315,6 +322,7 @@
 
   #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
 		       private(res) copyout(out)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 323 } */
   {
     #pragma acc loop gang
     for (j = 0; j < 32; j++)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c
new file mode 100644
index 0000000..350174a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt-2.c
@@ -0,0 +1,32 @@
+#include <complex.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef float _Complex Type;
+
+#define N 32
+
+int
+main (void)
+{
+  Type ary[N];
+
+  for (int ix = 0; ix < N;  ix++)
+    ary[ix] = 1.0 + 1.0j;
+
+  Type tprod = 1.0;
+
+#pragma acc parallel vector_length(32)
+  {
+#pragma acc loop vector reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
+      tprod *= ary[ix];
+  }
+
+  Type expected = 65536.0;
+
+  if (tprod != expected)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-3.c
new file mode 100644
index 0000000..2cdd6bf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-3.c
@@ -0,0 +1,33 @@
+/* At -O0, we do get the expected "undefined reference to `foo'" link-time
+   error message (but the check needs to be done differently; compare to
+   routine-nohost-1.c), but for -O2 we don't; presumably because the function
+   gets inlined.
+   { dg-xfail-if "TODO" { *-*-* } { "-O0" } { "" } } */
+
+#include <stdlib.h>
+
+#pragma acc routine nohost
+int
+foo (int n)
+{
+  if (n == 0 || n == 1)
+    return 1;
+
+  return n * n;
+}
+
+int
+main()
+{
+  int a, n = 10;
+
+#pragma acc parallel copy (a, n)
+  {
+    a = foo (n);
+  }
+
+  if (a != n * n)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c
index a164f57..8c3b938 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-g-1.c
@@ -6,6 +6,8 @@
 
 #pragma acc routine gang
 void __attribute__ ((noinline)) gang (int ary[N])
+/* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 8 } */
+/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 8 } */
 {
 #pragma acc loop gang
     for (unsigned ix = 0; ix < N; ix++)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
index a97e046..da13d84 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@@ -30,14 +30,18 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int gangsize, workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
   {
     ondev = acc_on_device (acc_device_not_host);
     gang (ary);
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -45,11 +49,12 @@
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * vectorsize * workersize);
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-nohost-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-nohost-1.c
new file mode 100644
index 0000000..365af93
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-nohost-1.c
@@ -0,0 +1,18 @@
+/* { dg-do link } */
+
+extern int three (void);
+
+#pragma acc routine (three) nohost
+__attribute__((noinline))
+int three(void)
+{
+  return 3;
+}
+
+int main(void)
+{
+  return (three() == 3) ? 0 : 1;
+}
+
+/* Expecting link to fail; "undefined reference to `three'" (or similar).
+   { dg-excess-errors "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
index b1e3e3a..dd7bb6c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
@@ -30,14 +30,17 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+	    copyout(vectorsize)
   {
     ondev = acc_on_device (acc_device_not_host);
     vector (ary);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -47,7 +50,7 @@
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
index 81f1e03..bf92281 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
@@ -6,6 +6,7 @@
 
 #pragma acc routine worker
 void __attribute__ ((noinline)) worker (int ary[N])
+/* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 8 } */
 {
 #pragma acc loop worker
   for (unsigned ix = 0; ix < N; ix++)
@@ -30,14 +31,17 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize)
   {
     ondev = acc_on_device (acc_device_not_host);
     worker (ary);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -46,7 +50,7 @@
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  expected = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
index 23dbc1a..73696e4 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@@ -30,14 +30,18 @@
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize, vectorsize)
   {
     ondev = acc_on_device (acc_device_not_host);
     worker (ary);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -46,8 +50,8 @@
       if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
index 8862148..b469eed 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
@@ -2,8 +2,14 @@
 #include <openacc.h>
 #include <gomp-constants.h>
 
+#ifdef ACC_DEVICE_TYPE_gcn
+/* FIXME: Max. number of workers may increase for GCN in the future.  */
+#define NUM_WORKERS 4
+#define NUM_VECTORS 1
+#else
 #define NUM_WORKERS 16
 #define NUM_VECTORS 32
+#endif
 #define WIDTH 64
 #define HEIGHT 32
 
@@ -37,7 +43,8 @@
       ary[ix][jx] = 0xdeadbeef;
 
   printf ("spawning %d ...", nw); fflush (stdout);
-  
+
+/* { dg-warning "region contains vector partitioned code but is not vector partitioned" "vector" { target openacc_amdgcn_accel_selected } 48 } */
 #pragma acc parallel num_workers(nw) vector_length (NUM_VECTORS) copy (ary)
   {
     WorkVec ((int *)ary, WIDTH, HEIGHT, nw, NUM_VECTORS);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/serial-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/serial-dims.c
new file mode 100644
index 0000000..e373ebd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/serial-dims.c
@@ -0,0 +1,106 @@
+/* OpenACC dimensions with the serial construct.  */
+
+#include <limits.h>
+#include <openacc.h>
+#include <gomp-constants.h>
+
+/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
+   not behaving as expected for -O0.  */
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
+{
+  if (acc_on_device ((int) acc_device_host))
+    return 0;
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
+    return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
+  else
+    __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
+{
+  if (acc_on_device ((int) acc_device_host))
+    return 0;
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
+    return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
+  else
+    __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
+{
+  if (acc_on_device ((int) acc_device_host))
+    return 0;
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
+    return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
+  else
+    __builtin_abort ();
+}
+
+
+int main ()
+{
+  acc_init (acc_device_default);
+
+  /* Serial OpenACC constructs must get launched as 1 x 1 x 1.  */
+  {
+    int gangs_min, gangs_max;
+    int workers_min, workers_max;
+    int vectors_min, vectors_max;
+    int gangs_actual, workers_actual, vectors_actual;
+    int i, j, k;
+
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+    gangs_actual = workers_actual = vectors_actual = 1;
+#pragma acc serial
+    /* { dg-warning "region contains gang partitioned code but is not gang partitioned" "" { target *-*-* } 61 } */
+    /* { dg-warning "region contains worker partitioned code but is not worker partitioned" "" { target *-*-* } 61 } */
+    /* { dg-warning "region contains vector partitioned code but is not vector partitioned" "" { target *-*-* } 61 } */
+    /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } 61 } */
+    {
+      if (acc_on_device (acc_device_nvidia))
+	{
+	  /* The GCC nvptx back end enforces vector_length (32).  */
+	  vectors_actual = 32;
+	}
+      else if (acc_on_device (acc_device_gcn))
+	{
+	  /* AMD GCN relies on the autovectorizer for the vector dimension:
+	     the loop below isn't likely to be vectorized, so vectors_actual
+	     is effectively 1.  */
+	  vectors_actual = 1;
+	}
+      else if (!acc_on_device (acc_device_host))
+	__builtin_abort ();
+#pragma acc loop gang \
+  reduction (min: gangs_min, workers_min, vectors_min) \
+  reduction (max: gangs_max, workers_max, vectors_max)
+      for (i = 100 * gangs_actual; i > -100 * gangs_actual; i--)
+#pragma acc loop worker \
+  reduction (min: gangs_min, workers_min, vectors_min) \
+  reduction (max: gangs_max, workers_max, vectors_max)
+	for (j = 100 * workers_actual; j > -100 * workers_actual; j--)
+#pragma acc loop vector \
+  reduction (min: gangs_min, workers_min, vectors_min) \
+  reduction (max: gangs_max, workers_max, vectors_max)
+	  for (k = 100 * vectors_actual; k > -100 * vectors_actual; k--)
+	    {
+	      gangs_min = gangs_max = acc_gang ();
+	      workers_min = workers_max = acc_worker ();
+	      vectors_min = vectors_max = acc_vector ();
+	    }
+      if (gangs_min != 0 || gangs_max != gangs_actual - 1
+	  || workers_min != 0 || workers_max != workers_actual - 1
+	  || vectors_min != 0 || vectors_max != vectors_actual - 1)
+	__builtin_abort ();
+    }
+  }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
index 9db236c..0c9096f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
@@ -1,46 +1,24 @@
 
-#if ACC_DEVICE_TYPE_nvidia
+#pragma acc routine nohost
+void
+delay ()
+{
+  int i, sum;
+  const int N = 500000;
+
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
+}
 
 #pragma acc routine nohost
-static int clock (void)
-{
-  int thetime;
-
-  asm __volatile__ ("mov.u32 %0, %%clock;" : "=r"(thetime));
-
-  return thetime;
-}
-
-#endif
-
 void
-delay (unsigned long *d_o, unsigned long delay)
+delay2 (unsigned long *d_o, unsigned long tid)
 {
-  int start, ticks;
+  int i, sum;
+  const int N = 500000;
 
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
-
-  return;
-}
-
-void
-delay2 (unsigned long *d_o, unsigned long delay, unsigned long tid)
-{
-  int start, ticks;
-
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
 
   d_o[0] = tid;
-
-  return;
 }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
index 6f748fc..88b63bf 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
@@ -1,148 +1,90 @@
-// BEGIN PREAMBLE
-	.version	3.1
-	.target	sm_30
+	.version 3.1
+	.target sm_30
 	.address_size 64
-// END PREAMBLE
 
-// BEGIN FUNCTION DEF: clock
-.func (.param.u32 %out_retval)clock
-{
-.reg.u32 %retval;
-	.reg.u64 %hr10;
-	.reg.u32 %r22;
-	.reg.u32 %r23;
-	.reg.u32 %r24;
-	.local.align 8 .b8 %frame[8];
-	// #APP 
-// 7 "subr.c" 1
-	mov.u32 %r24, %clock;
-// 0 "" 2
-	// #NO_APP 
-		st.local.u32	[%frame], %r24;
-		ld.local.u32	%r22, [%frame];
-		mov.u32	%r23, %r22;
-		mov.u32	%retval, %r23;
-	st.param.u32	[%out_retval], %retval;
+	.visible .entry delay
+	{
+	.reg .u64 %hr10;
+	.reg .u32 %r22;
+	.reg .u32 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .pred %r30;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[16];
+	cvta.local.u64 %frame,%farray;
+	mov.u32 %r22,500000;
+	st.u32 [%frame+8],%r22;
+	mov.u32 %r23,0;
+	st.u32 [%frame],%r23;
+	bra $L2;
+	$L3:
+	ld.u32 %r25,[%frame+4];
+	add.u32 %r24,%r25,1;
+	st.u32 [%frame+4],%r24;
+	ld.u32 %r27,[%frame];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame],%r26;
+	$L2:
+	ld.u32 %r28,[%frame];
+	ld.u32 %r29,[%frame+8];
+	setp.lt.s32 %r30,%r28,%r29;
+	@%r30 
+	bra $L3;
 	ret;
 	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay
-.visible .entry delay(.param.u64 %in_ar1, .param.u64 %in_ar2)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u32 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u64 %r31;
-	.reg.pred %r32;
-	.local.align 8 .b8 %frame[24];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-		mov.u64	%r24, %ar1;
-		st.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
+
+	.visible .entry delay2 (.param .u64 %in_ar1, .param .u64 %in_ar2)
 	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r26, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r26;
-		mov.u32	%r27, 0;
-		st.local.u32	[%frame], %r27;
-		bra	$L4;
-$L5:
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r28, [%retval_in];
-}
-		mov.u32	%r23, %r28;
-		ld.local.u32	%r30, [%frame+4];
-		sub.u32	%r29, %r23, %r30;
-		st.local.u32	[%frame], %r29;
-$L4:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r31, [%frame+16];
-		setp.lo.u64 %r32,%r22,%r31;
-	@%r32	bra	$L5;
+	.reg .u64 %ar1;
+	.reg .u64 %ar2;
+	.reg .u64 %hr10;
+	.reg .u64 %r22;
+	.reg .u64 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .u32 %r30;
+	.reg .u32 %r31;
+	.reg .pred %r32;
+	.reg .u64 %r33;
+	.reg .u64 %r34;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[32];
+	cvta.local.u64 %frame,%farray;
+	ld.param.u64 %ar1,[%in_ar1];
+	ld.param.u64 %ar2,[%in_ar2];
+	mov.u64 %r22,%ar1;
+	st.u64 [%frame+16],%r22;
+	mov.u64 %r23,%ar2;
+	st.u64 [%frame+24],%r23;
+	mov.u32 %r24,500000;
+	st.u32 [%frame+8],%r24;
+	mov.u32 %r25,0;
+	st.u32 [%frame],%r25;
+	bra $L5;
+	$L6:
+	ld.u32 %r27,[%frame+4];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame+4],%r26;
+	ld.u32 %r29,[%frame];
+	add.u32 %r28,%r29,1;
+	st.u32 [%frame],%r28;
+	$L5:
+	ld.u32 %r30,[%frame];
+	ld.u32 %r31,[%frame+8];
+	setp.lt.s32 %r32,%r30,%r31;
+	@%r32 
+	bra $L6;
+	ld.u64 %r33,[%frame+16];
+	ld.u64 %r34,[%frame+24];
+	st.u64 [%r33],%r34;
 	ret;
 	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay2
-.visible .entry delay2(.param.u64 %in_ar1, .param.u64 %in_ar2, .param.u64 %in_ar3)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %ar3;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u64 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u32 %r31;
-	.reg.u64 %r32;
-	.reg.pred %r33;
-	.reg.u64 %r34;
-	.reg.u64 %r35;
-	.local.align 8 .b8 %frame[32];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-	ld.param.u64 %ar3, [%in_ar3];
-		mov.u64	%r24, %ar1;
-		st.local.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
-		mov.u64	%r26, %ar3;
-		st.local.u64	[%frame+24], %r26;
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r27, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r27;
-		mov.u32	%r28, 0;
-		st.local.u32	[%frame], %r28;
-		bra	$L8;
-$L9:
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r29, [%retval_in];
-}
-		mov.u32	%r23, %r29;
-		ld.local.u32	%r31, [%frame+4];
-		sub.u32	%r30, %r23, %r31;
-		st.local.u32	[%frame], %r30;
-$L8:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r32, [%frame+16];
-		setp.lo.u64 %r33,%r22,%r32;
-	@%r33	bra	$L9;
-		ld.local.u64	%r34, [%frame+8];
-		ld.local.u64	%r35, [%frame+24];
-		st.u64	[%r34], %r35;
-	ret;
-	}
-// END FUNCTION DEF
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
index 5130591..076e3cd 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
@@ -1,3 +1,6 @@
+/* AMD GCN does not use 32-lane vectors.
+   { dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
+
 /* { dg-additional-options "-fopenacc-dim=32" } */
 
 #include <stdio.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
index 53749da..e69de29 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
@@ -1,103 +0,0 @@
-
-#include <stdio.h>
-#include <cuda.h>
-
-static int _Tnum_timers;
-static CUevent *_Tstart_events, *_Tstop_events;
-static CUstream _Tstream;
-
-void
-init_timers (int ntimers)
-{
-  int i;
-  CUresult r;
-
-  _Tnum_timers = ntimers;
-
-  _Tstart_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-  _Tstop_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-
-  r = cuStreamCreate (&_Tstream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-      abort ();
-    }
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      r = cuEventCreate (&_Tstart_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-
-      r = cuEventCreate (&_Tstop_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-    }
-}
-
-void
-fini_timers (void)
-{
-  int i;
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      cuEventDestroy (_Tstart_events[i]);
-      cuEventDestroy (_Tstop_events[i]);
-    }
-
-  cuStreamDestroy (_Tstream);
-
-  free (_Tstart_events);
-  free (_Tstop_events);
-}
-
-void
-start_timer (int timer)
-{
-  CUresult r;
-
-  r = cuEventRecord (_Tstart_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-}
-
-float
-stop_timer (int timer)
-{
-  CUresult r;
-  float etime;
-
-  r = cuEventRecord (_Tstop_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventSynchronize (_Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventSynchronize failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventElapsedTime (&etime, _Tstart_events[timer], _Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventElapsedTime failed: %d\n", r);
-      abort ();
-    }
-
-  return etime;
-}
diff --git a/libgomp/testsuite/libgomp.oacc-c/c.exp b/libgomp/testsuite/libgomp.oacc-c/c.exp
index 55cd40f..9ab68bb 100644
--- a/libgomp/testsuite/libgomp.oacc-c/c.exp
+++ b/libgomp/testsuite/libgomp.oacc-c/c.exp
@@ -40,12 +40,12 @@
 
 # Test with all available offload targets, and with offloading disabled.
 set SAVE_ALWAYS_CFLAGS "$ALWAYS_CFLAGS"
-foreach offload_target [concat [split $offload_targets ","] "disable"] {
+global offload_target
+foreach offload_target [concat [split $offload_targets ":"] "disable"] {
     set ALWAYS_CFLAGS "$SAVE_ALWAYS_CFLAGS"
-    global openacc_device_type
-    set openacc_device_type [offload_target_to_openacc_device_type $offload_target]
-    set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=1"
 
+    set openacc_device_type [offload_target_to_openacc_device_type $offload_target]
+    set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=\"$offload_target\""
     switch $openacc_device_type {
 	"" {
 	    unsupported "$subdir $offload_target offloading"
@@ -53,6 +53,9 @@
 	}
 	host {
 	    set acc_mem_shared 1
+
+	    # Special case: pass the empty string instead of "disable".
+	    set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=\"\""
 	}
 	nvidia {
 	    if { ![check_effective_target_openacc_nvidia_accel_present] } {
@@ -69,6 +72,9 @@
 
 	    set acc_mem_shared 0
 	}
+	gcn {
+	    set acc_mem_shared 0
+	}
 	default {
 	    error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
 	}
@@ -79,8 +85,6 @@
     # handling in test cases, by default only build for the offload target
     # that we're actually going to test.
     set tagopt "$tagopt -foffload=$offload_target"
-    # Force usage of the corresponding OpenACC device type.
-    setenv ACC_DEVICE_TYPE $openacc_device_type
 
     # To get better test coverage for device-specific code that is only
     # ever used in offloading configurations, we'd like more thorough
diff --git a/libgomp/testsuite/libgomp.oacc-c/offload-targets-1.c b/libgomp/testsuite/libgomp.oacc-c/offload-targets-1.c
new file mode 100644
index 0000000..7265f48
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/offload-targets-1.c
@@ -0,0 +1,122 @@
+/* Test what happens for repeated GOMP_set_offload_targets calls, which happens
+   when shared libraries are involved, for example.  As in the libgomp
+   testsuite infrastructure, it is difficult to build and link against shared
+   libraries, we simulate that by replicating some relevant
+   GOMP_set_offload_targets calls.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <openacc.h>
+#include "libgomp_g.h"
+
+int main ()
+{
+  /* Before getting here, GOMP_set_offload_targets already got called via a
+     constructor.  */
+
+  bool acc_device_types_requested[_ACC_device_hwm];
+  for (int i = 0; i < _ACC_device_hwm; ++i)
+    acc_device_types_requested[i] = false;
+
+  /* We're building for only one offload target ("-foffload=[...]") which is
+     the following.  */
+  const char *offload_target_requested;
+  acc_device_t acc_device_type_requested;
+#if defined ACC_DEVICE_TYPE_nvidia
+  offload_target_requested = ACC_DEVICE_TYPE_nvidia;
+  acc_device_type_requested = acc_device_nvidia;
+#elif defined ACC_DEVICE_TYPE_gcn
+  offload_target_requested = ACC_DEVICE_TYPE_gcn;
+  acc_device_type_requested = acc_device_gcn;
+#elif defined ACC_DEVICE_TYPE_host
+  offload_target_requested = ACC_DEVICE_TYPE_host;
+  acc_device_type_requested = acc_device_host;
+#else
+# error Not ported to this ACC_DEVICE_TYPE
+#endif
+  acc_device_types_requested[acc_device_type_requested] = true;
+
+#ifdef OFFLOAD_TARGETS_SAME_AGAIN
+  /* Call again; will have no noticeable difference.  */
+  GOMP_set_offload_targets (offload_target_requested);
+#endif
+
+#ifdef OFFLOAD_TARGETS_ADD_EARLY
+  /* Request a (non-existing) offloading target (which will result in a
+     non-fatal diagnostic).  */
+  GOMP_set_offload_targets (OFFLOAD_TARGETS_ADD);
+#endif
+
+#ifdef OFFLOAD_TARGETS_SAME_AGAIN
+  /* Call again; will have no noticeable difference.  */
+  GOMP_set_offload_targets (offload_target_requested);
+  char *s;
+  {
+    size_t len = 3 * (strlen (offload_target_requested) + 1);
+# ifdef OFFLOAD_TARGETS_ADD_EARLY
+    len += 3 * (strlen (OFFLOAD_TARGETS_ADD) + 1);
+# endif
+    s = malloc (len);
+    if (s == NULL)
+      __builtin_abort ();
+    size_t len_;
+# ifndef OFFLOAD_TARGETS_ADD_EARLY
+    len_ = sprintf (s, "%s:%s:%s",
+		    offload_target_requested,
+		    offload_target_requested,
+		    offload_target_requested);
+# else
+    len_ = sprintf (s, "%s:%s:%s:%s:%s:%s",
+		    offload_target_requested,
+		    offload_target_requested,
+		    OFFLOAD_TARGETS_ADD,
+		    OFFLOAD_TARGETS_ADD,
+		    offload_target_requested,
+		    OFFLOAD_TARGETS_ADD);
+# endif
+    if (len_ + 1 != len)
+      __builtin_abort ();
+    GOMP_set_offload_targets (s);
+  }
+#endif
+
+  /* Calling acc_get_num_devices will implicitly initialize offloading.  */
+#if defined OFFLOAD_TARGETS_ADD_EARLY
+  fprintf (stderr, "CheCKpOInT1\n");
+#endif
+  /* acc_device_host is always available.  */
+  if ((acc_get_num_devices (acc_device_host) > 0) == false)
+    __builtin_abort ();
+#if defined OFFLOAD_TARGETS_ADD_EARLY
+  fprintf (stderr, "WrONg WAy1\n");
+#endif
+  for (acc_device_t acc_device_type = acc_device_not_host + 1;
+       acc_device_type < _ACC_device_hwm;
+       ++acc_device_type)
+    {
+      /* The requested device type must be available.  Any other device types
+	 must not be available.  */
+      if ((acc_get_num_devices (acc_device_type) > 0)
+	  != acc_device_types_requested[acc_device_type])
+	__builtin_abort ();
+    }
+
+#ifdef OFFLOAD_TARGETS_SAME_AGAIN
+  /* Request the same again; will have no noticeable difference.  */
+  GOMP_set_offload_targets (offload_target_requested);
+#endif
+#if defined OFFLOAD_TARGETS_ADD_LATE
+  fprintf (stderr, "CheCKpOInT2\n");
+  GOMP_set_offload_targets (OFFLOAD_TARGETS_ADD);
+  fprintf (stderr, "WrONg WAy2\n");
+#endif
+#ifdef OFFLOAD_TARGETS_SAME_AGAIN
+  GOMP_set_offload_targets (s);
+
+  /* Implementation defail: OK to "free (s)", in this case.  */
+  free (s);
+#endif
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/offload-targets-2.c b/libgomp/testsuite/libgomp.oacc-c/offload-targets-2.c
new file mode 100644
index 0000000..977c559
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/offload-targets-2.c
@@ -0,0 +1,2 @@
+#define OFFLOAD_TARGETS_SAME_AGAIN
+#include "offload-targets-1.c"
diff --git a/libgomp/testsuite/libgomp.oacc-c/offload-targets-3.c b/libgomp/testsuite/libgomp.oacc-c/offload-targets-3.c
new file mode 100644
index 0000000..1eb080b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/offload-targets-3.c
@@ -0,0 +1,10 @@
+#define OFFLOAD_TARGETS_ADD "XYZ"
+#define OFFLOAD_TARGETS_ADD_EARLY
+#include "offload-targets-1.c"
+
+/*
+  { dg-output "CheCKpOInT1(\n|\r\n|\r)+" }
+  { dg-output "libgomp: Unknown offload target: XYZ(\n|\r\n|\r)+" }
+  { dg-output "$" }
+  { dg-shouldfail ""  }
+*/
diff --git a/libgomp/testsuite/libgomp.oacc-c/offload-targets-4.c b/libgomp/testsuite/libgomp.oacc-c/offload-targets-4.c
new file mode 100644
index 0000000..2bb7204
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/offload-targets-4.c
@@ -0,0 +1,11 @@
+#define OFFLOAD_TARGETS_SAME_AGAIN
+#define OFFLOAD_TARGETS_ADD "XYZ"
+#define OFFLOAD_TARGETS_ADD_EARLY
+#include "offload-targets-1.c"
+
+/*
+  { dg-output "CheCKpOInT1(\n|\r\n|\r)+" }
+  { dg-output "libgomp: Unknown offload target: XYZ(\n|\r\n|\r)+" }
+  { dg-output "$" }
+  { dg-shouldfail ""  }
+*/
diff --git a/libgomp/testsuite/libgomp.oacc-c/offload-targets-5.c b/libgomp/testsuite/libgomp.oacc-c/offload-targets-5.c
new file mode 100644
index 0000000..8ba0792
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/offload-targets-5.c
@@ -0,0 +1,10 @@
+#define OFFLOAD_TARGETS_ADD "XYZ"
+#define OFFLOAD_TARGETS_ADD_LATE
+#include "offload-targets-1.c"
+
+/*
+  { dg-output "CheCKpOInT2(\n|\r\n|\r)+" }
+  { dg-output "libgomp: Can't satisfy request for offload targets: XYZ; have loaded: \[a-z-\]*(\n|\r\n|\r)+" }
+  { dg-output "$" }
+  { dg-shouldfail ""  }
+*/
diff --git a/libgomp/testsuite/libgomp.oacc-c/offload-targets-6.c b/libgomp/testsuite/libgomp.oacc-c/offload-targets-6.c
new file mode 100644
index 0000000..4b15582
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/offload-targets-6.c
@@ -0,0 +1,11 @@
+#define OFFLOAD_TARGETS_SAME_AGAIN
+#define OFFLOAD_TARGETS_ADD "XYZ"
+#define OFFLOAD_TARGETS_ADD_LATE
+#include "offload-targets-1.c"
+
+/*
+  { dg-output "CheCKpOInT2(\n|\r\n|\r)+" }
+  { dg-output "libgomp: Can't satisfy request for offload targets: XYZ; have loaded: \[a-z-\]*(\n|\r\n|\r)+" }
+  { dg-output "$" }
+  { dg-shouldfail ""  }
+*/
diff --git a/libgomp/testsuite/libgomp.oacc-c/pr85465.c b/libgomp/testsuite/libgomp.oacc-c/pr85465.c
new file mode 100644
index 0000000..329e8a0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/pr85465.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-w" } */
+
+int
+main (void)
+{
+#pragma acc parallel
+  foo ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/print-1.c b/libgomp/testsuite/libgomp.oacc-c/print-1.c
new file mode 100644
index 0000000..593885b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/print-1.c
@@ -0,0 +1,17 @@
+/* Ensure that printf on the offload device works.  */
+
+/* { dg-do run } */
+/* { dg-output "The answer is 42(\n|\r\n|\r)+" } */
+
+#include <stdio.h>
+
+int var = 42;
+
+int
+main ()
+{
+#pragma acc parallel
+    {
+      printf ("The answer is %d\n", var);
+    }
+}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc-get-property.f b/libgomp/testsuite/libgomp.oacc-fortran/acc-get-property.f
new file mode 100644
index 0000000..9ffeb05
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc-get-property.f
@@ -0,0 +1,33 @@
+! Test the `acc_get_property' and '`acc_get_property_string' library
+! functions.
+! { dg-do run }
+
+      USE OPENACC
+      IMPLICIT NONE
+
+      INTEGER(ACC_DEVICE_PROPERTY) V
+      CHARACTER*256 S
+      LOGICAL R
+
+      ! Verify that the vendor is a non-empty string.
+      CALL ACC_GET_PROPERTY_STRING (0, ACC_DEVICE_DEFAULT,
+     +                              ACC_PROPERTY_VENDOR, S)
+      R = S /= ""
+      IF (S /= "") PRINT "(A, A)", "OpenACC vendor: ", TRIM (S)
+
+      ! For the rest just check that they do not crash.
+      V = ACC_GET_PROPERTY (0, ACC_DEVICE_DEFAULT,
+     +                      ACC_PROPERTY_MEMORY)
+      IF (V /= 0) PRINT "(A, I0)", "OpenACC total memory: ", V
+      V = ACC_GET_PROPERTY (0, ACC_DEVICE_DEFAULT,
+     +                      ACC_PROPERTY_FREE_MEMORY)
+      IF (V /= 0) PRINT "(A, I0)", "OpenACC free memory: ", V
+      CALL ACC_GET_PROPERTY_STRING (0, ACC_DEVICE_DEFAULT,
+     +                              ACC_PROPERTY_NAME, S)
+      IF (S /= "") PRINT "(A, A)", "OpenACC name: ", TRIM (S)
+      CALL ACC_GET_PROPERTY_STRING (0, ACC_DEVICE_DEFAULT,
+     +                              ACC_PROPERTY_DRIVER, S)
+      IF (S /= "") PRINT "(A, A)", "OpenACC driver: ", TRIM (S)
+
+      IF (.NOT. R) STOP 1
+      END
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90
index d6c67a0..6a82385 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-1.f90
@@ -25,7 +25,7 @@
 !$acc end parallel
 
 
-#if !ACC_DEVICE_TYPE_host
+#ifndef ACC_DEVICE_TYPE_host
 
 ! Offloaded.
 
@@ -33,7 +33,7 @@
 if (acc_on_device (acc_device_none)) STOP 9
 if (acc_on_device (acc_device_host)) STOP 10
 if (.not. acc_on_device (acc_device_not_host)) STOP 11
-#if ACC_DEVICE_TYPE_nvidia
+#ifdef ACC_DEVICE_TYPE_nvidia
 if (.not. acc_on_device (acc_device_nvidia)) STOP 12
 #else
 if (acc_on_device (acc_device_nvidia)) STOP 13
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f
index 75e2450..1b9f9ac 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-2.f
@@ -25,7 +25,7 @@
 !$ACC END PARALLEL
 
 
-#if !ACC_DEVICE_TYPE_host
+#ifndef ACC_DEVICE_TYPE_host
 
 ! Offloaded.
 
@@ -33,7 +33,7 @@
       IF (ACC_ON_DEVICE (ACC_DEVICE_NONE)) STOP 9
       IF (ACC_ON_DEVICE (ACC_DEVICE_HOST)) STOP 10
       IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_NOT_HOST)) STOP 11
-#if ACC_DEVICE_TYPE_nvidia
+#ifdef ACC_DEVICE_TYPE_nvidia
       IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 12
 #else
       IF (ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 13
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f
index 908d185..82bf954 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_on_device-1-3.f
@@ -25,7 +25,7 @@
 !$ACC END PARALLEL
 
 
-#if !ACC_DEVICE_TYPE_host
+#ifndef ACC_DEVICE_TYPE_host
 
 ! Offloaded.
 
@@ -33,7 +33,7 @@
       IF (ACC_ON_DEVICE (ACC_DEVICE_NONE)) STOP 9
       IF (ACC_ON_DEVICE (ACC_DEVICE_HOST)) STOP 10
       IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_NOT_HOST)) STOP 11
-#if ACC_DEVICE_TYPE_nvidia
+#ifdef ACC_DEVICE_TYPE_nvidia
       IF (.NOT. ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 12
 #else
       IF (ACC_ON_DEVICE (ACC_DEVICE_NVIDIA)) STOP 13
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/allocatable-array-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-array-1.f90
new file mode 100644
index 0000000..3758031
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-array-1.f90
@@ -0,0 +1,30 @@
+! Ensure that dummy arguments of allocatable arrays don't cause
+! "libgomp: [...] is not mapped" errors.
+
+! { dg-do run }
+
+program main
+  integer, parameter :: n = 40
+  integer, allocatable :: ar(:,:,:)
+  integer :: i
+
+  allocate (ar(1:n,0:n-1,0:n-1))
+  !$acc enter data copyin (ar)
+
+  !$acc update host (ar)
+
+  !$acc update device (ar)
+
+  call update_ar (ar, n)
+
+  !$acc exit data copyout (ar)
+end program main
+
+subroutine update_ar (ar, n)
+  integer :: n
+  integer, dimension (1:n,0:n-1,0:n-1) :: ar
+
+  !$acc update host (ar)
+
+  !$acc update device (ar)
+end subroutine update_ar
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90 b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90
new file mode 100644
index 0000000..42b3408
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/allocatable-scalar.f90
@@ -0,0 +1,33 @@
+! Test non-declared allocatable scalars in OpenACC data clauses.
+
+! { dg-do run }
+
+program main
+  implicit none
+  integer, parameter :: n = 100
+  integer, allocatable :: a, c
+  integer :: i, b(n)
+
+  allocate (a)
+
+  a = 50
+
+  !$acc parallel loop
+  do i = 1, n;
+     b(i) = a
+  end do
+
+  do i = 1, n
+     if (b(i) /= a) stop 1
+  end do
+
+  allocate (c)
+
+  !$acc parallel copyout(c) num_gangs(1)
+  c = a
+  !$acc end parallel
+
+  if (c /= a) stop 2
+
+  deallocate (a, c)
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
index 5a4a1e0..9dd2339 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
@@ -275,7 +275,7 @@
   if (ltmp .neqv. .not. lexp) STOP 33
   if (lgot .neqv. lexp) STOP 34
 
-  igot = 1
+  igot = 0
   iexp = N
 
   !$acc parallel loop copy (igot, itmp)
@@ -287,12 +287,16 @@
     end do
   !$acc end parallel loop
 
+  itmp = 0
   do i = 1, N
-     if (.not. (1 <= iarr(i) .and. iarr(i) < iexp)) STOP 35
+     if (iarr(i) == 0 .and. itmp == 0) itmp = i
+  end do
+  do i = 1, N
+     if (iarr(i) == 0 .and. i /= itmp) STOP 35
   end do
   if (igot /= iexp) STOP 36
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -304,8 +308,12 @@
     end do
   !$acc end parallel loop
 
+  itmp = 0
   do i = 1, N
-     if (.not. (iarr(i) == 1 .or. iarr(i) == N)) STOP 37
+     if (iarr(i) == N + 1 .and. itmp == 0) itmp = i
+  end do
+  do i = 1, N
+      if (iarr(i) == N + 1 .and. i /= itmp) STOP 37
   end do
   if (igot /= iexp) STOP 38
 
@@ -314,7 +322,7 @@
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       iarr(i) = igot
       igot = iand (igot, iexpr)
@@ -322,9 +330,13 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 39
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 39
   if (igot /= iexp) STOP 40
 
   igot = 0
@@ -340,10 +352,14 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= 0)) STOP 41
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 42
+  if (igot /= iexp) STOP 41
+  if (itmp /= 496) STOP 42
 
   igot = -1
   iexp = 0 
@@ -358,12 +374,16 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 43
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 44
+  if (igot /= iexp) STOP 43
+  if (itmp /= 528) STOP 44
 
-  igot = 1
+  igot = 0
   iexp = N
 
   !$acc parallel loop copy (igot, itmp)
@@ -375,12 +395,16 @@
     end do
   !$acc end parallel loop
 
+  itmp = 0
   do i = 1, N
-     if (.not. (1 <= iarr(i) .and. iarr(i) < iexp)) STOP 45
+     if (iarr(i) == 0 .and. itmp == 0) itmp = i
+  end do
+  do i = 1, N
+     if (iarr(i) == 0 .and. itmp /= i) STOP 45
   end do
   if (igot /= iexp) STOP 46
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -392,8 +416,12 @@
     end do
   !$acc end parallel loop
 
+  itmp = 0
   do i = 1, N
-     if (.not. (iarr(i) == 1 .or. iarr(i) == N)) STOP 47
+     if (iarr(i) == N + 1 .and. itmp == 0) itmp = i
+  end do
+  do i = 1, N
+      if (iarr(i) == N + 1 .and. i /= itmp) STOP 47
   end do
   if (igot /= iexp) STOP 48
 
@@ -402,7 +430,7 @@
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       iarr(i) = igot
       igot = iand (iexpr, igot)
@@ -410,14 +438,18 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 49
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 49
   if (igot /= iexp) STOP 50
 
   igot = 0
   iexp = -1 
-	!!
+
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
       iexpr = lshift (1, i)
@@ -428,10 +460,14 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= 0)) STOP 51
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 52
+  if (igot /= iexp) STOP 51
+  if (itmp /= 496) STOP 52
 
   igot = -1
   iexp = 0 
@@ -446,10 +482,14 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 53
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 54
+  if (igot /= iexp) STOP 53
+  if (itmp /= 528) STOP 54
 
   fgot = 1234.0
   fexp = 1266.0
@@ -720,7 +760,7 @@
   end do
   if (igot /= iexp) STOP 88
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -733,7 +773,7 @@
   !$acc end parallel loop
 
   do i = 1, N
-     if (.not. (iarr(i) == iexp)) STOP 89
+     if (iarr(i) .lt. 1 .or. iarr(i) .gt. N) STOP 89
   end do
   if (igot /= iexp) STOP 90
 
@@ -742,7 +782,7 @@
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       igot = iand (igot, iexpr)
       iarr(i) = igot
@@ -750,9 +790,13 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= 0)) STOP 91
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 91
   if (igot /= iexp) STOP 92
 
   igot = 0
@@ -768,9 +812,13 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= -1)) STOP 93
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 93
   if (igot /= iexp) STOP 94
 
   igot = -1
@@ -786,9 +834,13 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= 0)) STOP 95
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 95
   if (igot /= iexp) STOP 96
 
   igot = 1
@@ -808,7 +860,7 @@
   end do
   if (igot /= iexp) STOP 98
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -821,7 +873,7 @@
   !$acc end parallel loop
 
   do i = 1, N
-     if (.not. (iarr(i) == iexp )) STOP 99
+     if (iarr(i) .lt. 1 .or. iarr(i) .gt. N) STOP 99
   end do
   if (igot /= iexp) STOP 100
 
@@ -830,7 +882,7 @@
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       igot = iand (iexpr, igot)
       iarr(i) = igot
@@ -838,9 +890,13 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= 0)) STOP 101
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 101
   if (igot /= iexp) STOP 102
 
   igot = 0
@@ -856,9 +912,13 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= iexp)) STOP 103
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 103
   if (igot /= iexp) STOP 104
 
   igot = -1
@@ -874,9 +934,12 @@
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= iexp)) STOP 105
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 105
   if (igot /= iexp) STOP 106
-
 end program
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/class-ptr-param.f95 b/libgomp/testsuite/libgomp.oacc-fortran/class-ptr-param.f95
new file mode 100644
index 0000000..8014733
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/class-ptr-param.f95
@@ -0,0 +1,34 @@
+! { dg-do run }
+
+module typemod
+
+type mytype
+  integer :: a
+end type mytype
+
+contains
+
+subroutine mysub(c)
+  implicit none
+
+  class(mytype), allocatable :: c
+
+!$acc parallel copy(c)
+  c%a = 5
+!$acc end parallel
+end subroutine mysub
+
+end module typemod
+
+program main
+  use typemod
+  implicit none
+
+  class(mytype), allocatable :: myvar
+  allocate(mytype :: myvar)
+
+  myvar%a = 0
+  call mysub(myvar)
+
+  if (myvar%a .ne. 5) stop 1
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/classtypes-1.f95 b/libgomp/testsuite/libgomp.oacc-fortran/classtypes-1.f95
new file mode 100644
index 0000000..f16f42f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/classtypes-1.f95
@@ -0,0 +1,48 @@
+! { dg-do run }
+
+module typemod
+
+type :: typeimpl
+  real, pointer :: p(:) => null()
+end type typeimpl
+
+type :: basictype
+  class(typeimpl), pointer :: p => null()
+end type basictype
+
+type, extends(basictype) :: regulartype
+  character :: void
+end type regulartype
+
+end module typemod
+
+program main
+  use typemod
+  implicit none
+  type(regulartype), pointer :: myvar
+  integer :: i
+  real :: j, k
+
+  allocate(myvar)
+  allocate(myvar%p)
+  allocate(myvar%p%p(1:100))
+
+  do i=1,100
+    myvar%p%p(i) = -1.0
+  end do
+
+!$acc enter data copyin(myvar, myvar%p) create(myvar%p%p)
+
+!$acc parallel loop present(myvar%p%p)
+  do i=1,100
+    myvar%p%p(i) = i * 2
+  end do
+!$acc end parallel loop
+
+!$acc exit data copyout(myvar%p%p) delete(myvar, myvar%p)
+
+  do i=1,100
+    if (myvar%p%p(i) .ne. i * 2) stop 1
+  end do
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/classtypes-2.f95 b/libgomp/testsuite/libgomp.oacc-fortran/classtypes-2.f95
new file mode 100644
index 0000000..ad80ec2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/classtypes-2.f95
@@ -0,0 +1,106 @@
+! { dg-do run }
+
+module wrapper_mod
+
+type compute
+  integer, allocatable :: block(:,:)
+contains
+  procedure :: initialize
+end type compute
+
+type, extends(compute) :: cpu_compute
+  integer :: blocksize
+contains
+  procedure :: setblocksize
+end type cpu_compute
+
+type, extends(compute) :: gpu_compute
+  integer :: numgangs
+  integer :: numworkers
+  integer :: vectorsize
+  integer, allocatable :: gpu_block(:,:)
+contains
+  procedure :: setdims
+end type gpu_compute
+
+contains
+
+subroutine initialize(c, length, width)
+  implicit none
+  class(compute) :: c
+  integer :: length
+  integer :: width
+  integer :: i
+  integer :: j
+
+  allocate (c%block(length, width))
+
+  do i=1,length
+    do j=1, width
+      c%block(i,j) = i + j
+    end do
+  end do
+end subroutine initialize
+
+subroutine setdims(c, g, w, v)
+  implicit none
+  class(gpu_compute) :: c
+  integer :: g
+  integer :: w
+  integer :: v
+  c%numgangs = g
+  c%numworkers = w
+  c%vectorsize = v
+end subroutine setdims
+
+subroutine setblocksize(c, bs)
+  implicit none
+  class(cpu_compute) :: c
+  integer :: bs
+  c%blocksize = bs
+end subroutine setblocksize
+
+end module wrapper_mod
+
+program main
+  use wrapper_mod
+  implicit none
+  class(compute), allocatable, target :: mycomp
+  integer :: i, j
+
+  allocate(gpu_compute::mycomp)
+
+  call mycomp%initialize(1024,1024)
+
+  !$acc enter data copyin(mycomp)
+
+  select type (mycomp)
+  type is (cpu_compute)
+    call mycomp%setblocksize(32)
+  type is (gpu_compute)
+    call mycomp%setdims(32,32,32)
+    allocate(mycomp%gpu_block(1024,1024))
+    !$acc update device(mycomp)
+    !$acc parallel copyin(mycomp%block) copyout(mycomp%gpu_block)
+    !$acc loop gang worker vector collapse(2)
+    do i=1,1024
+      do j=1,1024
+        mycomp%gpu_block(i,j) = mycomp%block(i,j) + 1
+      end do
+    end do
+    !$acc end parallel
+  end select
+
+  !$acc exit data copyout(mycomp)
+
+  select type (g => mycomp)
+  type is (gpu_compute)
+  do i = 1, 1024
+    do j = 1, 1024
+      if (g%gpu_block(i,j) .ne. i + j + 1) stop 1
+    end do
+  end do
+  end select
+
+  deallocate(mycomp)
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90
index 918c5d0..659e5db 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90
@@ -6,7 +6,7 @@
   l = .false.
   a(:, :, :) = 0
   !$acc parallel
-  !$acc loop collapse(4 - 1)
+  !$acc loop collapse(4 - 1) gang(static:*)
     do i = 1, 3
       do j = 4, 6
         do k = 5, 7
@@ -14,7 +14,7 @@
         end do
       end do
     end do
-  !$acc loop collapse(2) reduction(.or.:l)
+  !$acc loop collapse(3) gang(static:*) reduction(.or.:l)
     do i = 1, 3
       do j = 4, 6
         do k = 5, 7
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90
index 98b6987..409d979 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90
@@ -7,13 +7,13 @@
   l = .false.
   a(:, :, :) = 0
   !$acc parallel
-  !$acc loop collapse(4 - 1)
+  !$acc loop collapse(4 - 1) gang(static:*)
     do 164 i = 1, 3
       do 164 j = 4, 6
         do 164 k = 5, 7
           a(i, j, k) = i + j + k
 164      end do
-  !$acc loop collapse(2) reduction(.or.:l)
+  !$acc loop collapse(3) gang(static:*) reduction(.or.:l)
 firstdo: do i = 1, 3
       do j = 4, 6
         do k = 5, 7
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/common-block-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/common-block-1.f90
new file mode 100644
index 0000000..9f40297
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/common-block-1.f90
@@ -0,0 +1,105 @@
+! Test data located inside common blocks.  This test does not execrise
+! ACC DECLARE.
+
+module const
+  integer, parameter :: n = 100
+end module const
+
+subroutine check
+  use const
+
+  implicit none
+  integer i, x(n), y
+  common /BLOCK/ x, y
+
+  do i = 1, n
+     if (x(i) .ne. y) call abort
+  end do
+end subroutine check
+
+module m
+  use const
+  integer a(n), b
+  common /BLOCK/ a, b
+
+contains
+  subroutine mod_implicit_incr
+    implicit none
+    integer i
+
+    !$acc parallel loop
+    do i = 1, n
+       a(i) = b
+    end do
+    !$acc end parallel loop
+
+    call check
+  end subroutine mod_implicit_incr
+
+  subroutine mod_explicit_incr
+    implicit none
+    integer i
+
+    !$acc parallel loop copy(a(1:n)) copyin(b)
+    do i = 1, n
+       a(i) = b
+    end do
+    !$acc end parallel loop
+
+    call check
+  end subroutine mod_explicit_incr
+end module m
+
+subroutine sub_implicit_incr
+  use const
+
+  implicit none
+  integer i, x(n), y
+  common /BLOCK/ x, y
+
+  !$acc parallel loop
+  do i = 1, n
+     x(i) = y
+  end do
+  !$acc end parallel loop
+
+  call check
+end subroutine sub_implicit_incr
+
+subroutine sub_explicit_incr
+  use const
+
+  implicit none
+  integer i, x(n), y
+  common /BLOCK/ x, y
+
+  !$acc parallel loop copy(x(1:n)) copyin(y)
+  do i = 1, n
+     x(i) = y
+  end do
+  !$acc end parallel loop
+
+  call check
+end subroutine sub_explicit_incr
+
+program main
+  use m
+
+  implicit none
+
+  a(:) = -1
+  b = 5
+  call mod_implicit_incr
+
+  a(:) = -2
+  b = 6
+  call mod_explicit_incr
+
+  a(:) = -3
+  b = 7
+  call sub_implicit_incr
+
+  a(:) = -4
+  b = 8
+  call sub_explicit_incr
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/common-block-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/common-block-2.f90
new file mode 100644
index 0000000..bf17fc5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/common-block-2.f90
@@ -0,0 +1,150 @@
+! Test data located inside common blocks.  This test does not execrise
+! ACC DECLARE.  All data clauses are explicit.
+
+module consts
+  integer, parameter :: n = 100
+end module consts
+
+subroutine validate
+  use consts
+
+  implicit none
+  integer i, j
+  real*4 x(n), y(n), z
+  common /BLOCK/ x, y, z, j
+
+  do i = 1, n
+     if (abs(x(i) - i - z) .ge. 0.0001) call abort
+  end do
+end subroutine validate
+
+subroutine incr
+  use consts
+
+  implicit none
+  integer i, j
+  real*4 x(n), y(n), z
+  common /BLOCK/ x, y, z, j
+
+  !$acc parallel loop pcopy(/BLOCK/)
+  do i = 1, n
+     x(i) = x(i) + z
+  end do
+  !$acc end parallel loop
+end subroutine incr
+
+program main
+  use consts
+
+  implicit none
+  integer i, j
+  real*4 a(n), b(n), c
+  common /BLOCK/ a, b, c, j
+
+  ! Test copyout, pcopy, device
+
+  !$acc data copyout(a, c)
+
+  c = 1.0
+
+  !$acc update device(c)
+
+  !$acc parallel loop pcopy(a)
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end parallel loop
+
+  call incr
+  call incr
+  call incr
+  !$acc end data
+
+  c = 3.0
+  call validate
+
+  ! Test pcopy without copyout
+
+  c = 2.0
+  call incr
+  c = 5.0
+  call validate
+
+  ! Test create, delete, host, copyout, copyin
+
+  !$acc enter data create(b)
+
+  !$acc parallel loop pcopy(b)
+  do i = 1, n
+     b(i) = i
+  end do
+  !$acc end parallel loop
+
+  !$acc update host (b)
+
+  !$acc parallel loop pcopy(b) copyout(a) copyin(c)
+  do i = 1, n
+     a(i) = b(i) + c
+  end do
+  !$acc end parallel loop
+
+  !$acc exit data delete(b)
+
+  call validate
+
+  a(:) = b(:)
+  c = 0.0
+  call validate
+
+  ! Test copy
+
+  c = 1.0
+  !$acc parallel loop copy(/BLOCK/)
+  do i = 1, n
+     a(i) = b(i) + c
+  end do
+  !$acc end parallel loop
+
+  call validate
+
+  ! Test pcopyin, pcopyout FIXME
+
+  c = 2.0
+  !$acc data copyin(b, c) copyout(a)
+
+  !$acc parallel loop pcopyin(b, c) pcopyout(a)
+  do i = 1, n
+     a(i) = b(i) + c
+  end do
+  !$acc end parallel loop
+
+  !$acc end data
+
+  call validate
+
+  ! Test reduction, private
+
+  j = 0
+
+  !$acc parallel private(i) copy(j)
+  !$acc loop reduction(+:j)
+  do i = 1, n
+     j = j + 1
+  end do
+  !$acc end parallel
+
+  if (j .ne. n) call abort
+
+  ! Test firstprivate, copy
+
+  a(:) = 0
+  c = j
+
+  !$acc parallel loop firstprivate(c) copyout(a)
+  do i = 1, n
+     a(i) = i + c
+  end do
+  !$acc end parallel loop
+
+  call validate
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/common-block-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/common-block-3.f90
new file mode 100644
index 0000000..134e2d1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/common-block-3.f90
@@ -0,0 +1,137 @@
+! Test data located inside common blocks.  This test does not execrise
+! ACC DECLARE.  Most of the data clauses are implicit.
+
+module consts
+  integer, parameter :: n = 100
+end module consts
+
+subroutine validate
+  use consts
+
+  implicit none
+  integer i, j
+  real*4 x(n), y(n), z
+  common /BLOCK/ x, y, z, j
+
+  do i = 1, n
+     if (abs(x(i) - i - z) .ge. 0.0001) call abort
+  end do
+end subroutine validate
+
+subroutine incr_parallel
+  use consts
+
+  implicit none
+  integer i, j
+  real*4 x(n), y(n), z
+  common /BLOCK/ x, y, z, j
+
+  !$acc parallel loop
+  do i = 1, n
+     x(i) = x(i) + z
+  end do
+  !$acc end parallel loop
+end subroutine incr_parallel
+
+subroutine incr_kernels
+  use consts
+
+  implicit none
+  integer i, j
+  real*4 x(n), y(n), z
+  common /BLOCK/ x, y, z, j
+
+  !$acc kernels
+  do i = 1, n
+     x(i) = x(i) + z
+  end do
+  !$acc end kernels
+end subroutine incr_kernels
+
+program main
+  use consts
+
+  implicit none
+  integer i, j
+  real*4 a(n), b(n), c
+  common /BLOCK/ a, b, c, j
+
+  !$acc data copyout(a, c)
+
+  c = 1.0
+
+  !$acc update device(c)
+
+  !$acc parallel loop
+  do i = 1, n
+     a(i) = i
+  end do
+  !$acc end parallel loop
+
+  call incr_parallel
+  call incr_parallel
+  call incr_parallel
+  !$acc end data
+
+  c = 3.0
+  call validate
+
+  ! Test pcopy without copyout
+
+  c = 2.0
+  call incr_kernels
+  c = 5.0
+  call validate
+
+  !$acc kernels
+  do i = 1, n
+     b(i) = i
+  end do
+  !$acc end kernels
+
+  !$acc parallel loop
+  do i = 1, n
+     a(i) = b(i) + c
+  end do
+  !$acc end parallel loop
+
+  call validate
+
+  a(:) = b(:)
+  c = 0.0
+  call validate
+
+  ! Test copy
+
+  c = 1.0
+  !$acc parallel loop
+  do i = 1, n
+     a(i) = b(i) + c
+  end do
+  !$acc end parallel loop
+
+  call validate
+
+  c = 2.0
+  !$acc data copyin(b, c) copyout(a)
+
+  !$acc kernels
+  do i = 1, n
+     a(i) = b(i) + c
+  end do
+  !$acc end kernels
+
+  !$acc end data
+
+  call validate
+
+  j = 0
+
+  !$acc parallel loop reduction(+:j)
+  do i = 1, n
+     j = j + 1
+  end do
+  !$acc end parallel loop
+
+  if (j .ne. n) call abort
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/data-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
index 83a5400..6bb92c1 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
@@ -1,4 +1,5 @@
 ! { dg-do run }
+! { dg-additional-options "-cpp" }
 
 program test
   use openacc
@@ -70,10 +71,14 @@
     end do
   !$acc end parallel
   
-  !$acc exit data copyout (d(1:N)) async
+  !$acc exit data delete (c(1:N)) copyout (d(1:N)) async
   !$acc exit data async
   !$acc wait
 
+#if !ACC_MEM_SHARED
+  if (acc_is_present (c) .eqv. .TRUE.) call abort
+#endif
+
   do i = 1, N
     if (d(i) .ne. 4.0) call abort
   end do
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
index 19eb4bd..b5586be 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
@@ -55,7 +55,8 @@
   c(:) = 0.0
   d(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N))
 
   !$acc parallel async (1)
   do i = 1, N
@@ -76,7 +77,8 @@
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N))
 
   do i = 1, N
      if (a(i) .ne. 3.0) STOP 5
@@ -91,7 +93,8 @@
   d(:) = 0.0
   e(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N)) copyin (e(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N)) copyin (e(1:N))
 
   !$acc parallel async (1)
   do i = 1, N
@@ -118,7 +121,8 @@
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N)) copyout (e(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N)) copyout (e(1:N))
   !$acc exit data delete (N)
 
   do i = 1, N
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90
new file mode 100644
index 0000000..ec7f7c1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-1.f90
@@ -0,0 +1,211 @@
+! Test declare create with allocatable arrays.
+
+! { dg-do run }
+
+module vars
+  implicit none
+  integer, parameter :: n = 100
+  real*8, allocatable :: b(:)
+ !$acc declare create (b)
+end module vars
+
+program test
+  use vars
+  use openacc
+  implicit none
+  real*8 :: a
+  integer :: i
+
+  interface
+     subroutine sub1
+       !$acc routine gang
+     end subroutine sub1
+
+     subroutine sub2
+     end subroutine sub2
+
+     real*8 function fun1 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun1
+
+     real*8 function fun2 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun2
+  end interface
+
+  if (allocated (b)) stop 1
+
+  ! Test local usage of an allocated declared array.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 2
+  if (acc_is_present (b) .neqv. .true.) stop 3
+
+  a = 2.0
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = i * a
+  end do
+
+  if (.not.acc_is_present (b)) stop 4
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= i*a) stop 5
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside an acc
+  ! routine subroutine.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 6
+  if (acc_is_present (b) .neqv. .true.) stop 7
+
+  !$acc parallel
+  call sub1
+  !$acc end parallel
+
+  if (.not.acc_is_present (b)) stop 8
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= i*2) stop 9
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside a host
+  ! subroutine.
+
+  call sub2
+
+  if (.not.acc_is_present (b)) stop 10
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= 1.0) stop 11
+  end do
+
+  deallocate (b)
+
+  if (allocated (b)) stop 12
+
+  ! Test the usage of an allocated declared array inside an acc
+  ! routine function.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 13
+  if (acc_is_present (b) .neqv. .true.) stop 14
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = fun1 (i)
+  end do
+
+  if (.not.acc_is_present (b)) stop 15
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= i) stop 16
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside a host
+  ! function.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 17
+  if (acc_is_present (b) .neqv. .true.) stop 18
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+
+  !$acc update host(b)
+
+  do i = 1, n
+     b(i) = fun2 (i)
+  end do
+
+  if (.not.acc_is_present (b)) stop 19
+
+  do i = 1, n
+     if (b(i) /= i*i) stop 20
+  end do
+
+  deallocate (b)
+end program test
+
+! Set each element in array 'b' at index i to i*2.
+
+subroutine sub1 ! { dg-warning "region is worker partitioned" }
+  use vars
+  implicit none
+  integer i
+  !$acc routine gang
+
+  !$acc loop
+  do i = 1, n
+     b(i) = i*2
+  end do
+end subroutine sub1
+
+! Allocate array 'b', and set it to all 1.0.
+
+subroutine sub2
+  use vars
+  use openacc
+  implicit none
+  integer i
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 21
+  if (acc_is_present (b) .neqv. .true.) stop 22
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+end subroutine sub2
+
+! Return b(i) * i;
+
+real*8 function fun1 (i)
+  use vars
+  implicit none
+  integer i
+  !$acc routine seq
+
+  fun1 = b(i) * i
+end function fun1
+
+! Return b(i) * i * i;
+
+real*8 function fun2 (i)
+  use vars
+  implicit none
+  integer i
+
+  fun2 = b(i) * i * i
+end function fun2
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90
new file mode 100644
index 0000000..df5ab26
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-2.f90
@@ -0,0 +1,48 @@
+! Test declare create with allocatable scalars.
+
+! { dg-do run }
+
+program main
+  use openacc
+  implicit none
+  integer, parameter :: n = 100
+  integer, allocatable :: a, c
+  integer :: i, b(n)
+  !$acc declare create (c)
+
+  allocate (a)
+
+  a = 50
+
+  !$acc parallel loop firstprivate(a)
+  do i = 1, n;
+     b(i) = a
+  end do
+
+  do i = 1, n
+     if (b(i) /= a) stop 1
+  end do
+
+  allocate (c)
+  a = 100
+
+  if (.not.acc_is_present(c)) stop 2
+
+  !$acc parallel num_gangs(1) present(c)
+  c = a
+  !$acc end parallel
+
+  !$acc update host(c)
+  if (c /= a) stop 3
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = c
+  end do
+
+  do i = 1, n
+     if (b(i) /= a) stop 4
+  end do
+
+  deallocate (a, c)
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90
new file mode 100644
index 0000000..9381b00
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-3.f90
@@ -0,0 +1,218 @@
+! Test declare create with allocatable arrays.
+
+! { dg-do run }
+
+module vars
+  implicit none
+  integer, parameter :: n = 100
+  real*8, allocatable :: a, b(:)
+ !$acc declare create (a, b)
+end module vars
+
+program test
+  use vars
+  use openacc
+  implicit none
+  integer :: i
+
+  interface
+     subroutine sub1
+       !$acc routine gang
+     end subroutine sub1
+
+     subroutine sub2
+     end subroutine sub2
+
+     real*8 function fun1 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun1
+
+     real*8 function fun2 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun2
+  end interface
+
+  if (allocated (a)) stop 1
+  if (allocated (b)) stop 2
+
+  ! Test local usage of an allocated declared array.
+
+  allocate (a)
+
+  if (.not.allocated (a)) stop 3
+  if (acc_is_present (a) .neqv. .true.) stop 4
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 5
+  if (acc_is_present (b) .neqv. .true.) stop 6
+
+  a = 2.0
+  !$acc update device(a)
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = i * a
+  end do
+
+  if (.not.acc_is_present (b)) stop 7
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= i*a) stop 8
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside an acc
+  ! routine subroutine.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 9
+  if (acc_is_present (b) .neqv. .true.) stop 10
+
+  !$acc parallel
+  call sub1
+  !$acc end parallel
+
+  if (.not.acc_is_present (b)) stop 11
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= a+i*2) stop 12
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside a host
+  ! subroutine.
+
+  call sub2
+
+  if (.not.acc_is_present (b)) stop 13
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= 1.0) stop 14
+  end do
+
+  deallocate (b)
+
+  if (allocated (b)) stop 15
+
+  ! Test the usage of an allocated declared array inside an acc
+  ! routine function.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 16
+  if (acc_is_present (b) .neqv. .true.) stop 17
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = fun1 (i)
+  end do
+
+  if (.not.acc_is_present (b)) stop 18
+
+  !$acc update host(b)
+
+  do i = 1, n
+     if (b(i) /= i) stop 19
+  end do
+
+  deallocate (b)
+
+  ! Test the usage of an allocated declared array inside a host
+  ! function.
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 20
+  if (acc_is_present (b) .neqv. .true.) stop 21
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+
+  !$acc update host(b)
+
+  do i = 1, n
+     b(i) = fun2 (i)
+  end do
+
+  if (.not.acc_is_present (b)) stop 22
+
+  do i = 1, n
+     if (b(i) /= i*a) stop 23
+  end do
+
+  deallocate (a)
+  deallocate (b)
+end program test
+
+! Set each element in array 'b' at index i to a+i*2.
+
+subroutine sub1 ! { dg-warning "region is worker partitioned" }
+  use vars
+  implicit none
+  integer i
+  !$acc routine gang
+
+  !$acc loop
+  do i = 1, n
+     b(i) = a+i*2
+  end do
+end subroutine sub1
+
+! Allocate array 'b', and set it to all 1.0.
+
+subroutine sub2
+  use vars
+  use openacc
+  implicit none
+  integer i
+
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 24
+  if (acc_is_present (b) .neqv. .true.) stop 25
+
+  !$acc parallel loop
+  do i = 1, n
+     b(i) = 1.0
+  end do
+end subroutine sub2
+
+! Return b(i) * i;
+
+real*8 function fun1 (i)
+  use vars
+  implicit none
+  integer i
+  !$acc routine seq
+
+  fun1 = b(i) * i
+end function fun1
+
+! Return b(i) * i * a;
+
+real*8 function fun2 (i)
+  use vars
+  implicit none
+  integer i
+
+  fun2 = b(i) * i * a
+end function fun2
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90
new file mode 100644
index 0000000..afbe52f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/declare-allocatable-4.f90
@@ -0,0 +1,66 @@
+! Test declare create with allocatable arrays and scalars.  The unused
+! declared array 'b' caused an ICE in the past.
+
+! { dg-do run }
+
+module vars
+  implicit none
+  integer, parameter :: n = 100
+  real*8, allocatable :: a, b(:)
+ !$acc declare create (a, b)
+end module vars
+
+program test
+  use vars
+  implicit none
+  integer :: i
+
+  interface
+     subroutine sub1
+     end subroutine sub1
+
+     subroutine sub2
+     end subroutine sub2
+
+     real*8 function fun1 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun1
+
+     real*8 function fun2 (ix)
+       integer ix
+       !$acc routine seq
+     end function fun2
+  end interface
+
+  if (allocated (a)) stop 1
+  if (allocated (b)) stop 2
+
+  ! Test the usage of an allocated declared array inside an acc
+  ! routine subroutine.
+
+  allocate (a)
+  allocate (b(n))
+
+  if (.not.allocated (b)) stop 3
+
+  call sub1
+
+  !$acc update self(a)
+  if (a /= 50) stop 4
+
+  deallocate (a)
+  deallocate (b)
+
+end program test
+
+! Set 'a' to 50.
+
+subroutine sub1
+  use vars
+  implicit none
+  integer i
+
+  a = 50
+  !$acc update device(a)
+end subroutine sub1
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-1.f90
new file mode 100644
index 0000000..c4cea11
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-1.f90
@@ -0,0 +1,35 @@
+! { dg-do run }
+
+! Test of attach/detach with "acc data".
+
+program dtype
+  implicit none
+  integer, parameter :: n = 512
+  type mytype
+    integer, allocatable :: a(:)
+  end type mytype
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+
+!$acc data copy(var)
+!$acc data copy(var%a)
+
+!$acc parallel loop
+  do i = 1,n
+    var%a(i) = i
+  end do
+!$acc end parallel loop
+
+!$acc end data
+!$acc end data
+
+  do i = 1,n
+    if (i .ne. var%a(i)) stop 1
+  end do
+
+  deallocate(var%a)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-2.f90
new file mode 100644
index 0000000..3593661
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-2.f90
@@ -0,0 +1,33 @@
+! { dg-do run }
+
+! Test of attach/detach with "acc data", two clauses at once.
+
+program dtype
+  implicit none
+  integer, parameter :: n = 512
+  type mytype
+    integer, allocatable :: a(:)
+  end type mytype
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+
+!$acc data copy(var) copy(var%a)
+
+!$acc parallel loop
+  do i = 1,n
+    var%a(i) = i
+  end do
+!$acc end parallel loop
+
+!$acc end data
+
+  do i = 1,n
+    if (i .ne. var%a(i)) stop 1
+  end do
+
+  deallocate(var%a)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-3.f90
new file mode 100644
index 0000000..667d944
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-3.f90
@@ -0,0 +1,34 @@
+! { dg-do run }
+
+! Test of attach/detach with "acc parallel".
+
+program dtype
+  implicit none
+  integer, parameter :: n = 512
+  type mytype
+    integer, allocatable :: a(:)
+    integer, allocatable :: b(:)
+  end type mytype
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+  allocate(var%b(1:n))
+
+!$acc parallel loop copy(var) copy(var%a(1:n)) copy(var%b(1:n))
+  do i = 1,n
+    var%a(i) = i
+    var%b(i) = i
+  end do
+!$acc end parallel loop
+
+  do i = 1,n
+    if (i .ne. var%a(i)) stop 1
+    if (i .ne. var%b(i)) stop 2
+  end do
+
+  deallocate(var%a)
+  deallocate(var%b)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-4.f90
new file mode 100644
index 0000000..6949e12
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-4.f90
@@ -0,0 +1,49 @@
+! { dg-do run }
+
+! Test of attach/detach with "acc enter/exit data".
+
+program dtype
+  implicit none
+  integer, parameter :: n = 512
+  type mytype
+    integer, allocatable :: a(:)
+    integer, allocatable :: b(:)
+  end type mytype
+  integer, allocatable :: r(:)
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+  allocate(var%b(1:n))
+  allocate(r(1:n))
+
+!$acc enter data copyin(var)
+
+!$acc enter data copyin(var%a, var%b, r)
+
+!$acc parallel loop
+  do i = 1,n
+    var%a(i) = i
+    var%b(i) = i * 2
+    r(i) = i * 3
+  end do
+!$acc end parallel loop
+
+!$acc exit data copyout(var%a)
+!$acc exit data copyout(var%b)
+!$acc exit data copyout(r)
+
+  do i = 1,n
+    if (i .ne. var%a(i)) stop 1
+    if (i * 2 .ne. var%b(i)) stop 2
+    if (i * 3 .ne. r(i)) stop 3
+  end do
+
+!$acc exit data delete(var)
+
+  deallocate(var%a)
+  deallocate(var%b)
+  deallocate(r)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-5.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-5.f90
new file mode 100644
index 0000000..6843cf1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-5.f90
@@ -0,0 +1,57 @@
+! { dg-do run }
+
+! Test of attach/detach, "enter data" inside "data", and subarray.
+
+program dtype
+  implicit none
+  integer, parameter :: n = 512
+  type mytype
+    integer, allocatable :: a(:)
+    integer, allocatable :: b(:)
+  end type mytype
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+  allocate(var%b(1:n))
+
+!$acc data copy(var)
+
+  do i = 1, n
+    var%a(i) = 0
+    var%b(i) = 0
+  end do
+
+!$acc enter data copyin(var%a(5:n - 5), var%b(5:n - 5))
+
+!$acc parallel loop
+  do i = 5,n - 5
+    var%a(i) = i
+    var%b(i) = i * 2
+  end do
+!$acc end parallel loop
+
+!$acc exit data copyout(var%a(5:n - 5), var%b(5:n - 5))
+
+!$acc end data
+
+  do i = 1,4
+    if (var%a(i) .ne. 0) stop 1
+    if (var%b(i) .ne. 0) stop 2
+  end do
+
+  do i = 5,n - 5
+    if (i .ne. var%a(i)) stop 3
+    if (i * 2 .ne. var%b(i)) stop 4
+  end do
+
+  do i = n - 4,n
+    if (var%a(i) .ne. 0) stop 5
+    if (var%b(i) .ne. 0) stop 6
+  end do
+
+  deallocate(var%a)
+  deallocate(var%b)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-6.f90
new file mode 100644
index 0000000..12910d0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-6.f90
@@ -0,0 +1,61 @@
+! { dg-do run }
+
+! Test of attachment counters and finalize.
+
+program dtype
+  implicit none
+  integer, parameter :: n = 512
+  type mytype
+    integer, allocatable :: a(:)
+    integer, allocatable :: b(:)
+  end type mytype
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+  allocate(var%b(1:n))
+
+!$acc data copy(var)
+
+  do i = 1, n
+    var%a(i) = 0
+    var%b(i) = 0
+  end do
+
+!$acc enter data copyin(var%a(5:n - 5), var%b(5:n - 5))
+
+  do i = 1,20
+    !$acc enter data attach(var%a)
+  end do
+
+!$acc parallel loop
+  do i = 5,n - 5
+    var%a(i) = i
+    var%b(i) = i * 2
+  end do
+!$acc end parallel loop
+
+!$acc exit data copyout(var%a(5:n - 5), var%b(5:n - 5)) finalize
+
+!$acc end data
+
+  do i = 1,4
+    if (var%a(i) .ne. 0) stop 1
+    if (var%b(i) .ne. 0) stop 2
+  end do
+
+  do i = 5,n - 5
+    if (i .ne. var%a(i)) stop 3
+    if (i * 2 .ne. var%b(i)) stop 4
+  end do
+
+  do i = n - 4,n
+    if (var%a(i) .ne. 0) stop 5
+    if (var%b(i) .ne. 0) stop 6
+  end do
+
+  deallocate(var%a)
+  deallocate(var%b)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-7.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-7.f90
new file mode 100644
index 0000000..ab44f0a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-7.f90
@@ -0,0 +1,89 @@
+! { dg-do run }
+
+! Test of attach/detach with scalar elements and nested derived types.
+
+program dtype
+  implicit none
+  integer, parameter :: n = 512
+  type subtype
+    integer :: g, h
+    integer, allocatable :: q(:)
+  end type subtype
+  type mytype
+    integer, allocatable :: a(:)
+    integer, allocatable :: c, d
+    integer, allocatable :: b(:)
+    integer :: f
+    type(subtype) :: s
+  end type mytype
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+  allocate(var%b(1:n))
+  allocate(var%c)
+  allocate(var%d)
+  allocate(var%s%q(1:n))
+
+  var%c = 16
+  var%d = 20
+  var%f = 7
+  var%s%g = 21
+  var%s%h = 38
+
+!$acc enter data copyin(var)
+
+  do i = 1, n
+    var%a(i) = 0
+    var%b(i) = 0
+    var%s%q(i) = 0
+  end do
+
+!$acc data copy(var%a(5:n - 5), var%b(5:n - 5), var%c, var%d) &
+!$acc & copy(var%s%q)
+
+!$acc parallel loop default(none) present(var)
+  do i = 5,n - 5
+    var%a(i) = i
+    var%b(i) = i * 2
+    var%s%q(i) = i * 3
+    var%s%g = 100
+    var%s%h = 101
+  end do
+!$acc end parallel loop
+
+!$acc end data
+
+!$acc exit data copyout(var)
+
+  do i = 1,4
+    if (var%a(i) .ne. 0) stop 1
+    if (var%b(i) .ne. 0) stop 2
+    if (var%s%q(i) .ne. 0) stop 3
+  end do
+
+  do i = 5,n - 5
+    if (i .ne. var%a(i)) stop 4
+    if (i * 2 .ne. var%b(i)) stop 5
+    if (i * 3 .ne. var%s%q(i)) stop 6
+  end do
+
+  do i = n - 4,n
+    if (var%a(i) .ne. 0) stop 7
+    if (var%b(i) .ne. 0) stop 8
+    if (var%s%q(i) .ne. 0) stop 9
+  end do
+
+  if (var%c .ne. 16) stop 10
+  if (var%d .ne. 20) stop 11
+  if (var%s%g .ne. 100 .or. var%s%h .ne. 101) stop 12
+  if (var%f .ne. 7) stop 13
+
+  deallocate(var%a)
+  deallocate(var%b)
+  deallocate(var%c)
+  deallocate(var%d)
+  deallocate(var%s%q)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-8.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-8.f90
new file mode 100644
index 0000000..d142763a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deep-copy-8.f90
@@ -0,0 +1,41 @@
+! { dg-do run }
+
+! Test of explicit attach/detach clauses and attachment counters. There are no
+! acc_attach/acc_detach API routines in Fortran.
+
+program dtype
+  use openacc
+  implicit none
+  integer, parameter :: n = 512
+  type mytype
+    integer, allocatable :: a(:)
+  end type mytype
+  integer i
+
+  type(mytype) :: var
+
+  allocate(var%a(1:n))
+
+  call acc_copyin(var)
+  call acc_copyin(var%a)
+
+  !$acc enter data attach(var%a)
+
+!$acc parallel loop attach(var%a)
+  do i = 1,n
+    var%a(i) = i
+  end do
+!$acc end parallel loop
+
+  !$acc exit data detach(var%a)
+
+  call acc_copyout(var%a)
+  call acc_copyout(var)
+
+  do i = 1,n
+    if (i .ne. var%a(i)) stop 1
+  end do
+
+  deallocate(var%a)
+
+end program dtype
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/derived-type-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/derived-type-1.f90
new file mode 100644
index 0000000..eb7812d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/derived-type-1.f90
@@ -0,0 +1,28 @@
+! Test derived types with subarrays
+
+! { dg-do run }
+
+  implicit none
+  type dtype
+     integer :: a, b, c
+  end type dtype
+  integer, parameter :: n = 100
+  integer i
+  type (dtype), dimension(n) :: d
+
+  !$acc data copy(d(1:n))
+  !$acc parallel loop
+  do i = 1, n
+     d(i)%a = i
+     d(i)%b = i-1
+     d(i)%c = i+1
+  end do
+  !$acc end data
+
+  do i = 1, n
+     if (d(i)%a /= i) stop 1
+     if (d(i)%b /= i-1) stop 2
+     if (d(i)%c /= i+1) stop 3
+  end do
+end program
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/derivedtype-1.f95 b/libgomp/testsuite/libgomp.oacc-fortran/derivedtype-1.f95
new file mode 100644
index 0000000..75ce48d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/derivedtype-1.f95
@@ -0,0 +1,30 @@
+! { dg-do run }
+
+program main
+  implicit none
+
+  type mytype
+    integer :: a, b, c
+  end type mytype
+
+  type(mytype) :: myvar
+  integer :: i
+
+  myvar%a = 0
+  myvar%b = 0
+  myvar%c = 0
+
+!$acc enter data copyin(myvar)
+
+!$acc parallel present(myvar)
+  myvar%a = 1
+  myvar%b = 2
+  myvar%c = 3
+!$acc end parallel
+
+!$acc exit data copyout(myvar)
+
+  if (myvar%a .ne. 1) stop 1
+  if (myvar%b .ne. 2) stop 2
+  if (myvar%c .ne. 3) stop 3
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/derivedtype-2.f95 b/libgomp/testsuite/libgomp.oacc-fortran/derivedtype-2.f95
new file mode 100644
index 0000000..3088b83
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/derivedtype-2.f95
@@ -0,0 +1,41 @@
+! { dg-do run }
+
+program main
+  implicit none
+
+  type tnest
+    integer :: ia, ib, ic
+  end type tnest
+
+  type mytype
+    type(tnest) :: nest
+    integer :: a, b, c
+  end type mytype
+
+  type(mytype) :: myvar
+  integer :: i
+
+  myvar%a = 0
+  myvar%b = 0
+  myvar%c = 0
+  myvar%nest%ia = 0
+  myvar%nest%ib = 0
+  myvar%nest%ic = 0
+
+!$acc enter data copyin(myvar%nest)
+
+!$acc parallel present(myvar%nest)
+  myvar%nest%ia = 4
+  myvar%nest%ib = 5
+  myvar%nest%ic = 6
+!$acc end parallel
+
+!$acc exit data copyout(myvar%nest)
+
+  if (myvar%a .ne. 0) stop 1
+  if (myvar%b .ne. 0) stop 2
+  if (myvar%c .ne. 0) stop 3
+  if (myvar%nest%ia .ne. 4) stop 4
+  if (myvar%nest%ib .ne. 5) stop 5
+  if (myvar%nest%ic .ne. 6) stop 6
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90
new file mode 100644
index 0000000..276a172
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90
@@ -0,0 +1,197 @@
+! { dg-do run }
+
+! Test the deviceptr clause with various directives
+! and in combination with other directives where
+! the deviceptr variable is implied.
+
+subroutine subr1 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc data deviceptr (a)
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = i * 2
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine
+
+subroutine subr2 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  !$acc declare deviceptr (a)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = i * 4
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+end subroutine
+
+subroutine subr3 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  !$acc declare deviceptr (a)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc kernels copy (b)
+    do i = 1, N
+      a(i) = i * 8
+      b(i) = a(i)
+    end do
+  !$acc end kernels
+
+end subroutine
+
+subroutine subr4 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc parallel deviceptr (a) copy (b)
+    do i = 1, N
+      a(i) = i * 16
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+end subroutine
+
+subroutine subr5 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc kernels deviceptr (a) copy (b)
+    do i = 1, N
+      a(i) = i * 32
+      b(i) = a(i)
+    end do
+  !$acc end kernels
+
+end subroutine
+
+subroutine subr6 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc parallel deviceptr (a) copy (b)
+    do i = 1, N
+      b(i) = i
+    end do
+  !$acc end parallel
+
+end subroutine
+
+subroutine subr7 (a, b)
+  implicit none
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: i = 0
+
+  !$acc data deviceptr (a)
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = i * 2
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  !$acc parallel copy (b)
+    do i = 1, N
+      a(i) = b(i) * 2
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine
+
+program main
+  use iso_c_binding, only: c_ptr, c_f_pointer
+  implicit none
+  type (c_ptr) :: cp
+  integer, parameter :: N = 8
+  integer, pointer :: fp(:)
+  integer :: i = 0
+  integer :: b(N)
+
+  interface
+    function acc_malloc (s) bind (C)
+      use iso_c_binding, only: c_ptr, c_size_t
+      integer (c_size_t), value :: s
+      type (c_ptr) :: acc_malloc
+    end function
+  end interface
+
+  cp = acc_malloc (N * sizeof (fp(N)))
+  call c_f_pointer (cp, fp, [N])
+
+  call subr1 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 2) call abort
+  end do
+
+  call subr2 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 4) call abort
+  end do
+
+  call subr3 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 8) call abort
+  end do
+
+  call subr4 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 16) call abort
+  end do
+
+  call subr5 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 32) call abort
+  end do
+
+  call subr6 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i) call abort
+  end do
+
+  call subr7 (fp, b)
+
+  do i = 1, N
+    if (b(i) .ne. i * 4) call abort
+  end do
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f
index 4965e67..95810a6 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f
@@ -15,6 +15,6 @@
 ! { dg-output "ERROR STOP (\n|\r\n|\r)+" }
 ! PR85463.  The "minimal" libgfortran implementation used with nvptx
 ! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
 ! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
 ! { dg-shouldfail "" }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f
index 7103fdb..ce59bbd 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f
@@ -15,6 +15,6 @@
 ! { dg-output "ERROR STOP 35(\n|\r\n|\r)+" }
 ! PR85463.  The "minimal" libgfortran implementation used with nvptx
 ! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
 ! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
 ! { dg-shouldfail "" }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f
index 9c217f1..9b606c8 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f
@@ -15,6 +15,6 @@
 ! { dg-output "ERROR STOP SiGN(\n|\r\n|\r)+" }
 ! PR85463.  The "minimal" libgfortran implementation used with nvptx
 ! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
 ! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
 ! { dg-shouldfail "" }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/firstprivate-int.f90 b/libgomp/testsuite/libgomp.oacc-fortran/firstprivate-int.f90
new file mode 100644
index 0000000..3b148ce
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/firstprivate-int.f90
@@ -0,0 +1,205 @@
+! Verify the GOMP_MAP_FIRSTPRIVATE_INT optimziation on various types.
+
+! { dg-do run }
+
+program test
+  implicit none
+
+  integer (kind=1)  :: i1i, i1o
+  integer (kind=2)  :: i2i, i2o
+  integer (kind=4)  :: i4i, i4o
+  integer (kind=8)  :: i8i, i8o
+  integer (kind=16) :: i16i, i16o
+
+  logical (kind=1)  :: l1i, l1o
+  logical (kind=2)  :: l2i, l2o
+  logical (kind=4)  :: l4i, l4o
+  logical (kind=8)  :: l8i, l8o
+  logical (kind=16) :: l16i, l16o
+
+  real (kind=4)  :: r4i, r4o
+  real (kind=8)  :: r8i, r8o
+
+  complex (kind=4)  :: c4i, c4o
+  complex (kind=8)  :: c8i, c8o
+
+  character (kind=1) :: ch1i, ch1o
+  character (kind=4) :: ch4i, ch4o
+
+  i1i = 1
+  i2i = 2
+  i4i = 3
+  i8i = 4
+  i16i = 5
+
+  l1i = .true.
+  l2i = .false.
+  l4i = .true.
+  l8i = .true.
+  l16i = .false.
+
+  r4i = .5
+  r8i = .25
+
+  c4i = (2, -2)
+  c8i = (4, -4)
+
+  ch1i = "a"
+  ch4i = "b"
+
+  !$acc parallel firstprivate(i1i, i2i, i4i, i8i, i16i) &
+  !$acc copyout(i1o, i2o, i4o, i8o, i16o) &
+  !$acc firstprivate(l1i, l2i, l4i, l8i, l16i) &
+  !$acc copyout(l1o, l2o, l4o, l8o, l16o) &
+  !$acc firstprivate(r4i, r8i) copyout(r4o, r8o) &
+  !$acc firstprivate(c4i, c8i) copyout(c4o, c8o) &
+  !$acc firstprivate(ch1i, ch4i) &
+  !$acc copyout(ch1o, ch4o)
+  i1o = i1i
+  i2o = i2i
+  i4o = i4i
+  i8o = i8i
+  i16o = i16i
+
+  l1o = l1i
+  l2o = l2i
+  l4o = l4i
+  l8o = l8i
+  l16o = l16i
+
+  r4o = r4i
+  r8o = r8i
+
+  c4o = c4i
+  c8o = c8i
+
+  ch1o = ch1i
+  ch4o = ch4i
+  !$acc end parallel
+
+  if (i1i /= i1o) stop 1
+  if (i2i /= i2o) stop 2
+  if (i4i /= i4o) stop 3
+  if (i8i /= i8o) stop 4
+  if (i16i /= i16o) stop 5
+
+  if (l1i .neqv. l1o) stop 6
+  if (l2i .neqv. l2o) stop 7
+  if (l4i .neqv. l4o) stop 8
+  if (l8i .neqv. l8o) stop 9
+  if (l16i .neqv. l16o) stop 10
+
+  if (r4i /= r4o) stop 11
+  if (r8i /= r8o) stop 12
+
+  if (c4i /= c4o) stop 13
+  if (c8i /= c8o) stop 14
+
+  if (ch1i /= ch1o) stop 15
+  if (ch4i /= ch4o) stop 16
+
+  call subtest(i1i, i2i, i4i, i8i, i16i, i1o, i2o, i4o, i8o, i16o, &
+               l1i, l2i, l4i, l8i, l16i, l1o, l2o, l4o, l8o, l16o, &
+               r4i, r8i, r4o, r8o, c4i, c8i, c4o, c8o, &
+               ch1i, ch4i, ch1o, ch4o)
+end program test
+
+subroutine subtest(i1i, i2i, i4i, i8i, i16i, i1o, i2o, i4o, i8o, i16o, &
+                   l1i, l2i, l4i, l8i, l16i, l1o, l2o, l4o, l8o, l16o, &
+                   r4i, r8i, r4o, r8o, c4i, c8i, c4o, c8o, &
+                   ch1i, ch4i, ch1o, ch4o)
+  implicit none
+
+  integer (kind=1)  :: i1i, i1o
+  integer (kind=2)  :: i2i, i2o
+  integer (kind=4)  :: i4i, i4o
+  integer (kind=8)  :: i8i, i8o
+  integer (kind=16) :: i16i, i16o
+
+  logical (kind=1)  :: l1i, l1o
+  logical (kind=2)  :: l2i, l2o
+  logical (kind=4)  :: l4i, l4o
+  logical (kind=8)  :: l8i, l8o
+  logical (kind=16) :: l16i, l16o
+
+  real (kind=4)  :: r4i, r4o
+  real (kind=8)  :: r8i, r8o
+
+  complex (kind=4)  :: c4i, c4o
+  complex (kind=8)  :: c8i, c8o
+
+  character (kind=1) :: ch1i, ch1o
+  character (kind=4) :: ch4i, ch4o
+
+  i1i = -i1i
+  i2i = -i2i
+  i4i = -i4i
+  i8i = -i8i
+  i16i = -i16i
+
+  l1i = .not. l1i
+  l2i = .not. l2i
+  l4i = .not. l4i
+  l8i = .not. l8i
+  l16i = .not. l16i
+
+  r4i = -r4i
+  r8i = -r8i
+
+  c4i = -c4i
+  c8i = -c8i
+
+  ch1i = "z"
+  ch4i = "y"
+
+  !$acc parallel firstprivate(i1i, i2i, i4i, i8i, i16i) &
+  !$acc copyout(i1o, i2o, i4o, i8o, i16o) &
+  !$acc firstprivate(l1i, l2i, l4i, l8i, l16i) &
+  !$acc copyout(l1o, l2o, l4o, l8o, l16o) &
+  !$acc firstprivate(r4i, r8i) copyout(r4o, r8o) &
+  !$acc firstprivate(c4i, c8i) copyout(c4o, c8o) &
+  !$acc firstprivate(ch1i, ch4i) &
+  !$acc copyout(ch1o, ch4o)
+  i1o = i1i
+  i2o = i2i
+  i4o = i4i
+  i8o = i8i
+  i16o = i16i
+
+  l1o = l1i
+  l2o = l2i
+  l4o = l4i
+  l8o = l8i
+  l16o = l16i
+
+  r4o = r4i
+  r8o = r8i
+
+  c4o = c4i
+  c8o = c8i
+
+  ch1o = ch1i
+  ch4o = ch4i
+  !$acc end parallel
+
+  if (i1i /= i1o) stop 17
+  if (i2i /= i2o) stop 18
+  if (i4i /= i4o) stop 19
+  if (i8i /= i8o) stop 20
+  if (i16i /= i16o) stop 21
+
+  if (l1i .neqv. l1o) stop 22
+  if (l2i .neqv. l2o) stop 23
+  if (l4i .neqv. l4o) stop 24
+  if (l8i .neqv. l8o) stop 25
+  if (l16i .neqv. l16o) stop 26
+
+  if (r4i /= r4o) stop 27
+  if (r8i /= r8o) stop 28
+
+  if (c4i /= c4o) stop 29
+  if (c8i /= c8o) stop 30
+
+  if (ch1i /= ch1o) stop 31
+  if (ch4i /= ch4o) stop 32
+end subroutine subtest
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
index af25a22..e36fb15 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
+++ b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
@@ -9,7 +9,7 @@
 
 set shlib_ext [get_shlib_extension]
 set lang_library_path	"../libgfortran/.libs"
-set lang_link_flags	"-lgfortran -foffload=-lgfortran"
+set lang_link_flags	"-lgfortran -foffload=-lgfortran -lquadmath"
 if [info exists lang_include_flags] then {
     unset lang_include_flags
 }
@@ -66,11 +66,10 @@
     set_ld_library_path_env_vars
 
     # Test with all available offload targets, and with offloading disabled.
-    foreach offload_target [concat [split $offload_targets ","] "disable"] {
-	global openacc_device_type
+    global offload_target
+    foreach offload_target [concat [split $offload_targets ":"] "disable"] {
 	set openacc_device_type [offload_target_to_openacc_device_type $offload_target]
-	set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=1"
-
+	set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=\"$offload_target\""
 	switch $openacc_device_type {
 	    "" {
 		unsupported "$subdir $offload_target offloading"
@@ -78,6 +77,9 @@
 	    }
 	    host {
 		set acc_mem_shared 1
+
+		# Special case: pass the empty string instead of "disable".
+		set tagopt "-DACC_DEVICE_TYPE_$openacc_device_type=\"\""
 	    }
 	    nvidia {
 		if { ![check_effective_target_openacc_nvidia_accel_present] } {
@@ -88,6 +90,9 @@
 
 		set acc_mem_shared 0
 	    }
+	    gcn {
+		set acc_mem_shared 0
+	    }
 	    default {
 		error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
 	    }
@@ -98,8 +103,6 @@
 	# handling in test cases, by default only build for the offload target
 	# that we're actually going to test.
 	set tagopt "$tagopt -foffload=$offload_target"
-	# Force usage of the corresponding OpenACC device type.
-	setenv ACC_DEVICE_TYPE $openacc_device_type
 
 	# For Fortran we're doing torture testing, as Fortran has far more tests
 	# with arrays etc. that testing just -O0 or -O2 is insufficient, that is
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90
new file mode 100644
index 0000000..dafc70c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90
@@ -0,0 +1,25 @@
+! Test for "oacc gangprivate" attribute on gang-private variables
+
+! { dg-do run }
+! { dg-additional-options "-fdump-tree-oaccdevlow-details" }
+! { dg-final { scan-tree-dump-times "Decl UID \[0-9\]+ has gang partitioning:  integer\\(kind=4\\) w;" 1 "oaccdevlow" } } */
+
+program main
+  integer :: w, arr(0:31)
+
+  !$acc parallel num_gangs(32) num_workers(32) copyout(arr) ! { dg-warning "region is worker partitioned" }
+    !$acc loop gang private(w)
+    do j = 0, 31
+      w = 0
+      !$acc loop seq
+      do i = 0, 31
+        !$acc atomic update
+        w = w + 1
+        !$acc end atomic
+      end do
+      arr(j) = w
+    end do
+  !$acc end parallel
+
+  if (any (arr .ne. 32)) stop 1
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-2.f90
new file mode 100644
index 0000000..90e06be
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-2.f90
@@ -0,0 +1,25 @@
+! Test for worker-private variables
+
+! { dg-do run }
+! { dg-additional-options "-fdump-tree-oaccdevlow-details" }
+! { dg-final { scan-tree-dump-times "Decl UID \[0-9\]+ has worker partitioning:  integer\\(kind=4\\) w;" 1 "oaccdevlow" } } */
+
+program main
+  integer :: w, arr(0:31)
+
+  !$acc parallel num_gangs(32) num_workers(32) copyout(arr)
+    !$acc loop gang worker private(w)
+    do j = 0, 31
+      w = 0
+      !$acc loop seq
+      do i = 0, 31
+        !$acc atomic update
+        w = w + 1
+        !$acc end atomic
+      end do
+      arr(j) = w
+    end do
+  !$acc end parallel
+
+  if (any (arr .ne. 32)) stop 1
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/implicit_copy.f90 b/libgomp/testsuite/libgomp.oacc-fortran/implicit_copy.f90
new file mode 100644
index 0000000..7a99f29
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/implicit_copy.f90
@@ -0,0 +1,30 @@
+! { dg-do run }
+
+integer function test()
+  implicit none
+  integer, parameter :: n = 10
+  real(8), dimension(n) :: a, b, c
+  integer i
+
+  do i = 1, n
+     a(i) = i
+     b(i) = 1
+  end do
+
+  !$acc data copyin(a(1:n), b(1:n))
+  !$acc parallel loop
+  do i = 1, n
+     c(i) = a(i) * b(i)
+  end do
+  !$acc end data
+
+  do i = 1, n
+     if (c(i) /= a(i) * b(i)) call abort
+  end do
+end function test
+
+program main
+  implicit none
+  integer i, test
+  i = test()
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95
index 4b69e81..deb9070 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95
index 4008743..c718584 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-2.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95
index 11ae17c..ade470a 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit-2.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95
index 4fdb862..8eb5a83 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-enter-exit.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95
index 4bee0e1..3352b76 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data-update.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data.f95
index 307e433..6697eaf 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-data.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop.f95
index 0090f43..8ae247f 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-loop.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95
index fe1088c..18daccc 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-parallel-loop-data-enter-exit.f95
@@ -1,4 +1,6 @@
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program main
   implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90
index c7a52ed..fe6986d 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90
@@ -1,6 +1,8 @@
 ! Test a simple acc loop reduction inside a kernels region. 
 
 ! { dg-do run }
+! { dg-additional-options "-fopenacc-kernels=parloops" } as this is
+! specifically testing "parloops" handling.
 
 program reduction
   integer, parameter     :: n = 20
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90 b/libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90
index da944c3..ea35d71 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90
@@ -19,11 +19,10 @@
         end do
       !$acc end parallel
     end do
-  !$acc end data
 
   call acc_wait_all_async (nprocs + 1)
-
   call acc_wait (nprocs + 1)
+  !$acc end data
 
   if (acc_async_test (1) .neqv. .TRUE.) call abort
   if (acc_async_test (2) .neqv. .TRUE.) call abort
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
index fa76f65..94b80d0 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
@@ -27,6 +27,9 @@
 
   if (acc_is_present (h) .neqv. .TRUE.) call abort
 
+  ! We must wait for the update to be done.
+  call acc_wait (async)
+
   h(:) = 0
 
   call acc_copyout_async (h, sizeof (h), async)
@@ -45,6 +48,8 @@
   
   if (acc_is_present (h) .neqv. .TRUE.) call abort
 
+  call acc_wait (async)
+
   do i = 1, N
     if (h(i) /= i + i) call abort
   end do 
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90 b/libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90
index 011f9cf..5e01099 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/lib-16.f90
@@ -27,6 +27,9 @@
 
   if (acc_is_present (h) .neqv. .TRUE.) call abort
 
+  ! We must wait for the update to be done.
+  call acc_wait (async)
+
   h(:) = 0
 
   call acc_copyout_async (h, sizeof (h), async)
@@ -45,6 +48,8 @@
   
   if (acc_is_present (h) .neqv. .TRUE.) call abort
 
+  call acc_wait (async)
+
   do i = 1, N
     if (h(i) /= i + i) call abort
   end do 
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/multidim-slice.f95 b/libgomp/testsuite/libgomp.oacc-fortran/multidim-slice.f95
new file mode 100644
index 0000000..a9b40ee
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/multidim-slice.f95
@@ -0,0 +1,50 @@
+! { dg-do run }
+
+program main
+  implicit none
+  real, allocatable :: myarr(:,:,:,:,:)
+  integer i, j, k, l, m
+
+  allocate(myarr(1:10,1:10,1:10,1:10,1:10))
+
+  do i=1,10
+    do j=1,10
+      do k=1,10
+        do l=1,10
+          do m=1,10
+            myarr(m,l,k,j,i) = i+j+k+l+m
+          end do
+        end do
+      end do
+    end do
+  end do
+
+  do i=1,10
+    !$acc data copy(myarr(:,:,:,:,i))
+    !$acc parallel loop collapse(4) present(myarr(:,:,:,:,i))
+    do j=1,10
+      do k=1,10
+        do l=1,10
+          do m=1,10
+            myarr(m,l,k,j,i) = myarr(m,l,k,j,i) + 1
+          end do
+        end do
+      end do
+    end do
+    !$acc end parallel loop
+    !$acc end data
+  end do
+
+  do i=1,10
+    do j=1,10
+      do k=1,10
+        do l=1,10
+          do m=1,10
+            if (myarr(m,l,k,j,i) .ne. i+j+k+l+m+1) stop 1
+          end do
+        end do
+      end do
+    end do
+  end do
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/nocreate-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/nocreate-1.f90
new file mode 100644
index 0000000..f048355
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/nocreate-1.f90
@@ -0,0 +1,29 @@
+! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+
+! Test no_create clause with data construct when data is present/not present.
+
+program nocreate
+  use openacc
+  implicit none
+  integer, parameter :: n = 512
+  integer :: myarr(n)
+  integer i
+
+  do i = 1, n
+    myarr(i) = 0
+  end do
+
+  !$acc data no_create (myarr)
+  if (acc_is_present (myarr)) stop 1
+  !$acc end data
+
+  !$acc enter data copyin (myarr)
+  !$acc data no_create (myarr)
+  if (acc_is_present (myarr) .eqv. .false.) stop 2
+  !$acc end data
+  !$acc exit data copyout (myarr)
+
+  do i = 1, n
+    if (myarr(i) .ne. 0) stop 3
+  end do
+end program nocreate
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/nocreate-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/nocreate-2.f90
new file mode 100644
index 0000000..34444ec
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/nocreate-2.f90
@@ -0,0 +1,61 @@
+! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+
+! Test no_create clause with data/parallel constructs.
+
+program nocreate
+  use openacc
+  implicit none
+  integer, parameter :: n = 512
+  integer :: myarr(n)
+  integer i
+
+  do i = 1, n
+    myarr(i) = 0
+  end do
+
+  call do_on_target(myarr, n)
+
+  do i = 1, n
+    if (myarr(i) .ne. i) stop 1
+  end do
+
+  do i = 1, n
+    myarr(i) = 0
+  end do
+
+  !$acc enter data copyin(myarr)
+  call do_on_target(myarr, n)
+  !$acc exit data copyout(myarr)
+
+  do i = 1, n
+    if (myarr(i) .ne. i * 2) stop 2
+  end do
+end program nocreate
+
+subroutine do_on_target (arr, n)
+  use openacc
+  implicit none
+  integer :: n, arr(n)
+  integer :: i
+
+!$acc data no_create (arr)
+
+if (acc_is_present(arr)) then
+  ! The no_create clause is meant for partially shared-memory machines.  This
+  ! test is written to work on non-shared-memory machines, though this is not
+  ! necessarily a useful way to use the no_create clause in practice.
+
+  !$acc parallel loop no_create (arr)
+  do i = 1, n
+    arr(i) = i * 2
+  end do
+  !$acc end parallel loop
+else
+  do i = 1, n
+    arr(i) = i
+  end do
+end if
+
+!$acc end data
+
+end subroutine do_on_target
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f
index 537212e..36e9844 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-1.f
@@ -4,6 +4,6 @@
       implicit none
       include "openacc_lib.h"
 
-      if (openacc_version .ne. 201306) STOP 1
+      if (openacc_version .ne. 201711) STOP 1
 
       end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90
index 54f301b..e815bc1 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/openacc_version-2.f90
@@ -4,6 +4,6 @@
   use openacc
   implicit none
 
-  if (openacc_version .ne. 201306) STOP 1
+  if (openacc_version .ne. 201711) STOP 1
 
 end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-cache.f95 b/libgomp/testsuite/libgomp.oacc-fortran/optional-cache.f95
new file mode 100644
index 0000000..d828497
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-cache.f95
@@ -0,0 +1,23 @@
+! Test that the cache directives work with optional arguments.  The effect
+! of giving a non-present argument to the cache directive is not tested as
+! it is undefined.  The test is based on gfortran.dg/goacc/cache-1.f95.
+
+! { dg-additional-options "-std=f2008" }
+
+program cache_test
+  implicit none
+  integer :: d(10), e(5,13)
+
+  call do_test(d, e)
+contains
+  subroutine do_test(d, e)
+    integer, optional :: d(10), e(5,13)
+    integer :: i
+    do concurrent (i=1:5)
+      !$acc cache (d(1:3))
+      !$acc cache (d(i:i+2))
+      !$acc cache (e(1:3,2:4))
+      !$acc cache (e(i:i+2,i+1:i+3))
+    enddo
+  end
+end
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyin-by-value.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyin-by-value.f90
new file mode 100644
index 0000000..5cadeed
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyin-by-value.f90
@@ -0,0 +1,29 @@
+! Test OpenACC data regions with optional arguments passed by value.
+
+! { dg-do run }
+
+program test
+  implicit none
+
+  integer :: res
+
+  if (foo(27) .ne. 27) stop 1
+  if (foo(16, 18) .ne. 288) stop 1
+contains
+  function foo(x, y)
+    integer, value :: x
+    integer, value, optional :: y
+    integer :: res, foo
+
+    !$acc data copyin(x, y) copyout(res)
+    !$acc parallel
+    res = x
+    if (present(y)) then
+      res = res * y
+    end if
+    !$acc end parallel
+    !$acc end data
+
+    foo = res
+  end function foo
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyin.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyin.f90
new file mode 100644
index 0000000..a30908d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyin.f90
@@ -0,0 +1,140 @@
+! Test OpenACC data regions with a copy-in of optional arguments.
+
+! { dg-do run }
+
+program test
+  implicit none
+
+  integer, parameter :: n = 64
+  integer :: i
+  integer :: a_int, b_int, c_int, res_int
+  integer :: a_arr(n), b_arr(n), c_arr(n), res_arr(n)
+  integer, allocatable :: a_alloc(:), b_alloc(:), c_alloc(:), res_alloc(:)
+
+  a_int = 7
+  b_int = 3
+  c_int = 11
+
+  call test_int(res_int, a_int)
+  if (res_int .ne. a_int) stop 1
+
+  call test_int(res_int, a_int, b_int)
+  if (res_int .ne. a_int * b_int) stop 2
+
+  call test_int(res_int, a_int, b_int, c_int)
+  if (res_int .ne. a_int * b_int + c_int) stop 3
+
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+    c_arr(i) = i * 3
+  end do
+
+  call test_array(res_arr, a_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i)) stop 4
+  end do
+
+  call test_array(res_arr, a_arr, b_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i)) stop 5
+  end do
+
+  call test_array(res_arr, a_arr, b_arr, c_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i) + c_arr(i)) stop 6
+  end do
+
+  allocate (a_alloc(n))
+  allocate (b_alloc(n))
+  allocate (c_alloc(n))
+  allocate (res_alloc(n))
+
+  do i = 1, n
+    a_alloc(i) = i
+    b_alloc(i) = n - i + 1
+    c_alloc(i) = i * 3
+  end do
+
+  call test_allocatable(res_alloc, a_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i)) stop 7
+  end do
+
+  call test_allocatable(res_alloc, a_alloc, b_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i) * b_alloc(i)) stop 8
+  end do
+
+  call test_allocatable(res_alloc, a_alloc, b_alloc, c_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i) * b_alloc(i) + c_alloc(i)) stop 9
+  end do
+
+  deallocate (a_alloc)
+  deallocate (b_alloc)
+  deallocate (c_alloc)
+  deallocate (res_alloc)
+contains
+  subroutine test_int(res, a, b, c)
+    integer :: res
+    integer :: a
+    integer, optional :: b, c
+
+    !$acc data copyin(a, b, c) copyout(res)
+    !$acc parallel
+    res = a
+
+    if (present(b)) res = res * b
+
+    if (present(c)) res = res + c
+    !$acc end parallel
+    !$acc end data
+  end subroutine test_int
+
+  subroutine test_array(res, a, b, c)
+    integer :: res(n)
+    integer :: a(n)
+    integer, optional :: b(n), c(n)
+
+    !$acc data copyin(a, b, c) copyout(res)
+    !$acc parallel loop
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(b)) res(i) = res(i) * b(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(c)) res(i) = res(i) + c(i)
+    end do
+    !$acc end data
+  end subroutine test_array
+
+  subroutine test_allocatable(res, a, b, c)
+    integer, allocatable :: res(:)
+    integer, allocatable  :: a(:)
+    integer, allocatable, optional :: b(:), c(:)
+
+    !$acc data copyin(a, b, c) copyout(res)
+    !$acc parallel loop
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(b)) res(i) = res(i) * b(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(c)) res(i) = res(i) + c(i)
+    end do
+    !$acc end data
+  end subroutine test_allocatable
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyout.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyout.f90
new file mode 100644
index 0000000..feaa31f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-copyout.f90
@@ -0,0 +1,96 @@
+! Test OpenACC data regions with a copy-out of optional arguments.
+
+! { dg-do run }
+
+program test
+  implicit none
+
+  integer, parameter :: n = 64
+  integer :: i
+  integer :: a_int, b_int, res_int
+  integer :: a_arr(n), b_arr(n), res_arr(n)
+  integer, allocatable :: a_alloc(:), b_alloc(:), res_alloc(:)
+
+  res_int = 0
+
+  call test_int(a_int, b_int)
+  if (res_int .ne. 0) stop 1
+
+  call test_int(a_int, b_int, res_int)
+  if (res_int .ne. a_int * b_int) stop 2
+
+  res_arr(:) = 0
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+  end do
+
+  call test_array(a_arr, b_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. 0) stop 3
+  end do
+
+  call test_array(a_arr, b_arr, res_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i)) stop 4
+  end do
+
+  allocate (a_alloc(n))
+  allocate (b_alloc(n))
+  allocate (res_alloc(n))
+
+  res_alloc(:) = 0
+  do i = 1, n
+    a_alloc(i) = i
+    b_alloc(i) = n - i + 1
+  end do
+
+  call test_allocatable(a_alloc, b_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. 0) stop 5
+  end do
+
+  call test_allocatable(a_alloc, b_alloc, res_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i) * b_alloc(i)) stop 6
+  end do
+
+  deallocate (a_alloc)
+  deallocate (b_alloc)
+  deallocate (res_alloc)
+contains
+  subroutine test_int(a, b, res)
+    integer :: a, b
+    integer, optional :: res
+
+    !$acc data copyin(a, b) copyout(res)
+    !$acc parallel
+    if (present(res)) res = a * b
+    !$acc end parallel
+    !$acc end data
+  end subroutine test_int
+
+  subroutine test_array(a, b, res)
+    integer :: a(n), b(n)
+    integer, optional :: res(n)
+
+    !$acc data copyin(a, b) copyout(res)
+    !$acc parallel loop
+    do i = 1, n
+      if (present(res)) res(i) = a(i) * b(i)
+    end do
+    !$acc end data
+  end subroutine test_array
+
+  subroutine test_allocatable(a, b, res)
+    integer, allocatable :: a(:), b(:)
+    integer, allocatable, optional :: res(:)
+
+    !$acc data copyin(a, b) copyout(res)
+    !$acc parallel loop
+    do i = 1, n
+      if (present(res)) res(i) = a(i) * b(i)
+    end do
+    !$acc end data
+  end subroutine test_allocatable
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-data-enter-exit.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-enter-exit.f90
new file mode 100644
index 0000000..9ed0f75
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-data-enter-exit.f90
@@ -0,0 +1,91 @@
+! Test OpenACC unstructured enter data/exit data regions with optional
+! arguments.
+
+! { dg-do run }
+
+program test
+  implicit none
+
+  integer, parameter :: n = 64
+  integer :: a(n), b(n), c(n), res(n)
+  integer :: x, y, z, r, i
+
+  do i = 1, n
+    a(i) = i
+    b(i) = n - i + 1
+    c(i) = i * 3
+  end do
+
+  res = test_array(a)
+  do i = 1, n
+    if (res(i) .ne. a(i)) stop 1
+  end do
+
+  res = test_array(a, b)
+  do i = 1, n
+    if (res(i) .ne. a(i) * b(i)) stop 2
+  end do
+
+  res = test_array(a, b, c)
+  do i = 1, n
+    if (res(i) .ne. a(i) * b(i) + c(i)) stop 3
+  end do
+
+  x = 7
+  y = 3
+  z = 11
+
+  r = test_int(x)
+  if (r .ne. x) stop 4
+
+  r = test_int(x, y)
+  if (r .ne. x * y) stop 5
+
+  r = test_int(x, y, z)
+  if (r .ne. x * y + z) stop 6
+contains
+  function test_array(a, b, c)
+    integer :: a(n)
+    integer, optional :: b(n), c(n)
+    integer :: test_array(n), res(n)
+
+    !$acc enter data copyin(a, b, c) create(res)
+    !$acc parallel loop
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(b)) then
+        res(i) = res(i) * b(i)
+      end if
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(c)) then
+        res(i) = res(i) + c(i)
+      end if
+    end do
+    !$acc exit data copyout(res) delete(a, b, c)
+
+    test_array = res
+  end function test_array
+
+  function test_int(a, b, c)
+    integer :: a
+    integer, optional :: b, c
+    integer :: test_int, res
+
+    !$acc enter data copyin(a, b, c) create(res)
+    !$acc parallel present(a, b, c, res)
+    res = a
+    if (present(b)) res = res * b
+    if (present(c)) res = res + c
+    !$acc end parallel
+    !$acc exit data copyout(res) delete(a, b, c)
+
+    test_int = res
+  end function test_int
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-declare.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-declare.f90
new file mode 100644
index 0000000..074e5a2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-declare.f90
@@ -0,0 +1,87 @@
+! Test OpenACC declare directives with optional arguments.
+
+! { dg-do run }
+
+program test
+  implicit none
+
+  integer, parameter :: n = 64
+  integer :: i
+  integer :: a_int, b_int, c_int, res_int
+  integer :: a_arr(n), b_arr(n), c_arr(n), res_arr(n)
+
+  a_int = 7
+  b_int = 3
+  c_int = 11
+
+  call test_int(res_int, a_int)
+  if (res_int .ne. a_int) stop 1
+
+  call test_int(res_int, a_int, b_int)
+  if (res_int .ne. a_int * b_int) stop 2
+
+  call test_int(res_int, a_int, b_int, c_int)
+  if (res_int .ne. a_int * b_int + c_int) stop 3
+
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+    c_arr(i) = i * 3
+  end do
+
+  call test_array(res_arr, a_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i)) stop 4
+  end do
+
+  call test_array(res_arr, a_arr, b_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i)) stop 5
+  end do
+
+  call test_array(res_arr, a_arr, b_arr, c_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i) + c_arr(i)) stop 6
+  end do
+contains
+  subroutine test_int(res, a, b, c)
+    integer :: a
+    integer, optional :: b, c
+    !$acc declare present_or_copyin(a, b, c)
+    integer :: res
+    !$acc declare present_or_copyout(res)
+
+    !$acc parallel
+    res = a
+    if (present(b)) res = res * b
+    if (present(c)) res = res + c
+    !$acc end parallel
+  end subroutine test_int
+
+  subroutine test_array(res, a, b, c)
+    integer :: a(n)
+    integer, optional :: b(n), c(n)
+    !$acc declare present_or_copyin(a, b, c)
+    integer :: res(n)
+    !$acc declare present_or_copyout(res)
+
+    !$acc parallel loop
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(b)) then
+        res(i) = res(i) * b(i)
+      end if
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(c)) then
+        res(i) = res(i) + c(i)
+      end if
+    end do
+  end subroutine test_array
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-firstprivate.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-firstprivate.f90
new file mode 100644
index 0000000..693e611
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-firstprivate.f90
@@ -0,0 +1,112 @@
+! Test that optional arguments work in firstprivate clauses.  The effect of
+! non-present arguments in firstprivate clauses is undefined, and is not
+! tested for.
+
+! { dg-do run }
+
+program test_firstprivate
+  implicit none
+  integer, parameter :: n = 64
+
+  integer :: i, j
+  integer :: a_int, b_int, c_int, res_int
+  integer :: a_arr(n), b_arr(n), c_arr(n), res_arr(n)
+  integer, allocatable :: a_alloc(:), b_alloc(:), c_alloc(:), res_alloc(:)
+
+  a_int = 14
+  b_int = 5
+  c_int = 12
+
+  call test_int(res_int, a_int, b_int, c_int)
+  if (res_int .ne. a_int * b_int + c_int) stop 1
+
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+    c_arr(i) = i * 3
+  end do
+
+  call test_array(res_arr, a_arr, b_arr, c_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i) + c_arr(i)) stop 2
+  end do
+
+  allocate(a_alloc(n))
+  allocate(b_alloc(n))
+  allocate(c_alloc(n))
+  allocate(res_alloc(n))
+
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+    c_arr(i) = i * 3
+  end do
+
+  call test_allocatable(res_alloc, a_alloc, b_alloc, c_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i) * b_alloc(i) + c_alloc(i)) stop 2
+  end do
+
+  deallocate(a_alloc)
+  deallocate(b_alloc)
+  deallocate(c_alloc)
+  deallocate(res_alloc)
+contains
+  subroutine test_int(res, a, b, c)
+    integer :: a
+    integer, optional :: b, c
+    integer :: res
+
+    !$acc parallel firstprivate(a, b, c) copyout(res)
+    res = a
+    if (present(b)) res = res * b
+    if (present(c)) res = res + c
+    !$acc end parallel
+  end subroutine test_int
+
+  subroutine test_array(res, a, b, c)
+    integer :: a(n)
+    integer, optional :: b(n), c(n)
+    integer :: res(n)
+
+    !$acc data copyin(a, b, c) copyout(res)
+    !$acc parallel loop firstprivate(a)
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop firstprivate(b)
+    do i = 1, n
+      if (present(b)) res(i) = res(i) * b(i)
+    end do
+
+    !$acc parallel loop firstprivate(c)
+    do i = 1, n
+      if (present(c)) res(i) = res(i) + c(i)
+    end do
+    !$acc end data
+  end subroutine test_array
+
+  subroutine test_allocatable(res, a, b, c)
+    integer, allocatable :: a(:)
+    integer, allocatable, optional :: b(:), c(:)
+    integer, allocatable :: res(:)
+
+    !$acc data copyin(a, b, c) copyout(res)
+    !$acc parallel loop firstprivate(a)
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop firstprivate(b)
+    do i = 1, n
+      if (present(b)) res(i) = res(i) * b(i)
+    end do
+
+    !$acc parallel loop firstprivate(c)
+    do i = 1, n
+      if (present(c)) res(i) = res(i) + c(i)
+    end do
+    !$acc end data
+  end subroutine test_allocatable
+end program test_firstprivate
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-host_data.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-host_data.f90
new file mode 100644
index 0000000..a6e41e2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-host_data.f90
@@ -0,0 +1,39 @@
+! Test the host_data construct with optional arguments.
+! Based on host_data-1.f90.
+
+! { dg-do run }
+! { dg-additional-options "-cpp" }
+
+program test
+  implicit none
+
+  integer, target :: i
+  integer, pointer :: ip, iph
+
+  ! Assign the same targets
+  ip => i
+  iph => i
+
+  call foo(iph)
+  call foo(iph, ip)
+contains
+  subroutine foo(iph, ip)
+    integer, pointer :: iph
+    integer, pointer, optional :: ip
+
+    !$acc data copyin(i)
+    !$acc host_data use_device(ip)
+
+    ! Test how the pointers compare inside a host_data construct
+    if (present(ip)) then
+#if ACC_MEM_SHARED
+      if (.not. associated(ip, iph)) STOP 1
+#else
+      if (associated(ip, iph)) STOP 2
+#endif
+    end if
+
+    !$acc end host_data
+    !$acc end data
+  end subroutine foo
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-nested-calls.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-nested-calls.f90
new file mode 100644
index 0000000..279139f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-nested-calls.f90
@@ -0,0 +1,135 @@
+! Test propagation of optional arguments from within an OpenACC parallel region.
+
+! { dg-do run }
+
+program test
+  implicit none
+
+  integer, parameter :: n = 64
+  integer :: i
+  integer :: res_int
+  integer :: a_arr(n), b_arr(n), res_arr(n)
+  integer, allocatable :: a_alloc(:), b_alloc(:), res_alloc(:)
+
+  call test_int_caller(res_int, 5)
+  if (res_int .ne. 10) stop 1
+
+  call test_int_caller(res_int, 2, 3)
+  if (res_int .ne. 11) stop 2
+
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+  end do
+
+  call test_array_caller(res_arr, a_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. 2 * a_arr(i)) stop 3
+  end do
+
+  call test_array_caller(res_arr, a_arr, b_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i) + a_arr(i) + b_arr(i)) stop 4
+  end do
+
+  allocate(a_alloc(n))
+  allocate(b_alloc(n))
+  allocate(res_alloc(n))
+
+  do i = 1, n
+    a_alloc(i) = i
+    b_alloc(i) = n - i + 1
+  end do
+
+  call test_array_caller(res_arr, a_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. 2 * a_alloc(i)) stop 5
+  end do
+
+  call test_array_caller(res_arr, a_arr, b_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_alloc(i) + a_alloc(i) + b_alloc(i)) stop 6
+  end do
+
+  deallocate(a_alloc)
+  deallocate(b_alloc)
+  deallocate(res_alloc)
+contains
+  subroutine test_int_caller(res, a, b)
+    integer :: res, a
+    integer, optional :: b
+
+    !$acc data copyin(a, b) copyout (res)
+    !$acc parallel
+    res = a
+    if (present(b)) res = res * b
+    call test_int_callee(res, a, b)
+    !$acc end parallel
+    !$acc end data
+  end subroutine test_int_caller
+
+  subroutine test_int_callee(res, a, b)
+    !$acc routine seq
+    integer :: res, a
+    integer, optional :: b
+
+    res = res + a
+    if (present(b)) res = res + b
+  end subroutine test_int_callee
+
+  subroutine test_array_caller(res, a, b)
+    integer :: res(n), a(n), i
+    integer, optional :: b(n)
+
+    !$acc data copyin(a, b) copyout(res)
+    !$acc parallel
+    !$acc loop seq
+    do i = 1, n
+      res(i) = a(i)
+      if (present(b)) res(i) = res(i) * b(i)
+    end do
+    call test_array_callee(res, a, b)
+    !$acc end parallel
+    !$acc end data
+  end subroutine test_array_caller
+
+  subroutine test_array_callee(res, a, b)
+    !$acc routine seq
+    integer :: res(n), a(n), i
+    integer, optional :: b(n)
+
+    do i = 1, n
+      res(i) = res(i) + a(i)
+      if (present(b)) res(i) = res(i) + b(i)
+    end do
+  end subroutine test_array_callee
+
+  subroutine test_allocatable_caller(res, a, b)
+    integer :: i
+    integer, allocatable :: res(:), a(:)
+    integer, allocatable, optional :: b(:)
+
+    !$acc data copyin(a, b) copyout(res)
+    !$acc parallel
+    !$acc loop seq
+    do i = 1, n
+      res(i) = a(i)
+      if (present(b)) res(i) = res(i) * b(i)
+    end do
+    call test_array_callee(res, a, b)
+    !$acc end parallel
+    !$acc end data
+  end subroutine test_allocatable_caller
+
+  subroutine test_allocatable_callee(res, a, b)
+    !$acc routine seq
+    integer :: i
+    integer, allocatable :: res(:), a(:)
+    integer, allocatable, optional :: b(:)
+
+    do i = 1, n
+      res(i) = res(i) + a(i)
+      if (present(b)) res(i) = res(i) + b(i)
+    end do
+  end subroutine test_allocatable_callee
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-private.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-private.f90
new file mode 100644
index 0000000..6bc91b7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-private.f90
@@ -0,0 +1,118 @@
+! Test that optional arguments work in private clauses.  The effect of
+! non-present arguments in private clauses is undefined, and is not tested
+! for.  The tests are based on those in private-variables.f90.
+
+! { dg-do run }
+
+program main
+  implicit none
+
+  type vec3
+     integer x, y, z, attr(13)
+  end type vec3
+  integer :: x
+  type(vec3) :: pt
+  integer :: arr(2)
+
+  call t1(x)
+  call t2(pt)
+  call t3(arr)
+contains
+
+  ! Test of gang-private variables declared on loop directive.
+
+  subroutine t1(x)
+    integer, optional :: x
+    integer :: i, arr(32)
+
+    do i = 1, 32
+       arr(i) = i
+    end do
+
+    !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+    ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 32 }
+    ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 32 }
+    !$acc loop gang private(x)
+    do i = 1, 32
+       x = i * 2;
+       arr(i) = arr(i) + x
+    end do
+    !$acc end parallel
+
+    do i = 1, 32
+       if (arr(i) .ne. i * 3) STOP 1
+    end do
+  end subroutine t1
+
+
+  ! Test of gang-private addressable variable declared on loop directive, with
+  ! broadcasting to partitioned workers.
+
+  subroutine t2(pt)
+    integer i, j, arr(0:32*32)
+    type(vec3), optional :: pt
+
+    do i = 0, 32*32-1
+       arr(i) = i
+    end do
+
+    !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+    ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 59 }
+    !$acc loop gang private(pt)
+    do i = 0, 31
+       pt%x = i
+       pt%y = i * 2
+       pt%z = i * 4
+       pt%attr(5) = i * 6
+
+       !$acc loop vector
+       do j = 0, 31
+          arr(i * 32 + j) = arr(i * 32 + j) + pt%x + pt%y + pt%z + pt%attr(5);
+       end do
+    end do
+    !$acc end parallel
+
+    do i = 0, 32 * 32 - 1
+       if (arr(i) .ne. i + (i / 32) * 13) STOP 2
+    end do
+  end subroutine t2
+
+  ! Test of vector-private variables declared on loop directive. Array type.
+
+  subroutine t3(pt)
+    integer, optional :: pt(2)
+    integer :: i, j, k, idx, arr(0:32*32*32)
+
+    do i = 0, 32*32*32-1
+       arr(i) = i
+    end do
+
+    !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+    !$acc loop gang
+    do i = 0, 31
+       !$acc loop worker
+       do j = 0, 31
+          !$acc loop vector private(pt)
+          do k = 0, 31
+             pt(1) = ieor(i, j * 3)
+             pt(2) = ior(i, j * 5)
+             arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt(1) * k
+             arr(i * 1024 + j * 32 + k) = arr(i * 1024 + j * 32 + k) + pt(2) * k
+          end do
+       end do
+    end do
+    !$acc end parallel
+
+    do i = 0, 32 - 1
+       do j = 0, 32 -1
+          do k = 0, 32 - 1
+             idx = i * 1024 + j * 32 + k
+             if (arr(idx) .ne. idx + ieor(i, j * 3) * k + ior(i, j * 5) * k) then
+                STOP 3
+             end if
+          end do
+       end do
+    end do
+  end subroutine t3
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
new file mode 100644
index 0000000..b76db3e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-reduction.f90
@@ -0,0 +1,69 @@
+! Test optional arguments in reduction clauses.  The effect of
+! non-present arguments in reduction clauses is undefined, and is not tested
+! for.  The tests are based on those in reduction-1.f90.
+
+! { dg-do run }
+! { dg-additional-options "-w" }
+
+program optional_reduction
+  implicit none
+
+  integer :: rg, rw, rv, rc
+
+  rg = 0
+  rw = 0
+  rv = 0
+  rc = 0
+
+  call do_test(rg, rw, rv, rc)
+contains
+  subroutine do_test(rg, rw, rv, rc)
+    integer, parameter     :: n = 10, ng = 8, nw = 4, vl = 32
+    integer, optional      :: rg, rw, rv, rc
+    integer                :: i, vresult
+    integer, dimension (n) :: array
+
+    vresult = 0
+    do i = 1, n
+       array(i) = i
+    end do
+
+    !$acc parallel num_gangs(ng) copy(rg)
+    !$acc loop reduction(+:rg) gang
+    do i = 1, n
+       rg = rg + array(i)
+    end do
+    !$acc end parallel
+
+    !$acc parallel num_workers(nw) copy(rw)
+    !$acc loop reduction(+:rw) worker
+    do i = 1, n
+       rw = rw + array(i)
+    end do
+    !$acc end parallel
+
+    !$acc parallel vector_length(vl) copy(rv)
+    !$acc loop reduction(+:rv) vector
+    do i = 1, n
+       rv = rv + array(i)
+    end do
+    !$acc end parallel
+
+    !$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
+    !$acc loop reduction(+:rc) gang worker vector
+    do i = 1, n
+       rc = rc + array(i)
+    end do
+    !$acc end parallel
+
+    ! Verify the results
+    do i = 1, n
+       vresult = vresult + array(i)
+    end do
+
+    if (rg .ne. vresult) STOP 1
+    if (rw .ne. vresult) STOP 2
+    if (rv .ne. vresult) STOP 3
+    if (rc .ne. vresult) STOP 4
+  end subroutine do_test
+end program optional_reduction
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-update-device.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-update-device.f90
new file mode 100644
index 0000000..57f6900
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-update-device.f90
@@ -0,0 +1,121 @@
+! Test OpenACC update to device with an optional argument.
+
+! { dg-do run }
+
+program optional_update_device
+  implicit none
+
+  integer, parameter :: n = 64
+  integer :: i
+  integer :: a_int, b_int, res_int
+  integer :: a_arr(n), b_arr(n), res_arr(n)
+  integer, allocatable :: a_alloc(:), b_alloc(:), res_alloc(:)
+
+  a_int = 5
+  b_int = 11
+
+  call test_int(res_int, a_int)
+  if (res_int .ne. a_int) stop 1
+
+  call test_int(res_int, a_int, b_int)
+  if (res_int .ne. a_int * b_int) stop 2
+
+  res_arr(:) = 0
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+  end do
+
+  call test_array(res_arr, a_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i)) stop 3
+  end do
+
+  call test_array(res_arr, a_arr, b_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i)) stop 4
+  end do
+
+  allocate (a_alloc(n))
+  allocate (b_alloc(n))
+  allocate (res_alloc(n))
+
+  res_alloc(:) = 0
+  do i = 1, n
+    a_alloc(i) = i
+    b_alloc(i) = n - i + 1
+  end do
+
+  call test_allocatable(res_alloc, a_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i)) stop 5
+  end do
+
+  call test_allocatable(res_alloc, a_alloc, b_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i) * b_alloc(i)) stop 6
+  end do
+
+  deallocate (a_alloc)
+  deallocate (b_alloc)
+  deallocate (res_alloc)
+contains
+  subroutine test_int(res, a, b)
+    integer :: res
+    integer :: a
+    integer, optional :: b
+
+    !$acc data create(a, b, res)
+    !$acc update device(a, b)
+    !$acc parallel
+    res = a
+    if (present(b)) res = res * b
+    !$acc end parallel
+    !$acc update self(res)
+    !$acc end data
+  end subroutine test_int
+
+  subroutine test_array(res, a, b)
+    integer :: res(n)
+    integer :: a(n)
+    integer, optional :: b(n)
+
+    !$acc data create(a, b, res)
+    !$acc update device(a, b)
+    !$acc parallel loop
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(b)) then
+        res(i) = res(i) * b(i)
+      end if
+    end do
+    !$acc update self(res)
+    !$acc end data
+  end subroutine test_array
+
+  subroutine test_allocatable(res, a, b)
+    integer, allocatable :: res(:)
+    integer, allocatable :: a(:)
+    integer, allocatable, optional :: b(:)
+
+    !$acc data create(a, b, res)
+    !$acc update device(a, b)
+    !$acc parallel loop
+    do i = 1, n
+      res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(b)) then
+        res(i) = res(i) * b(i)
+      end if
+    end do
+    !$acc update self(res)
+    !$acc end data
+  end subroutine test_allocatable
+end program optional_update_device
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/optional-update-host.f90 b/libgomp/testsuite/libgomp.oacc-fortran/optional-update-host.f90
new file mode 100644
index 0000000..0f3a903
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/optional-update-host.f90
@@ -0,0 +1,115 @@
+! Test OpenACC update to host with an optional argument.
+
+! { dg-do run }
+
+program optional_update_host
+  implicit none
+
+  integer, parameter :: n = 64
+  integer :: i
+  integer :: a_int, b_int, res_int
+  integer :: a_arr(n), b_arr(n), res_arr(n)
+  integer, allocatable :: a_alloc(:), b_alloc(:), res_alloc(:)
+
+  a_int = 5
+  b_int = 11
+  res_int = 0
+
+  call test_int(a_int, b_int)
+  if (res_int .ne. 0) stop 1
+
+  call test_int(a_int, b_int, res_int)
+  if (res_int .ne. a_int * b_int) stop 2
+
+  res_arr(:) = 0
+  do i = 1, n
+    a_arr(i) = i
+    b_arr(i) = n - i + 1
+  end do
+
+  call test_array(a_arr, b_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. 0) stop 1
+  end do
+
+  call test_array(a_arr, b_arr, res_arr)
+  do i = 1, n
+    if (res_arr(i) .ne. a_arr(i) * b_arr(i)) stop 2
+  end do
+
+  allocate(a_alloc(n))
+  allocate(b_alloc(n))
+  allocate(res_alloc(n))
+
+  res_alloc(:) = 0
+  do i = 1, n
+    a_alloc(i) = i
+    b_alloc(i) = n - i + 1
+  end do
+
+  call test_allocatable(a_alloc, b_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. 0) stop 1
+  end do
+
+  call test_allocatable(a_alloc, b_alloc, res_alloc)
+  do i = 1, n
+    if (res_alloc(i) .ne. a_alloc(i) * b_alloc(i)) stop 2
+  end do
+
+  deallocate(a_alloc)
+  deallocate(b_alloc)
+  deallocate(res_alloc)
+contains
+  subroutine test_int(a, b, res)
+    integer :: a, b
+    integer, optional :: res
+
+    !$acc data create(a, b, res)
+    !$acc update device(a, b)
+    !$acc parallel
+    if (present(res)) res = a
+    if (present(res)) res = res * b
+    !$acc end parallel
+    !$acc update self(res)
+    !$acc end data
+  end subroutine test_int
+
+  subroutine test_array(a, b, res)
+    integer :: a(n), b(n)
+    integer, optional :: res(n)
+
+    !$acc data create(a, b, res)
+    !$acc update device(a, b)
+    !$acc parallel loop
+    do i = 1, n
+      if (present(res)) res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(res)) res(i) = res(i) * b(i)
+    end do
+    !$acc update self(res)
+    !$acc end data
+  end subroutine test_array
+
+  subroutine test_allocatable(a, b, res)
+    integer, allocatable :: a(:), b(:)
+    integer, allocatable, optional :: res(:)
+
+    !$acc data create(a, b, res)
+    !$acc update device(a, b)
+    !$acc parallel loop
+    do i = 1, n
+      if (present(res)) res(i) = a(i)
+    end do
+
+    !$acc parallel loop
+    do i = 1, n
+      if (present(res)) res(i) = res(i) * b(i)
+    end do
+    !$acc update self(res)
+    !$acc end data
+  end subroutine test_allocatable
+end program optional_update_host
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-1.f b/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-1.f
index aa1bb63..ff31116 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-1.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-1.f
@@ -14,7 +14,7 @@
       RES2 = 0
 
 !$ACC PARALLEL NUM_GANGS(256) NUM_WORKERS(32) VECTOR_LENGTH(32)
-!$ACC& REDUCTION(+:RES1) COPY(RES1, RES2) ASYNC(1)
+!$ACC& REDUCTION(+:RES1) COPY(RES1, RES2) ASYNC(1) ! { dg-warning "region is (gang|worker|vector) partitioned" }
       res1 = res1 + 5
 
 !$ACC ATOMIC
@@ -36,7 +36,7 @@
       RES2 = 1
 
 !$ACC PARALLEL NUM_GANGS(8) NUM_WORKERS(32) VECTOR_LENGTH(32)
-!$ACC& REDUCTION(*:RES1) COPY(RES1, RES2) ASYNC(1)
+!$ACC& REDUCTION(*:RES1) COPY(RES1, RES2) ASYNC(1) ! { dg-warning "region is (gang|worker|vector) partitioned" }
       res1 = res1 * 5
 
 !$ACC ATOMIC
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-2.f b/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-2.f
index 5694de1..47c5ff3 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-2.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/par-reduction-2-2.f
@@ -14,7 +14,7 @@
       RES2 = 0
 
 !$ACC PARALLEL NUM_GANGS(256) NUM_WORKERS(32) VECTOR_LENGTH(32)
-!$ACC& REDUCTION(+:RES1) COPY(RES1, RES2) ASYNC(1)
+!$ACC& REDUCTION(+:RES1) COPY(RES1, RES2) ASYNC(1) ! { dg-warning "region is (gang|worker|vector) partitioned" }
       res1 = res1 + 5
 
 !$ACC ATOMIC
@@ -36,7 +36,7 @@
       RES2 = 1
 
 !$ACC PARALLEL NUM_GANGS(8) NUM_WORKERS(32) VECTOR_LENGTH(32)
-!$ACC& REDUCTION(*:RES1) COPY(RES1, RES2) ASYNC(1)
+!$ACC& REDUCTION(*:RES1) COPY(RES1, RES2) ASYNC(1) ! { dg-warning "region is (gang|worker|vector) partitioned" }
       res1 = res1 * 5
 
 !$ACC ATOMIC
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-2.f90
new file mode 100644
index 0000000..22a9566
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-2.f90
@@ -0,0 +1,31 @@
+! Subarrays declared on data construct: assumed-shape array.
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(n)
+
+  !$acc data copy(arr(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+end subroutine s1
+
+program test
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  call s1(n, data)
+
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) call abort
+     else if (data(i) .ne. i) then
+        call abort
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-3.f90
new file mode 100644
index 0000000..ff17d10
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-3.f90
@@ -0,0 +1,34 @@
+! Subarrays declared on data construct: deferred-shape array.
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(n)
+
+  !$acc data copy(arr(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+end subroutine s1
+
+program test
+  integer, parameter :: n = 100
+  integer i
+  integer, allocatable :: data(:)
+
+  allocate (data(1:n))
+
+  data(:) = 0
+
+  call s1(n, data)
+
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) call abort
+     else if (data(i) .ne. i) then
+        call abort
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-4.f90
new file mode 100644
index 0000000..01da999
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-4.f90
@@ -0,0 +1,31 @@
+! Subarrays declared on data construct: assumed-size array.
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(*)
+
+  !$acc data copy(arr(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+end subroutine s1
+
+program test
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  call s1(n, data)
+
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) call abort
+     else if (data(i) .ne. i) then
+        call abort
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-5.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-5.f90
new file mode 100644
index 0000000..8a16e3d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-5.f90
@@ -0,0 +1,29 @@
+! Subarrays on parallel construct (no data construct): assumed-size array.
+
+subroutine s1(n, arr)
+  integer :: n
+  integer :: arr(*)
+
+  !$acc parallel loop copy(arr(5:n-10))
+  do i = 10, n - 10
+     arr(i) = i
+  end do
+  !$acc end parallel loop
+end subroutine s1
+
+program test
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  call s1(n, data)
+
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) call abort
+     else if (data(i) .ne. i) then
+        call abort
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-6.f90
new file mode 100644
index 0000000..e99c364
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828-6.f90
@@ -0,0 +1,28 @@
+! Subarrays declared on data construct: allocatable array (with array
+! descriptor).
+
+program test
+  integer, parameter :: n = 100
+  integer i
+  integer, allocatable :: data(:)
+
+  allocate (data(1:n))
+
+  data(:) = 0
+
+  !$acc data copy(data(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     data(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) call abort
+     else if (data(i) .ne. i) then
+        call abort
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90
new file mode 100644
index 0000000..f87d232
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr70828.f90
@@ -0,0 +1,24 @@
+! Subarrays on data construct: explicit-shape array.
+
+program test
+  integer, parameter :: n = 100
+  integer i, data(n)
+
+  data(:) = 0
+
+  !$acc data copy(data(5:n-10))
+  !$acc parallel loop
+  do i = 10, n - 10
+     data(i) = i
+  end do
+  !$acc end parallel loop
+  !$acc end data
+
+  do i = 1, n
+     if ((i < 10 .or. i > n-10)) then
+        if ((data(i) .ne. 0)) call abort
+     else if (data(i) .ne. i) then
+        call abort
+     end if
+  end do
+end program test
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/pr84028.f90 b/libgomp/testsuite/libgomp.oacc-fortran/pr84028.f90
index 2b36122..8cb76a9 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/pr84028.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/pr84028.f90
@@ -5,7 +5,7 @@
 
   a = 1
 
-  !$acc parallel num_gangs(1) num_workers(2)
+  !$acc parallel num_gangs(1) num_workers(2) ! { dg-warning "region is worker partitioned" }
 
   if (any(a(1:3,1:3,1:3).ne.1)) STOP 1
 
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/print-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/print-1.f90
new file mode 100644
index 0000000..9a94195
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/print-1.f90
@@ -0,0 +1,15 @@
+! Ensure that printf on the offload device works.
+
+! { dg-do run }
+! { dg-output "The answer is 42(\n|\r\n|\r)+" }
+! { dg-xfail-if "no write for nvidia" { openacc_nvidia_accel_selected } }
+
+program main
+  implicit none
+  integer :: var = 42
+
+!$acc parallel
+  write (0, '("The answer is ", I2)') var
+!$acc end parallel
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/private-variables.f90 b/libgomp/testsuite/libgomp.oacc-fortran/private-variables.f90
index 472a6a1..fbff5cc 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/private-variables.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/private-variables.f90
@@ -13,6 +13,8 @@
   end do
 
   !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 15 }
+  ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 15 }
   !$acc loop gang private(x)
   do i = 1, 32
      x = i * 2;
@@ -37,6 +39,7 @@
   end do
 
   !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 41 }
   !$acc loop gang private(x)
   do i = 0, 31
      x = i * 2;
@@ -65,6 +68,7 @@
   end do
 
   !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 70 }
   !$acc loop gang private(x)
   do i = 0, 31
      x = i * 2;
@@ -98,6 +102,7 @@
   end do
 
   !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 104 }
   !$acc loop gang private(pt)
   do i = 0, 31
      pt%x = i
@@ -208,6 +213,7 @@
   end do
 
   !$acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
+  ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 215 }
   !$acc loop gang private(x)
   do i = 0, 31
      !$acc loop worker private(x)
@@ -507,6 +513,8 @@
   end do
 
   !$acc parallel private(x) copy(arr) num_gangs(n) num_workers(8) vector_length(32)
+  ! { dg-warning "region is worker partitioned but does not contain worker partitioned code" "worker" { target *-*-* } 515 }
+  ! { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 515 }
     !$acc loop gang(static:1)
     do i = 1, n
       x = i * 2;
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95 b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
new file mode 100644
index 0000000..f16f69c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/privatized-ref-1.f95
@@ -0,0 +1,71 @@
+! { dg-do run }
+
+program main
+  implicit none
+  integer :: myint
+  integer :: i
+  real :: res(65536), tmp
+
+  res(:) = 0.0
+
+  myint = 5
+  call workers(myint, res)
+
+  do i=1,65536
+    tmp = i * 99
+    if (res(i) .ne. tmp) stop 1
+  end do
+
+  res(:) = 0.0
+
+  myint = 7
+  call vectors(myint, res)
+
+  do i=1,65536
+    tmp = i * 101
+    if (res(i) .ne. tmp) stop 2
+  end do
+
+contains
+
+  subroutine workers(t1, res)
+    implicit none
+    integer :: t1
+    integer :: i, j
+    real, intent(out) :: res(:)
+
+    !$acc parallel copyout(res) num_gangs(64) num_workers(64)
+
+    !$acc loop gang
+    do i=0,255
+      !$acc loop worker private(t1)
+      do j=1,256
+        t1 = (i * 256 + j) * 99
+        res(i * 256 + j) = t1
+      end do
+    end do
+
+    !$acc end parallel
+  end subroutine workers
+
+  subroutine vectors(t1, res)
+    implicit none
+    integer :: t1
+    integer :: i, j
+    real, intent(out) :: res(:)
+
+    !$acc parallel copyout(res) num_gangs(64) num_workers(64)
+
+    !$acc loop gang worker
+    do i=0,255
+      !$acc loop vector private(t1)
+      do j=1,256
+        t1 = (i * 256 + j) * 101
+        res(i * 256 + j) = t1
+      end do
+    end do
+
+    !$acc end parallel
+  end subroutine vectors
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90
new file mode 100644
index 0000000..fd64d88
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-9.f90
@@ -0,0 +1,54 @@
+! Test gang reductions on dummy variables.
+
+! { dg-do run }
+
+program main
+  implicit none
+
+  integer g, w, v, c
+
+  g = 0
+  w = 0
+  v = 0
+  c = 0
+
+  call reduction (g, w, v, c)
+
+  if (g /= 10) call abort
+  if (w /= 10) call abort
+  if (v /= 10) call abort
+  if (c /= 100) call abort
+end program main
+
+subroutine reduction (g, w, v, c)
+  implicit none
+
+  integer g, w, v, c, i
+
+  !$acc parallel
+  !$acc loop reduction(+:g) gang
+  do i = 1, 10
+     g = g + 1
+  end do
+  !$acc end parallel
+
+  !$acc parallel
+  !$acc loop reduction(+:w) worker
+  do i = 1, 10
+     w = w + 1
+  end do
+  !$acc end parallel
+
+  !$acc parallel
+  !$acc loop reduction(+:v) vector
+  do i = 1, 10
+     v = v + 1
+  end do
+  !$acc end parallel
+
+  !$acc parallel loop reduction(+:c) gang worker vector
+  do i = 1, 100
+     c = c + 1
+  end do
+  !$acc end parallel loop
+end subroutine reduction
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/routine-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/routine-6.f90
new file mode 100644
index 0000000..1bae09c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/routine-6.f90
@@ -0,0 +1,28 @@
+! { dg-do run }
+! { dg-xfail-if "TODO" { *-*-* } }
+
+program main
+  integer :: a, n
+  
+  n = 10
+
+  !$acc parallel copy (a, n)
+     a = foo (n)
+  !$acc end parallel 
+
+  if (a .ne. n * n) call abort
+
+contains
+
+function foo (n) result (rc)
+  !$acc routine nohost
+
+  integer, intent (in) :: n
+  integer :: rc
+
+  rc = n * n
+
+end function
+
+end program main
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/routine-7.f90 b/libgomp/testsuite/libgomp.oacc-fortran/routine-7.f90
index 1009f4a..005d90b 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/routine-7.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/routine-7.f90
@@ -1,4 +1,3 @@
-
 ! { dg-do run }
 ! { dg-additional-options "-cpp" }
 
@@ -100,7 +99,7 @@
   integer, intent (inout) :: a(N)
   integer :: i
 
-  !$acc loop gang
+  !$acc loop gang worker vector
   do i = 1, N
     a(i) = a(i) - i 
   end do
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/update-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/update-2.f90
new file mode 100644
index 0000000..c3c8a07
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/update-2.f90
@@ -0,0 +1,284 @@
+! Test ACC UPDATE with derived types.
+
+! { dg-do run }
+
+module dt
+  integer, parameter :: n = 10
+  type inner
+     integer :: d(n)
+  end type inner
+  type mytype
+     integer(8) :: a, b, c(n)
+     type(inner) :: in
+  end type mytype
+end module dt
+
+program derived_acc
+  use dt
+
+  implicit none
+  integer i, res
+  type(mytype) :: var
+
+  var%a = 0
+  var%b = 1
+  var%c(:) = 10
+  var%in%d(:) = 100
+
+  var%c(:) = 10
+
+  !$acc enter data copyin(var)
+
+  !$acc parallel loop present(var)
+  do i = 1, 1
+     var%a = var%b
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%a)
+
+  if (var%a /= var%b) stop 1
+
+  var%b = 100
+
+  !$acc update device(var%b)
+
+  !$acc parallel loop present(var)
+  do i = 1, 1
+     var%a = var%b
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%a)
+
+  if (var%a /= var%b) stop 2
+
+  !$acc parallel loop present (var)
+  do i = 1, n
+     var%c(i) = i
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%c)
+
+  var%a = -1
+
+  do i = 1, n
+     if (var%c(i) /= i) stop 3
+     var%c(i) = var%a
+  end do
+
+  !$acc update device(var%a)
+  !$acc update device(var%c)
+
+  res = 0
+
+  !$acc parallel loop present(var) reduction(+:res)
+  do i = 1, n
+     if (var%c(i) /= var%a) res = res + 1
+  end do
+
+  if (res /= 0) stop 4
+
+  var%c(:) = 0
+
+  !$acc update device(var%c)
+
+  !$acc parallel loop present(var)
+  do i = 5, 5
+     var%c(i) = 1
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%c(5))
+
+  do i = 1, n
+     if (i /= 5 .and. var%c(i) /= 0) stop 5
+     if (i == 5 .and. var%c(i) /= 1) stop 6
+  end do
+
+  !$acc parallel loop present(var)
+  do i = 1, n
+     var%in%d = var%a
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%in%d)
+
+  do i = 1, n
+     if (var%in%d(i) /= var%a) stop 7
+  end do
+
+  var%c(:) = 0
+
+  !$acc update device(var%c)
+
+  var%c(:) = -1
+
+  !$acc parallel loop present(var)
+  do i = n/2, n
+     var%c(i) = i
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%c(n/2:n))
+
+  do i = 1,n
+     if (i < n/2 .and. var%c(i) /= -1) stop 8
+     if (i >= n/2 .and. var%c(i) /= i) stop 9
+  end do
+
+  var%in%d(:) = 0
+  !$acc update device(var%in%d)
+
+  !$acc parallel loop present(var)
+  do i = 5, 5
+     var%in%d(i) = 1
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%in%d(5))
+
+  do i = 1, n
+     if (i /= 5 .and. var%in%d(i) /= 0) stop 10
+     if (i == 5 .and. var%in%d(i) /= 1) stop 11
+  end do
+
+  !$acc exit data delete(var)
+
+  call derived_acc_subroutine(var)
+end program derived_acc
+
+subroutine derived_acc_subroutine(var)
+  use dt
+
+  implicit none
+  integer i, res
+  type(mytype) :: var
+
+  var%a = 0
+  var%b = 1
+  var%c(:) = 10
+  var%in%d(:) = 100
+
+  var%c(:) = 10
+
+  !$acc enter data copyin(var)
+
+  !$acc parallel loop present(var)
+  do i = 1, 1
+     var%a = var%b
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%a)
+
+  if (var%a /= var%b) stop 12
+
+  var%b = 100
+
+  !$acc update device(var%b)
+
+  !$acc parallel loop present(var)
+  do i = 1, 1
+     var%a = var%b
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%a)
+
+  if (var%a /= var%b) stop 13
+
+  !$acc parallel loop present (var)
+  do i = 1, n
+     var%c(i) = i
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%c)
+
+  var%a = -1
+
+  do i = 1, n
+     if (var%c(i) /= i) stop 14
+     var%c(i) = var%a
+  end do
+
+  !$acc update device(var%a)
+  !$acc update device(var%c)
+
+  res = 0
+
+  !$acc parallel loop present(var) reduction(+:res)
+  do i = 1, n
+     if (var%c(i) /= var%a) res = res + 1
+  end do
+
+  if (res /= 0) stop 15
+
+  var%c(:) = 0
+
+  !$acc update device(var%c)
+
+  !$acc parallel loop present(var)
+  do i = 5, 5
+     var%c(i) = 1
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%c(5))
+
+  do i = 1, n
+     if (i /= 5 .and. var%c(i) /= 0) stop 16
+     if (i == 5 .and. var%c(i) /= 1) stop 17
+  end do
+
+  !$acc parallel loop present(var)
+  do i = 1, n
+     var%in%d = var%a
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%in%d)
+
+  do i = 1, n
+     if (var%in%d(i) /= var%a) stop 18
+  end do
+
+  var%c(:) = 0
+
+  !$acc update device(var%c)
+
+  var%c(:) = -1
+
+  !$acc parallel loop present(var)
+  do i = n/2, n
+     var%c(i) = i
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%c(n/2:n))
+
+  do i = 1,n
+     if (i < n/2 .and. var%c(i) /= -1) stop 19
+     if (i >= n/2 .and. var%c(i) /= i) stop 20
+  end do
+
+  var%in%d(:) = 0
+  !$acc update device(var%in%d)
+
+  !$acc parallel loop present(var)
+  do i = 5, 5
+     var%in%d(i) = 1
+  end do
+  !$acc end parallel loop
+
+  !$acc update host(var%in%d(5))
+
+  do i = 1, n
+     if (i /= 5 .and. var%in%d(i) /= 0) stop 21
+     if (i == 5 .and. var%in%d(i) /= 1) stop 22
+  end do
+
+  !$acc exit data delete(var)
+end subroutine derived_acc_subroutine
diff --git a/libitm/acinclude.m4 b/libitm/acinclude.m4
index cd8150c..1e1bb6b 100644
--- a/libitm/acinclude.m4
+++ b/libitm/acinclude.m4
@@ -247,7 +247,7 @@
   fi
   changequote(,)
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
   changequote([,])
   libitm_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) [$]3=0; print ([$]1*100+[$]2)*100+[$]3 }'`
diff --git a/libitm/configure b/libitm/configure
index fb742d7..35147c3 100644
--- a/libitm/configure
+++ b/libitm/configure
@@ -17059,7 +17059,7 @@
   fi
 
   ldver=`$LD --version 2>/dev/null |
-         sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+         sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
   libitm_gnu_ld_version=`echo $ldver | \
          $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
diff --git a/liboffloadmic/ChangeLog.omp b/liboffloadmic/ChangeLog.omp
new file mode 100644
index 0000000..75f8ee5
--- /dev/null
+++ b/liboffloadmic/ChangeLog.omp
@@ -0,0 +1,9 @@
+2019-01-08  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* plugin/libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_get_property):
+	New function.
+
+2019-02-26  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* plugin/libgomp-plugin-intelmic.cpp (GOMP_OFFLOAD_version):
+	Update to return GOMP_PLUGIN_IF_VERSION.
diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index d1678d0..ed78c8d 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -174,6 +174,27 @@
   return num_devices;
 }
 
+extern "C" union gomp_device_property_value
+GOMP_OFFLOAD_get_property (int n, int prop)
+{
+  union gomp_device_property_value nullval = { .val = 0 };
+
+  if (n >= num_devices)
+    {
+      GOMP_PLUGIN_error
+       ("Request for a property of a non-existing Intel MIC device %i", n);
+      return nullval;
+    }
+
+  switch (prop)
+    {
+    case GOMP_DEVICE_PROPERTY_VENDOR:
+      return (union gomp_device_property_value) { .ptr = /* TODO: "error: invalid conversion from 'const void*' to 'void*' [-fpermissive]" */ (char *) "Intel" };
+    default:
+      return nullval;
+    }
+}
+
 static bool
 offload (const char *file, uint64_t line, int device, const char *name,
 	 int num_vars, VarDesc *vars, const void **async_data)
diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
index d22e302..d335014 100644
--- a/libstdc++-v3/acinclude.m4
+++ b/libstdc++-v3/acinclude.m4
@@ -211,7 +211,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
     changequote([,])
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) [$]3=0; print ([$]1*100+[$]2)*100+[$]3 }'`
diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
index 1225edc..a21248d 100755
--- a/libstdc++-v3/configure
+++ b/libstdc++-v3/configure
@@ -22521,7 +22521,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -29688,7 +29688,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -35666,7 +35666,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -41778,7 +41778,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -53685,7 +53685,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -53950,7 +53950,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -54417,7 +54417,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -60785,7 +60785,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -66716,7 +66716,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -66917,7 +66917,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`
@@ -67137,7 +67137,7 @@
       glibcxx_ld_is_gold=yes
     fi
     ldver=`$LD --version 2>/dev/null |
-	   sed -e 's/GNU gold /GNU ld /;s/GNU ld version /GNU ld /;s/GNU ld ([^)]*) /GNU ld /;s/GNU ld \([0-9.][0-9.]*\).*/\1/; q'`
+	   sed -e 's/[. ][0-9]\{8\}$//;s/.* \([^ ]\+\)$/\1/; q'`
 
     glibcxx_gnu_ld_version=`echo $ldver | \
 	   $AWK -F. '{ if (NF<3) $3=0; print ($1*100+$2)*100+$3 }'`